# CFB Ratings Data

### Retrieves game data.

In [1]:
# Imports
import pandas as pd
import numpy as np
import random
from functools import reduce
import os
os.chdir("../")
os.getcwd()

'/Users/jakesingleton/Documents/projects/football'

### Elo Ratings

In [2]:
# Dict mapping each conference label to the list of "power" teams in it
# 66 = 64 P5 teams + BYU + ND (the two "Independents")
# The "Other" entry adds 3 more teams, for 69 teams in total
power_teams = {"PAC": ["Washington", "Oregon", "California", "Stanford", "Oregon State", "Washington State",
                 "Utah", "Arizona State", "Southern California", "Arizona", "Colorado", "UCLA"],
         "BIG10": ["Wisconsin", "Purdue", "Northwestern", "Nebraska", "Minnesota", "Iowa", "Illinois",
                   "Rutgers", "Penn State", "Ohio State", "Michigan", "Michigan State", "Maryland", "Indiana"],
         "SEC": ["Florida", "Georgia", "Kentucky", "Missouri", "South Carolina", "Tennessee", "Vanderbilt",
                 "Alabama", "Arkansas", "Auburn", "Louisiana State", "Mississippi State", "Mississippi", "Texas A&M"],
         "BIG12": ["Baylor", "Iowa State", "Kansas", "Kansas State", "Oklahoma", "Oklahoma State", "Texas Christian",
                   "Texas", "Texas Tech", "West Virginia"],
         "ACC": ["Boston College", "Clemson", "Florida State", "Louisville", "North Carolina State", "Syracuse",
                "Wake Forest", "Duke", "Georgia Tech", "Miami (FL)", "North Carolina", "Pittsburgh", "Virginia", 
                 "Virginia Tech"],
         "Independents": ["Brigham Young", "Notre Dame"],
         "Other": ["Cincinnati", "Houston", "Central Florida"]}

# Scrape game results
# year = 2019
# url = 'https://www.sports-reference.com/cfb/years/' + str(year) + '-schedule.html'
# dat = pd.read_html(url)[0]
# dat.head()

In [3]:
# DATA CLEANING 
# Put ranks of teams into separate col
# rank_w = dat["Winner"].str.extract(r'([0-9]+|\b)')
# rank_l = dat["Loser"].str.extract(r'([0-9]+|\b)')
# dat["Rk_W"] = np.where(rank_w == "", np.nan, rank_w)
# dat["Rk_L"] = np.where(rank_l == "", np.nan, rank_l)

# # Remove any ranking info from  Winner and Loser cols
# dat["Winner"] = dat["Winner"].str.replace(r'\([0-9]+\)\s', '')
# dat["Loser"] = dat["Loser"].str.replace(r'\([0-9]+\)\s', '')

# # Make column for home team / neutral site
# dat = dat.rename(columns = {"Unnamed: 7": "Where", "Rk": "Game #", "Pts": "Pts_W", "Pts.1": "Pts_L"})
# conditions = [dat["Where"] == "@", dat["Where"] == "N", dat["Where"].isna()]
# choices = [dat["Loser"], "Neutral", dat["Winner"]]
# dat["Home Team"] = np.select(conditions, choices)

# # Drop rows that separate weeks
# dat = dat.drop(dat[dat["Wk"] == "Wk"].index, axis = 0)

# # Make Game # column type int
# dat["Game #"] = dat["Game #"].astype('int')

# # Move Rk_W in front of Winner and Rk_l in front of Loser
# dat = dat.loc[:, ["Game #", "Wk", "Date", "Time", "Day", "Rk_W", "Winner", "Pts_W", "Where", "Rk_L", "Loser", 
#                   "Pts_L", "Notes", "Home Team"]]
# dat.head()

### Data Cleaning

In [4]:
def load_data(year):
    """Download the sports-reference.com schedule/results table for a season.

    Parameters
    ----------
    year : int or str
        Season to fetch; it is converted with ``str`` and placed in the URL.

    Returns
    -------
    pandas.DataFrame
        The first HTML table ``pd.read_html`` finds on the schedule page,
        returned without any cleaning.
    """
    base = 'https://www.sports-reference.com/cfb/years/'
    suffix = '-schedule.html'
    tables = pd.read_html(base + str(year) + suffix)
    return tables[0]

def clean_data(df):
    """Tidy a raw season schedule table such as the one returned by ``load_data``.

    The input is expected to have "Winner" and "Loser" columns (the 2019/2020
    page layout), plus "Rk", "Wk", "Date", "Time", "Day", "Pts", "Pts.1",
    "Notes" and the unnamed location column "Unnamed: 7".

    Parameters
    ----------
    df : pandas.DataFrame
        Raw schedule table. It is not modified; a cleaned copy is returned.

    Returns
    -------
    pandas.DataFrame
        One row per game with the columns "Game #", "Wk", "Date", "Time",
        "Day", "Rk_W", "Winner", "Pts_W", "Where", "Rk_L", "Loser", "Pts_L",
        "Notes" and "Home Team". "Home Team" is a team name, or "Neutral" for
        neutral-site games.
    """
    # Work on a copy so the caller's raw frame is left untouched
    df = df.copy()

    # Put ranks of teams into separate cols. The `|\b` alternative makes an
    # unranked team match the empty string, which is then turned into NaN.
    # expand=False returns a Series rather than a one-column DataFrame.
    rank_pattern = r'([0-9]+|\b)'
    rank_w = df["Winner"].str.extract(rank_pattern, expand=False)
    rank_l = df["Loser"].str.extract(rank_pattern, expand=False)
    df["Rk_W"] = rank_w.where(rank_w != "")
    df["Rk_L"] = rank_l.where(rank_l != "")

    # Remove any ranking info from Winner and Loser cols.
    # regex=True must be explicit: since pandas 2.0 str.replace treats the
    # pattern as a literal string by default, which would strip nothing.
    rank_prefix = r'\([0-9]+\)\s'
    df["Winner"] = df["Winner"].str.replace(rank_prefix, '', regex=True)
    df["Loser"] = df["Loser"].str.replace(rank_prefix, '', regex=True)

    # Make column for home team / neutral site:
    # "@" -> the winner was away, "N" -> neutral site, missing -> the winner was home
    df = df.rename(columns = {"Unnamed: 7": "Where", "Rk": "Game #", "Pts": "Pts_W", "Pts.1": "Pts_L"})
    conditions = [df["Where"] == "@", df["Where"] == "N", df["Where"].isna()]
    choices = [df["Loser"], "Neutral", df["Winner"]]
    df["Home Team"] = np.select(conditions, choices)

    # Drop the repeated header rows that separate weeks
    df = df[df["Wk"] != "Wk"].copy()

    # Make Game # and Wk ints, Date a datetime, and the Pts columns floats
    # (floats so that unplayed games can keep a missing score)
    df['Wk'] = df['Wk'].astype(int)
    df["Game #"] = df["Game #"].astype('int')
    df["Date"] = pd.to_datetime(df["Date"])
    df["Pts_W"] = df["Pts_W"].astype('float')
    df["Pts_L"] = df["Pts_L"].astype('float')

    # Move Rk_W in front of Winner and Rk_L in front of Loser
    ordered_cols = ["Game #", "Wk", "Date", "Time", "Day", "Rk_W", "Winner", "Pts_W", "Where", "Rk_L", "Loser",
                    "Pts_L", "Notes", "Home Team"]
    return df.loc[:, ordered_cols]

### Elo Functionality

For now, the scoring system for team i against team j is:

$S_{ij} =    \left\{
\begin{array}{ll}
      1 & i \text{ beats } j \\
      0 & j \text{ beats } i \text{,}\\
\end{array} 
\right. $

and $\mu_{ij}$ is team $i$'s expected score against team $j$: a number between 0 and 1 that can be read as the probability that $i$ beats $j$.

Mathematically, $\mu_{ij} = L(\frac{d_{ij}}{\xi}) = \frac{1}{1 + 10^{\frac{-d_{ij}}{\xi}}}$, where $d_{ij} = r_i(\text{old}) - r_j(\text{old})$ and $\xi$ is the logistic parameter that we choose and controls the spread of the ratings.

Then team i's new rating will be $r_i(\text{new}) = r_i(\text{old}) + K(S_{ij} - \mu_{ij})$, where $K$ is the K factor that controls the volatility of the ratings.

Alternatively, we define 

$S_{ij} = \frac{P_{ij} + 1}{P_{ij}+P_{ji}+2} $, which incorporates points for and against. This has the advantage of benefitting teams that win by large margins more than teams that do so by only small margins. For example, Notre Dame only beating FSU and Toledo each by 3 is not very impressive.

In [5]:
# Elo functionality helper functions

def get_expected_score(ri_old, rj_old, Xi, H):
    """Expected Elo score mu_ij of team i against team j (logistic curve).

    Team i is treated as the home side: the home-field advantage ``H`` is
    added to its rating before the rating difference is taken. Pass ``H = 0``
    for a neutral field.

    Parameters
    ----------
    ri_old, rj_old : number
        Current ratings of team i (home) and team j (away).
    Xi : number
        Logistic parameter dividing the rating difference.
    H : number
        Home-field advantage, in rating points.

    Returns
    -------
    float
        ``1 / (1 + 10 ** (-(H + ri_old - rj_old) / Xi))``; 0.5 when the
        adjusted ratings are equal.
    """
    exponent = -(H + ri_old - rj_old) / Xi
    return 1 / (1 + 10 ** exponent)

def get_score(df, method='wins'):
    """Return the actual scores (S_w, S_l) of the winner and loser of one game.

    Parameters
    ----------
    df : mapping-like
        A single game row; only ``df['Pts_W']`` and ``df['Pts_L']`` are read,
        and only when ``method == "points"``.
    method : {'wins', 'points'}
        "wins" scores the game 1 for the winner and 0 for the loser.
        "points" scores the winner ``(Pts_W + 1) / (Pts_W + Pts_L + 2)`` and
        the loser one minus that, so the margin of victory matters.

    Returns
    -------
    tuple
        ``(S_w, S_l)``, the values passed on to ``get_new_rating``.

    Raises
    ------
    ValueError
        If ``method`` is neither "wins" nor "points".
    """
    if method == "wins":
        return 1, 0
    if method == "points":
        S_w = (df['Pts_W'] + 1) / (df['Pts_W'] + df['Pts_L'] + 2)
        return S_w, 1 - S_w
    # Fail loudly on a typo instead of falling through to an unbound local
    raise ValueError("method must be 'wins' or 'points', got " + repr(method))

def get_new_rating(r_old, K_fac, S, Exp):
    """Apply one Elo update and return the team's new rating.

    Parameters
    ----------
    r_old : number
        Rating before the game.
    K_fac : number
        K factor scaling the size of the adjustment.
    S : number
        Actual score earned in the game (see ``get_score``).
    Exp : number
        Expected score for the game (see ``get_expected_score``).

    Returns
    -------
    number
        ``r_old + K_fac * (S - Exp)``.
    """
    adjustment = K_fac * (S - Exp)
    return r_old + adjustment

In [6]:
# Do Elo
def do_elo(df, K, Xi, H, method = 'wins'):
    """Run the Elo algorithm over a set of games, processed in "Game #" order.

    Parameters
    ----------
    df : pandas.DataFrame
        Game data with at least the columns "Game #", "Winner", "Loser",
        "Pts_W" and "Home Team" ("Pts_L" is also read when
        ``method == "points"``). "Home Team" holds a team name, or the string
        "Neutral" for a neutral-site game.
    K : number
        K factor passed to ``get_new_rating``.
    Xi : number
        Logistic parameter passed to ``get_expected_score``.
    H : number
        Home-field advantage in rating points, credited to the home team.
    method : {'wins', 'points'}
        Scoring method forwarded to ``get_score``.

    Returns
    -------
    merged : pandas.DataFrame
        The played games only (non-null "Pts_W"), sorted by "Game #" and
        indexed 0..n-1, with "Pre_Rating_W", "Pre_Rating_L", "Post_Rating_W",
        "Post_Rating_L", "E_wl" and "E_lw" filled in. Remaining NaNs are
        replaced by "".
    rating_d : dict
        Maps every team in ``df`` to its rating history: the initial rating
        followed by one entry per game played.
    """
    # Get all teams from df
    teams = pd.Series(pd.concat([df["Winner"], df["Loser"]]).unique())

    # Initial ratings: every team listed in the notebook-level power_teams dict
    # starts at 10, every other team starts at 0. The ratings therefore sum to
    # 10 * (number of power teams present in df) rather than to zero.
    p5_teams = {team for conf in power_teams.values() for team in conf}
    rating_d = {t: [10] if t in p5_teams else [0] for t in teams}

    # Make ratings df
    rating_df = pd.DataFrame(data = {"Team": list(rating_d.keys()),
                                     "Pre_Rating": [history[0] for history in rating_d.values()]})

    # Set up data frame with columns for ratings
    merged = (df.merge(rating_df, left_on = "Winner", right_on = "Team")
                .merge(rating_df, left_on = "Loser", right_on = "Team", suffixes = ("_W", "_L"))
                .sort_values(by = "Game #", ascending = True))

    # Only consider games that have been played. The index is reset AFTER the
    # filter so that row positions and index labels agree even when an
    # unplayed game sits in the middle of the schedule.
    merged = merged[merged["Pts_W"].notnull()].reset_index(drop = True)

    # Per-game results are collected in lists and written as whole columns
    # after the loop, instead of one cell at a time.
    pre_w, pre_l = [], []
    post_w, post_l = [], []
    exp_wl, exp_lw = [], []

    for _, game in merged.iterrows():
        # Winning and losing team names give access to the ratings dict
        team_w = game["Winner"]
        team_l = game["Loser"]
        home = game["Home Team"]

        # Ratings going into this game
        r_w = rating_d[team_w][-1]
        r_l = rating_d[team_l][-1]

        # Find mu_wl and mu_lw; H is always credited to the home side
        if team_w == home:  # True if the winning team was home
            E_wl = get_expected_score(r_w, r_l, Xi, H)
            E_lw = get_expected_score(r_l, r_w + H, Xi, 0)
        elif home == "Neutral":  # True if played at neutral field
            E_wl = get_expected_score(r_w, r_l, Xi, 0)
            E_lw = get_expected_score(r_l, r_w, Xi, 0)
        else:  # True if winning team was away team
            E_wl = get_expected_score(r_w, r_l + H, Xi, 0)
            E_lw = get_expected_score(r_l, r_w, Xi, H)

        # Compute their new ratings
        S_w, S_l = get_score(game, method = method)
        new_rating_w = get_new_rating(r_w, K, S_w, E_wl)
        new_rating_l = get_new_rating(r_l, K, S_l, E_lw)

        # Store new ratings in ratings dict
        rating_d[team_w].append(new_rating_w)
        rating_d[team_l].append(new_rating_l)

        # Remember this game's ratings and expected scores
        pre_w.append(r_w)
        pre_l.append(r_l)
        post_w.append(new_rating_w)
        post_l.append(new_rating_l)
        exp_wl.append(E_wl)
        exp_lw.append(E_lw)

    # Edit data frame of game info with old/new ratings and expected scores
    merged["Pre_Rating_W"] = pre_w
    merged["Pre_Rating_L"] = pre_l
    merged["Post_Rating_W"] = post_w
    merged["Post_Rating_L"] = post_l
    merged["E_wl"] = exp_wl
    merged["E_lw"] = exp_lw

    # Clean Elo frame and return
    merged = merged.fillna("")
    return merged, rating_d

In [7]:
def clean_elo(df):
    """Reduce the frame returned by ``do_elo`` to its reporting columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Game frame with Elo columns, as returned by ``do_elo``.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly these columns, in this order: "Game #", "Wk",
        "Date", "Winner", "Where", "Loser", "Rk_W", "Rk_L", "Pts_W", "Pts_L",
        "Pre_Rating_W", "Pre_Rating_L", "Post_Rating_W", "Post_Rating_L",
        "E_wl", "E_lw". A column missing from ``df`` is added and filled with
        NaN instead of raising ``KeyError``; simulated-game frames have no
        "Wk", "Date", "Where" or rank columns.
    """
    report_cols = ["Game #", "Wk", "Date", "Winner", "Where", "Loser", "Rk_W", "Rk_L", "Pts_W", "Pts_L",
                   "Pre_Rating_W", "Pre_Rating_L", "Post_Rating_W", "Post_Rating_L", "E_wl", "E_lw"]
    # reindex selects the columns that exist and creates NaN columns for the rest
    return df.reindex(columns = report_cols)

In [8]:
# games_n_ratings = merged.loc[:, ["Game #", "Wk", "Date", "Winner", "Loser", "Pts_W", "Pts_L", 
#                                  "Pre_Rating_W", "Pre_Rating_L", "Post_Rating_W", "Post_Rating_L"]]
# games_n_ratings.head()
#games_n_ratings.to_csv('./data/cfb_' + str(year)+ '_elo.csv', index = False)

### Model Tuning

We need to tune the K Factor K, the logistic parameter $\xi$, and the home field advantage H. We will use the last full season of data, which was 2019

In [9]:
dat19 = load_data(2019)
dat19 = clean_data(dat19)
elo19df, elo19d = do_elo(dat19, 32, 1000, 0)
elo19df = clean_elo(elo19df)
elo19df.tail()

Unnamed: 0,Game #,Wk,Date,Winner,Where,Loser,Rk_W,Rk_L,Pts_W,Pts_L,Pre_Rating_W,Pre_Rating_L,Post_Rating_W,Post_Rating_L,E_wl,E_lw
883,884,21,2020-01-02,Tennessee,N,Indiana,,,23.0,22.0,48.771081,67.863631,65.122722,51.51199,0.489011,0.510989
884,885,21,2020-01-03,Ohio,N,Nevada,,,30.0,21.0,-1.262281,28.40032,15.283912,11.854127,0.482931,0.517069
885,886,21,2020-01-04,Tulane,N,Southern Mississippi,,,30.0,13.0,-0.896108,29.854006,15.670093,13.287805,0.482306,0.517694
886,887,21,2020-01-06,Louisiana,N,Miami (OH),,,27.0,17.0,101.214834,45.06487,116.181952,30.097753,0.532278,0.467722
887,888,22,2020-01-13,Louisiana State,N,Clemson,1.0,3.0,42.0,25.0,218.197484,216.051497,234.157954,200.091027,0.501235,0.498765


In [10]:
def mse(df_elo):
    """Mean squared prediction error of an Elo game frame.

    For every game the winner's actual score is taken as 1 and the loser's as
    0, so the per-game error is ``(1 - E_wl)**2 + (0 - E_lw)**2``.

    Parameters
    ----------
    df_elo : pandas.DataFrame
        Frame with numeric "E_wl" and "E_lw" columns (see ``do_elo``). It is
        not modified.

    Returns
    -------
    float
        The mean of the per-game errors.
    """
    winner_miss = 1 - df_elo["E_wl"]
    loser_miss = 0 - df_elo["E_lw"]
    return np.mean(winner_miss**2 + loser_miss**2)

In [11]:
mse(elo19df)

0.47603361551702605

In [12]:
# from sklearn.model_selection import ParameterGrid

# # Make grid of parameters
# param_grid = {"K": np.arange(10, 75, 5), "Xi": np.arange(800, 1200, 100), "H": np.arange(0, 25, 1)}
# grid = ParameterGrid(param_grid)

# # Container for errors
# errors = []

# for params in grid:
#     res = do_elo(dat19, params["K"], params["Xi"], params["H"])[0]
#     errors.append(mse(res))
#     print(len(errors))
    
# min_idx = np.argmin(errors)

In [13]:
# Optimal parameters
# grid[min_idx]

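So the optimal $\xi = 800 \text{, K} = 70 \text{, and} \ H = 24$. Note that each of these values sits on an edge of the grid searched above ($K \in [10, 70]$, $\xi \in [800, 1100]$, $H \in [0, 24]$), so a wider grid might find a better optimum.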

### Running the Elo Algorithm with Optimal Parameters

In [14]:
# Elo parameters
Xi = 800
K = 70
H = 24

In [15]:
# 2019
dat19 = load_data(2019)
dat19 = clean_data(dat19)
elo19df, elo19d = do_elo(dat19, K, Xi, H)

In [16]:
mse(elo19df)

0.44247979935115045

In [17]:
elo19df = clean_elo(elo19df)
elo19df.tail()

Unnamed: 0,Game #,Wk,Date,Winner,Where,Loser,Rk_W,Rk_L,Pts_W,Pts_L,Pre_Rating_W,Pre_Rating_L,Post_Rating_W,Post_Rating_L,E_wl,E_lw
883,884,21,2020-01-02,Tennessee,N,Indiana,,,23.0,22.0,109.799117,119.475684,145.286485,83.988316,0.493038,0.506962
884,885,21,2020-01-03,Ohio,N,Nevada,,,30.0,21.0,-5.152322,49.586243,32.599118,11.834803,0.460694,0.539306
885,886,21,2020-01-04,Tulane,N,Southern Mississippi,,,30.0,13.0,-5.960204,57.087098,32.206743,18.920151,0.454758,0.545242
886,887,21,2020-01-06,Louisiana,N,Miami (OH),,,27.0,17.0,188.554599,89.497447,218.59871,59.453336,0.570798,0.429202
887,888,22,2020-01-13,Louisiana State,N,Clemson,1.0,3.0,42.0,25.0,415.95492,405.009322,450.403647,370.560596,0.507875,0.492125


In [18]:
def ranks_elo(elo_d, week):
    """Build a ranking table from a dictionary of rating histories.

    Parameters
    ----------
    elo_d : dict
        Maps each team to its list of ratings; only the last entry is used.
    week : int or str
        Label placed in the column names, e.g. 9 or "Final".

    Returns
    -------
    pandas.DataFrame
        Columns "Team", "Rating Week <week>" and "Rank Week <week>", sorted
        from the highest rating to the lowest, with rank 1 for the highest
        rating.
    """
    rating_col = "Rating Week " + str(week)
    rank_col = "Rank Week " + str(week)

    # Latest rating of every team
    table = pd.DataFrame(data = {"Team": list(elo_d.keys()),
                                 rating_col: [history[-1] for history in elo_d.values()]})
    table = table.sort_values(by = rating_col, ascending = False).reset_index(drop = True)

    # Rank 1 goes to the highest rating
    table[rank_col] = table[rating_col].rank(ascending = False)
    return table

In [19]:
ranks19 = ranks_elo(elo19d, "Final")
ranks19[0:25]

Unnamed: 0,Team,Rating Week Final,Rank Week Final
0,Louisiana State,450.403647,1.0
1,Clemson,370.560596,2.0
2,Ohio State,349.100423,3.0
3,Appalachian State,325.262069,4.0
4,Oklahoma,302.883279,5.0
5,Oregon,302.442859,6.0
6,Georgia,295.563694,7.0
7,Memphis,289.278478,8.0
8,Notre Dame,275.796291,9.0
9,Penn State,275.467504,10.0


In [20]:
# 2020
dat20 = load_data(2020)
dat20 = clean_data(dat20)
elo20df, elo20d = do_elo(dat20, K, Xi, H)
elo20df = clean_elo(elo20df)
elo20df.tail()

Unnamed: 0,Game #,Wk,Date,Winner,Where,Loser,Rk_W,Rk_L,Pts_W,Pts_L,Pre_Rating_W,Pre_Rating_L,Post_Rating_W,Post_Rating_L,E_wl,E_lw
565,566,18,2021-01-02,Mississippi,N,Indiana,,7.0,26.0,20.0,-22.55491,166.748064,21.751012,122.442143,0.367058,0.632942
566,567,18,2021-01-02,Texas A&M,N,North Carolina,5.0,14.0,41.0,27.0,221.439476,166.215669,253.663747,133.991398,0.539653,0.460347
567,568,19,2021-01-11,Alabama,N,Ohio State,1.0,3.0,52.0,24.0,351.964332,243.626552,381.55125,214.039633,0.57733,0.42267
568,569,20,2021-02-21,Tarleton State,@,New Mexico State,,,43.0,17.0,0.0,0.0,36.208377,-36.208377,0.482737,0.517263
569,570,21,2021-03-07,New Mexico State,N,Dixie State,,,36.0,29.0,-36.208377,0.0,0.613756,-36.822133,0.47397,0.52603


In [21]:
ranks20 = ranks_elo(elo20d, "Final")    
ranks20.iloc[0:25]

Unnamed: 0,Team,Rating Week Final,Rank Week Final
0,Alabama,381.55125,1.0
1,Brigham Young,281.501529,2.0
2,Coastal Carolina,278.263268,3.0
3,Louisiana,257.450638,4.0
4,Texas A&M,253.663747,5.0
5,Liberty,246.149825,6.0
6,Cincinnati,238.766526,7.0
7,Clemson,235.55291,8.0
8,Oklahoma,225.768255,9.0
9,Notre Dame,224.013888,10.0


In [22]:
sum(ranks20["Rating Week Final"])

689.9999999999977

# 2021

### Elo with W/L

In [23]:
# 2021
# CHANGE AS NECESSARY

dat21 = load_data(2021)
dat21 = clean_data(dat21)
elo21df_w, elo21d_w = do_elo(dat21, K, Xi, H, 'wins')
elo21df_w = clean_elo(elo21df_w)
week = int(elo21df_w.tail(1)['Wk'].values[0])  
elo21df_w.tail()

Unnamed: 0,Game #,Wk,Date,Winner,Where,Loser,Rk_W,Rk_L,Pts_W,Pts_L,Pre_Rating_W,Pre_Rating_L,Post_Rating_W,Post_Rating_L,E_wl,E_lw
524,525,9,2021-10-23,Virginia,,Georgia Tech,,,48.0,40.0,107.873972,19.117151,137.243849,-10.252725,0.58043,0.41957
525,526,9,2021-10-23,Wake Forest,@,Army,16.0,,70.0,56.0,194.097137,41.32164,222.684088,12.734689,0.591615,0.408385
526,527,9,2021-10-23,West Virginia,@,Texas Christian,,,29.0,17.0,-45.258624,19.457727,-5.814188,-19.986709,0.436508,0.563492
527,528,9,2021-10-23,Western Kentucky,@,Florida International,,,34.0,19.0,-53.297001,-129.893225,-20.941175,-162.249051,0.537774,0.462226
528,529,9,2021-10-23,Wisconsin,@,Purdue,,25.0,30.0,13.0,27.019087,75.027947,65.633177,36.413857,0.44837,0.55163


In [24]:
# Create rankings
ranks21_w = ranks_elo(elo21d_w, week)
ranks21_w.iloc[0:25]

Unnamed: 0,Team,Rating Week 9,Rank Week 9
0,Oklahoma,249.112981,1.0
1,Georgia,241.19372,2.0
2,Texas-San Antonio,230.765257,3.0
3,Wake Forest,222.684088,4.0
4,Cincinnati,222.42364,5.0
5,Michigan State,220.626303,6.0
6,Michigan,219.660085,7.0
7,San Diego State,214.767514,8.0
8,Southern Methodist,205.048749,9.0
9,Alabama,190.644686,10.0


In [25]:
# Export
ratings21_w = ranks21_w.to_csv("./data/cfb_elo/week" + str(week) + "_WinLoss" + ".csv", index=False)

### Elo with Game Scores

In [26]:
# Run elo algorithm with game scores instead of wins
elo21df_p, elo21d_p = do_elo(dat21, K, Xi, H, "points")
elo21df_p = clean_elo(elo21df_p)
elo21df_p.tail()

Unnamed: 0,Game #,Wk,Date,Winner,Where,Loser,Rk_W,Rk_L,Pts_W,Pts_L,Pre_Rating_W,Pre_Rating_L,Post_Rating_W,Post_Rating_L,E_wl,E_lw
524,525,9,2021-10-23,Virginia,,Georgia Tech,,,48.0,40.0,68.247858,15.413371,67.504584,16.156644,0.555063,0.444937
525,526,9,2021-10-23,Wake Forest,@,Army,16.0,,70.0,56.0,64.212069,24.554643,67.251678,21.515035,0.511265,0.488735
526,527,9,2021-10-23,West Virginia,@,Texas Christian,,,29.0,17.0,24.407251,31.468015,34.720711,21.154555,0.477665,0.522335
527,528,9,2021-10-23,Western Kentucky,@,Florida International,,,34.0,19.0,19.128193,-20.674715,27.877807,-29.424329,0.511369,0.488631
528,529,9,2021-10-23,Wisconsin,@,Purdue,,25.0,30.0,13.0,32.748451,50.740634,48.083209,35.405877,0.469821,0.530179


In [27]:
# Create rankings
ranks21_p = ranks_elo(elo21d_p, week)    
ranks21_p[0:25]

Unnamed: 0,Team,Rating Week 9,Rank Week 9
0,Georgia,151.84924,1.0
1,Ohio State,106.563445,2.0
2,Alabama,104.251415,3.0
3,Cincinnati,100.469084,4.0
4,Michigan,99.305739,5.0
5,Pittsburgh,96.950035,6.0
6,Texas-San Antonio,88.75498,7.0
7,Coastal Carolina,87.239452,8.0
8,Houston,79.308639,9.0
9,Texas A&M,76.242908,10.0


In [28]:
# Export
ratings21_p = ranks21_p.to_csv("./data/cfb_elo/week" + str(week) + "_point_diff" + ".csv", index=False)

### Miscellaneous

In [29]:
# Rank the power teams with the W/L rankings
p5_teams = [team for conf in power_teams.values() for team in conf]
ratings21p5_w = ranks21_w[ranks21_w['Team'].isin(p5_teams)]
ratings21p5_w

Unnamed: 0,Team,Rating Week 9,Rank Week 9
0,Oklahoma,249.112981,1.0
1,Georgia,241.193720,2.0
3,Wake Forest,222.684088,4.0
4,Cincinnati,222.423640,5.0
5,Michigan State,220.626303,6.0
...,...,...,...
205,California,-78.961995,206.0
208,Colorado,-96.876037,209.0
210,Vanderbilt,-118.895568,211.0
214,Kansas,-133.644680,215.0


In [30]:
# Export
ratings21p5_w.to_csv('./data/cfb_elo/power5_week' + str(week) + "_WinLoss" + '.csv', index=False)

In [31]:
# Rank the power teams with the point differential rankings
ratings21p5_p = ranks21_p[ranks21_p['Team'].isin(p5_teams)]
ratings21p5_p

Unnamed: 0,Team,Rating Week 9,Rank Week 9
0,Georgia,151.849240,1.0
1,Ohio State,106.563445,2.0
2,Alabama,104.251415,3.0
3,Cincinnati,100.469084,4.0
4,Michigan,99.305739,5.0
...,...,...,...
182,Colorado,-29.346242,183.0
210,Indiana,-47.955048,211.0
219,Kansas,-66.001700,220.0
220,Arizona,-67.951810,221.0


In [32]:
# Export
ratings21p5_p.to_csv('./data/cfb_elo/power5_week' + str(week) + "_point_diff" + '.csv', index=False)

In [33]:
# Compare the win rankings with the points rankings
comp = ranks21_w.merge(ranks21_p, on = 'Team', suffixes = ("_wins", "_points"))
rank_cols = comp[comp.columns[pd.Series(comp.columns).str.startswith('Rank')]]
comp['Diff'] = abs(rank_cols.iloc[:, 0] - rank_cols.iloc[:, 1])
comp = comp[comp["Team"].isin(p5_teams)].sort_values(by = "Diff", ascending = False)
comp

Unnamed: 0,Team,Rating Week 9_wins,Rank Week 9_wins,Rating Week 9_points,Rank Week 9_points,Diff
186,Nebraska,-40.439151,187.0,65.777055,22.0,165.0
205,California,-78.961995,206.0,10.637733,84.0,122.0
151,Virginia Tech,-33.506473,152.0,11.707672,79.0,73.0
123,Duke,-31.272721,124.0,-27.016578,179.0,55.0
97,West Virginia,-5.814188,98.0,34.720711,51.0,47.0
...,...,...,...,...,...,...
4,Cincinnati,222.423640,5.0,100.469084,4.0,1.0
42,UCLA,62.954634,43.0,41.253435,44.0,1.0
45,Utah,52.911548,46.0,41.134558,45.0,1.0
174,Illinois,-35.250523,175.0,-26.327535,175.0,0.0


### Aggregating Our Ratings

Now we are going to merge our Elo win/loss ratings with our Elo point differential ratings. I will use a similar system to what I use in my NFL ratings, namely using these two existing rating sets to generate new, "fake" game scores that will form the basis for the final rating set. To form these final ratings, I will run another Elo algorithm on these "fake" game scores. Since, in CFB, there are often games where good teams run up the score against bad teams, which thereby reward them for playing weak schedules, I will use the Elo win/loss algorithm. This can be thought of as the "dictator" rating algorithm. 

**Note:** these game scores are generated from each rating list. In particular for each rating list of $n$ teams, the number of *simulated games* we form is ${n \choose 2}$. Hence, since I have 2 Elo rating lists at the moment, there are $2{n \choose 2}$ total simulated games. For each game, we define the winner as the one with the lower ranking (remember a lower ranking, not rating, is better) in its respective list. Finally, we run the Elo algorithm again (the win/loss version) on these simulated games to form the final, aggregated list.

In [34]:
from itertools import combinations

def get_sim_game_dat(rating_lst, losing_score):
    """Generate simulated game data from a single rating list.

    Helper for ``merge_sim_games``. Every pair of teams in ``rating_lst``
    meets once. The team that appears earlier in ``rating_lst`` is recorded as
    the winner, so the list must already be sorted from best to worst (as
    ``ranks_elo`` returns it).

    Parameters
    ----------
    rating_lst : pandas.DataFrame
        Rating list with a "Team" column and exactly one column whose name
        starts with "Rank".
    losing_score : number
        Points given to the loser of every simulated game.

    Returns
    -------
    pandas.DataFrame
        One row per pairing with "Winner", "Loser", the rating-list columns of
        both teams (suffixed "_W" and "_L"), "Pts_W" (``losing_score`` plus the
        absolute rank difference), "Pts_L" (``losing_score``) and "Home Team".
    """
    # Every unordered pair of teams, earlier-listed team first
    pairings = list(combinations(rating_lst["Team"], 2))
    matchups = pd.DataFrame(data = {"Winner": [first for first, _ in pairings],
                                    "Loser": [second for _, second in pairings]})

    # Attach the rating-list columns of the winner, then of the loser
    with_winner = matchups.merge(rating_lst, left_on = "Winner", right_on = "Team")
    with_both = with_winner.merge(rating_lst, left_on = "Loser", right_on = "Team", suffixes = ("_W", "_L"))
    sim = with_both.drop(columns = ["Team_W", "Team_L"])

    # The winner scores one extra point per rank position separating the teams
    rank_names = [col for col in sim.columns if col.startswith('Rank')]
    rank_gap = (sim[rank_names[0]] - sim[rank_names[1]]).abs()
    sim["Pts_W"] = losing_score + rank_gap
    sim["Pts_L"] = losing_score

    # Arbitrary: the simulated games carry no real venue
    sim["Home Team"] = sim["Winner"]
    return sim

def merge_sim_games(df, lsts):
    """Build the simulated game data that a combiner method (Elo here) runs on.

    Parameters
    ----------
    df : pandas.DataFrame
        Real game data; only its "Pts_L" column is used, to pick the losing
        score given to every simulated game.
    lsts : sequence of pandas.DataFrame
        Rating lists to aggregate, each sorted best to worst (see
        ``get_sim_game_dat``). Any number of lists is accepted; the notebook
        passes two (win/loss and point-based).

    Returns
    -------
    pandas.DataFrame
        The simulated games of every list stacked on top of each other, with
        a "Game #" column added.
    """
    # Average losing score for simulated game data
    avg_losing_score = np.floor(np.mean(df['Pts_L']))
    # Simulated game data, one frame per rating list
    per_list_games = [get_sim_game_dat(rating_lst, avg_losing_score) for rating_lst in lsts]
    # Concat simulated game data into one data frame of simulated game data
    sim_game_data = pd.concat(per_list_games, axis = 0, sort = False)
    # Make look like standard data frame of just game information.
    # The stacked frames keep their own row labels, so "Game #" is the row's
    # position within its own list rather than within the combined frame.
    sim_game_data['Game #'] = sim_game_data.index + 1
    return sim_game_data
    
    
def aggregate_lists(sim_games, elo_method):
    """Aggregate rating lists into one combined list by running Elo again.

    Parameters
    ----------
    sim_games : pandas.DataFrame
        Simulated game data, as built by ``merge_sim_games``.
    elo_method : {'wins', 'points'}
        Scoring method handed to ``do_elo``.

    Returns
    -------
    tuple
        ``(games, ratings)``: the game frame from ``do_elo`` after
        ``clean_elo``, and the dictionary of rating histories. The notebook
        globals ``K`` and ``Xi`` are used as the Elo parameters.
    """
    # H = 0 since there is no Home Field Advantage for our simulated data
    agg_games, agg_ratings = do_elo(sim_games, K, Xi, 0, elo_method)
    return clean_elo(agg_games), agg_ratings

In [None]:
# Aggregate with wins!
sim_games_w = merge_sim_games(dat21, [ranks21_w, ranks21_p])
elo21df_agg_w, elo21d_agg_w = aggregate_lists(sim_games_w, 'wins')

In [None]:
elo21df_agg_w.head()

In [None]:
# Create rankings
ranks21_agg_w = ranks_elo(elo21d_agg_w, week)    
ranks21_agg_w[0:25]

In [None]:
# Export
ratings21_agg_w = ranks21_agg_w.to_csv("./data/cfb_elo/week" + str(week) + "_Agg_WinLoss" + ".csv", index=False)

In [None]:
# Aggregate with points!
#sim_games_p = merge_sim_games(dat21, [ranks21_w, ranks21_p])
#elo21df_agg_p, elo21d_agg_p = aggregate_lists(sim_games_p, 'points')

In [None]:
# Create rankings
# ranks21_agg_p = ranks_elo(elo21d_agg_p)    
# ranks21_agg_p[0:25]

In [None]:
# Compare top 25s
#ranks21_agg_w[0:25].merge(ranks21_agg_p[0:25], on="Rank", suffixes = ("_W", "_P"))

## TO-DO:

In [None]:
# We are going to iteratively append each team's aggregated rating for each week in this dict
# This code chunk only needs to be uncommented and run if I miss a week
# rating_lst = []

# for i in range(1, week + 1):
#     print(i)
#     # Filter for specific week
#     #dat_of_interest = test_dat[test_dat['Wk'] <= i]
#     dat_of_interest = dat21.copy()
#     dat_of_interest['Pts_W'] = np.where(dat_of_interest["Wk"] > i, np.nan, dat_of_interest["Pts_W"])
#     dat_of_interest['Pts_L'] = np.where(dat_of_interest["Wk"] > i, np.nan, dat_of_interest["Pts_L"])
    
#     # Run win/loss Elo
#     elo21df_w_oi, elo21d_w_oi = do_elo(dat_of_interest, K, Xi, H, 'wins')
#     elo21df_w_oi = clean_elo(elo21df_w_oi)
    
#     # Run point differential Elo
#     elo21df_p_oi, elo21d_p_oi = do_elo(dat_of_interest, K, Xi, H, "points")
#     elo21df_p_oi = clean_elo(elo21df_p_oi)
    
#     # Rank
#     ranks21_w_oi = ranks_elo(elo21d_w_oi, i)
#     ranks21_p_oi = ranks_elo(elo21d_p_oi, i)
    
#     # Aggregate
#     sim_games_w_oi = merge_sim_games(dat_of_interest, [ranks21_w_oi, ranks21_p_oi])
#     elo21df_agg_w_oi, elo21d_agg_w_oi = aggregate_lists(sim_games_w_oi, 'wins')
#     rank_df = ranks_elo(elo21d_agg_w_oi, i)
#     rating_lst.append(rank_df)

# # wow_df is the result of joining these frames for each week together
# wow_df = reduce(lambda x, y: pd.merge(x, y, on = 'Team', how = "outer"), rating_lst)

In [None]:
# wow_df is DONE for weeks 1-8, so we can store it and just add to it.
# We will save it to a data frame wow_dat and just outer join the new week to it

# Add in Week 0 rating 
# teams = list(wow_df["Team"].unique())
# pwr_teams = [team for conf in power_teams.values() for team in conf]
# #n_nonp5 = len(teams) - len(pwr_teams)
# wow_df["Rating Week 0"] = np.where(wow_df['Team'].isin(pwr_teams), 10, 0)
# first_column = wow_df.pop('Rating Week 0')
# wow_df.insert(1, 'Rating Week 0', first_column)

# Read in the data
wow_df = pd.read_csv('./data/wow_df.csv').iloc[:, 1:]

# Do the outer join
#wow_df = wow_df.merge(ranks21_agg_w, on = "Team", how = "outer").drop(columns = list(wow_df.filter(regex = '_')))

# Write to csv for storage
#wow_df.to_csv("./data/wow_df.csv")

In [None]:
sum(wow_df["Rating Week 9"])

In [None]:
wow_df.head()

This is correct... we should have 69 teams starting with 10 so the sum should be 690!

### Visualization

In [None]:
from datetime import datetime, timedelta

In [None]:
# Build one frame of week-over-week ratings per conference

# Fill in NaN's with 0 (the notebook treats NaN as "didn't play this week")
wow_df = wow_df.fillna(0)

# Min date of the season - 1 (so we can start the Elo graphs at zero)
min_date = min(elo21df_w["Date"]) - timedelta(days=1)

# Rank columns are not plotted, so they are dropped from every conference frame
rank_columns = wow_df.columns[wow_df.columns.str.startswith("Rank")]

# Container for the per-conference frames
df_lst = []

for conf, conf_teams in power_teams.items():
    # Rows of the teams in this conference, without the Rank columns
    in_conf = wow_df["Team"].isin(conf_teams)
    conf_df = wow_df[in_conf].drop(columns = rank_columns)
    # Add conference tag
    conf_df["Conference"] = conf
    df_lst.append(conf_df)

In [None]:
power_team_df = pd.concat(df_lst, axis = 0)
power_team_df.head()

In [None]:
# Reshape to long form: one row per (team, week column) holding that week's Elo
long_form = pd.melt(power_team_df, id_vars=["Team", "Conference"], var_name = "Week", value_name = "Elo")
# Keep the whole week number: a single-digit pattern would turn "Rating Week 10" into "1".
# expand=False returns a Series, which is what a single column assignment expects.
long_form['Week'] = long_form['Week'].str.extract(r'([0-9]+)', expand=False)
long_form.head()

In [None]:
long_form.to_csv("./data/wow_ratings.csv")

In [None]:
import plotly.express as px

def plot_conf(df, year, title):
    """Plot, display and save the week-over-week Elo lines of a group of teams.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-form ratings with "Week", "Elo" and "Team" columns.
    year : int
        Not used by the function body.
    title : str
        Plot title; also the start of the saved image's file name.

    The image is written to "images/CFB/<title>week<week>.png", where ``week``
    is read from the notebook-level variable of that name.
    """
    line_chart = px.line(data_frame = df, x = "Week", y = "Elo", color = 'Team', title = title)
    line_chart.show()
    image_path = "images/CFB/" + title + "week" + str(week) + ".png"
    line_chart.write_image(image_path)

In [None]:
# SEC plot
sec = long_form[long_form['Conference'] == "SEC"]
plot_conf(sec, 2021, "2021 SEC Elo")

In [None]:
# pac plot
pac = long_form[long_form['Conference'] == "PAC"]
plot_conf(pac, 2021, "2021 PAC-12 Elo")

In [None]:
# ACC plot
acc = long_form[long_form['Conference'] == "ACC"]
plot_conf(acc, 2021, "2021 ACC Elo")

In [None]:
# BIG12 plot
big12 = long_form[long_form['Conference'] == "BIG12"]
plot_conf(big12, 2021, "2021 BIG 12 Elo")

In [None]:
# BIG10 plot
big10 = long_form[long_form['Conference'] == "BIG10"]
plot_conf(big10, 2021, "2021 BIG 10 Elo")

In [None]:
# Independents plot
ind = long_form[long_form['Conference'] == "Independents"]
plot_conf(ind, 2021, "2021 Independents Elo")

In [None]:
# Other plot
oth = long_form[long_form['Conference'] == "Other"]
plot_conf(oth, 2021, "2021 Other Elo")

In [None]:
# Top 10 plot
top10teams = ranks21_agg_w["Team"][0:10]
top10 = long_form[long_form['Team'].isin(top10teams)]
plot_conf(top10, 2021, "2021 Top 10 Elo")

One issue I'm seeing is that teams like Appalachian State, Liberty, Louisiana, etc. (non-P5 teams) are rated too high. This needs to be remedied. My first idea is to make it so that P5 teams (plus BYU and ND) start off with a rating > 0, while non-P5 teams will start with a rating < 0. I will still control the ratings such that $\sum_{i=1}^{m}r_i(0) = 0$ so that the average rating remains 0 for all points in time.

In [None]:
np.mean(elo21df_agg_w['Post_Rating_W'] - elo21df_agg_w['Pre_Rating_W'])

In [None]:
sum(wow_df["Rating Week 6"])

In [None]:
sum(wow_df["Rating Week 2"])

Great! This makes sense since we are assigning the 69 "power teams" (the 64 power 5 + ND, BYU, Cincinnati, Houston, and UCF) an initial rating of 10 and every other team an initial rating of 0. Thus the constant-sum property is satisfied: the ratings always add up to $69 \times 10 = 690$.

In [None]:
ranks21_agg_w[ranks21_agg_w["Team"] == "California"]

### Rating Evaluation

Use WoW ratings to determine accuracy. If Team A's aggregated rating was bigger than Team B's, predict Team A to beat Team B.