# 2021 NFL Team Ratings

In [1]:
import pandas as pd
import numpy as np
import random
from functools import reduce
import os
os.getcwd()

'/Users/jakesingleton/Documents/projects/football/code'

We will implement the Massey and Colley methods.

### 0. Data

In [2]:
# Loads data from nfl_game_data.ipynb to a data frame
def load_data(year):
    return pd.read_csv("../data/" + str(year) + "_nfl_game_data.csv")

### 1. Massey

In [3]:
# Implement Massey method

# get_massey() takes a data frame df, runs the Massey algorithm, and returns the Massey Matrix M and the results
# REQUIRED: columns called "Home Team", "Away Team", "Home Score", "Away Score", and "Margin"
# (Margin is defined by Home Score - Away Score)
def get_massey(df):
    # Build M
    # Get games played for each team
    diag = df['Home Team'].value_counts().append(df["Away Team"].value_counts())
    diag = diag.groupby(diag.index).sum()
    M = pd.DataFrame(np.diag(diag), index = diag.index, columns = diag.index)
    for team in M.index:
        sub_df = df[(df["Home Team"] == team) | (df["Away Team"] == team)].reset_index()
        for i in range(sub_df.shape[0]):
            if sub_df.at[i, "Home Team"] == team:
                opp = sub_df.at[i, "Away Team"]
            else:
                opp = sub_df.at[i, "Home Team"]
            M.at[team, opp] = M.at[team, opp] - 1
    #print(M)
    # Build point differential vector p
    df["Away Diff"] = df["Away Score"] - df["Home Score"]
    diff_home = df.groupby("Home Team").agg({"Margin": "sum"})
    diff_away = df.groupby("Away Team").agg({"Away Diff": "sum"})
    merged = diff_home.merge(diff_away, left_index = True, right_index = True, how = "outer").fillna(0)
    merged["diff"] = merged["Margin"] + merged["Away Diff"]
    #print(merged)
    p = np.array(merged["diff"])
    #print(p)
    # Get M-bar and p-bar 
    Mbar = M.copy()
    pbar = p.copy()
    Mbar.iloc[Mbar.shape[0] - 1] = 1
    pbar[-1] = 0
    #print(Mbar, pbar)
    # Ratings vector r
    r = np.linalg.solve(Mbar, pbar)
    # Results data frame
    ratings = pd.DataFrame({"Team": Mbar.index, "Rating": r})
    # Now obtain offensive and defensive rating vectors through clever algebra
    # T is square diagonal matrix holding number of games played
    T = np.identity(Mbar.shape[0])
    np.fill_diagonal(T, np.diag(M))
    #print(T)
    # P is off-diagonal matrix holding number of games played between two teams
    P = np.array(M.copy())
    np.fill_diagonal(P, 0)
    P = P * -1
    #print(P)
    # Points For vector f
    f = (df.groupby("Home Team").agg({"Home Score": "sum"})["Home Score"]
         .append(df.groupby("Away Team").agg({"Away Score": "sum"})["Away Score"]))
    f = f.groupby(f.index).sum()
    #print(f)
    # Get defensive rating vector d
    d = np.linalg.solve(T + P, T@r - f)
    ratings["D Rating"] = d
    # Get offensive rating vector o
    o = r - d
    ratings["O Rating"] = o
    ratings = ratings[["Team", "Rating", "O Rating", "D Rating"]]
    ratings = ratings.sort_values(by = "Rating", ascending = False)
    ratings["Rank"] = ratings["Rating"].rank(ascending = False)
    # Return Massey Matrix M and ratings data frame 
    return M, ratings.reset_index(drop = True)

In [4]:
teams = ["Duke", "Miami", "UNC", "UVA", "VT"]
examp = pd.DataFrame(data = {"Home Team": ["Duke", "Duke", "Duke", "Duke", "Miami", "Miami", "Miami", "UNC", "UNC", "UVA"],
                             "Away Team": ["Miami", "UNC", "UVA", "VT", "UNC", "UVA", "VT", "UVA", "VT", "VT"],
                             "Home Score": [7, 21, 7, 0, 34, 25, 27, 7, 3, 14],
                             "Away Score": [52, 24, 38, 45, 16, 17, 7, 5, 30, 52]})
examp["Margin"] = examp["Home Score"] - examp["Away Score"]
examp

Unnamed: 0,Home Team,Away Team,Home Score,Away Score,Margin
0,Duke,Miami,7,52,-45
1,Duke,UNC,21,24,-3
2,Duke,UVA,7,38,-31
3,Duke,VT,0,45,-45
4,Miami,UNC,34,16,18
5,Miami,UVA,25,17,8
6,Miami,VT,27,7,20
7,UNC,UVA,7,5,2
8,UNC,VT,3,30,-27
9,UVA,VT,14,52,-38


In [5]:
# Export examp 
examp.to_csv("../data/examp_data.csv", index = False)

In [6]:
M_examp, ratings_examp = get_massey(examp)
ratings_examp

Unnamed: 0,Team,Rating,O Rating,D Rating,Rank
0,Miami,18.2,21.975,-3.775,1.0
1,VT,18.0,20.708333,-2.708333,2.0
2,UVA,-3.4,7.841667,-11.241667,3.0
3,UNC,-8.0,1.375,-9.375,4.0
4,Duke,-24.8,1.975,-26.775,5.0


This is a nice example to verify with the book! Exactly right.

### 2. Colley 

In [7]:
# Implement Colley method

# get_colley() takes a data frame of df and the Massey Matrix M, runs the Colley algorithm, and returns the Colley Matrix C and the results
# REQUIRED: data frame like one for get_massey() and the outputs from get_massey()
def get_colley(df, M):
    # We can use the identity on Who's #1? page 24 to easily get the Colley Matrix C
    C = 2 * np.identity(M.shape[0]) + M
    # Get Win/Loss data
    df["Winner"] = np.where(df["Margin"] > 0, df["Home Team"], df["Away Team"])
    df["Loser"] = np.where(df["Margin"] > 0, df["Away Team"], df["Home Team"])
    wins = df.groupby("Winner").count()["Loser"]
    losses = df.groupby("Loser").count()["Winner"]
    teams = sorted(set(df[["Winner", "Loser"]].values.flatten()))
    w_l_dict = {}
    for team in teams:
        if team not in losses.index:
            w_l_dict[team] = wins[team] - 0
        elif team not in wins.index:
            w_l_dict[team] = 0 - losses[team]
        else:
            w_l_dict[team] = wins[team] - losses[team]
    # Get vector b for Colley algorithm
    w_l_diff = np.fromiter(w_l_dict.values(), dtype = float)
    b = 1 + 0.5 * w_l_diff
    # Results data frame
    r = np.linalg.solve(C, b)
    ratings = pd.DataFrame({"Team": teams, "Rating": r}).sort_values(by = "Rating", ascending = False)
    ratings["Rank"] = ratings["Rating"].rank(ascending = False)
    # Return Colley Matrix C and ratings data frame 
    return C, ratings.reset_index(drop = True)

In [8]:
C_examp, examp_colley_ratings = get_colley(examp, M_examp)
examp_colley_ratings

Unnamed: 0,Team,Rating,Rank
0,Miami,0.785714,1.0
1,VT,0.642857,2.0
2,UNC,0.5,3.0
3,UVA,0.357143,4.0
4,Duke,0.214286,5.0


Perfect! Matches the book

### Rank Aggregation: Simulated Game Data

For each set of rankings, we can generate ${32 \choose 2} = 496$ pairwise matchups. For our Massey rankings, we will compute point differentials for these hypothetical matchups using the Massey ratings themselves. This makes sense since this is the underlying assumption of the Massey algorithm. For our Colley rankings, we will compute point differentials by the difference in `Rank` position. Then we use a combiner method on this simulated game data to create one set of ratings.

**Note that the `Home Team`, `Away Team`, etc. columns in the simulated game data are meaningless. They are there so that our algorithm will run and will not affect results.**

Once we have our simulated game data we make a subjective choice. In particular, we must apply a combiner method, which is just the name for a rating system. For now, I will re-apply Massey. Why? Because in the NFL, due to a relatively level playing field, point differential should better reflect team strength than wins and losses themselves. For CFB, I will probably employ Colley at this step, since you often see teams like Alabama brutally blow out the poor Citadel.

In [9]:
from itertools import combinations

# get_sim_game_dat() is a helper function
# Inputs: a rating list, rating method of choice, and losing score and generates simulated game data
# Output: appropriate simulated game data to be used by a combiner method
def get_sim_game_dat(rating_lst, method, losing_score):
    # Get team combinations for simulated game data matchups
    team_combos = list(combinations(rating_lst["Team"], 2))
    team1 = [matchup[0] for matchup in team_combos]
    team2 = [matchup[1] for matchup in team_combos]
    lst_sim_game_dat = (pd.DataFrame(data = {"Home Team": team1, "Away Team": team2})
                        .merge(rating_lst, left_on = "Home Team", right_on = "Team")
                        .merge(rating_lst, left_on = "Away Team", right_on = "Team", suffixes = ("_Home", "_Away"))
                        .drop(columns = ["Team_Home", "Team_Away"]))
    if method == "Massey":
        lst_sim_game_dat["Home Score"] = (np.where(lst_sim_game_dat["Rating_Home"] > lst_sim_game_dat["Rating_Away"],
                                                     losing_score + 
                                                     np.floor(lst_sim_game_dat["Rating_Home"] - 
                                                              lst_sim_game_dat["Rating_Away"]),
                                                     losing_score))
        lst_sim_game_dat["Away Score"] = (np.where(lst_sim_game_dat["Rating_Away"] > lst_sim_game_dat["Rating_Home"],
                                                     losing_score + 
                                                     np.floor(lst_sim_game_dat["Rating_Away"] - 
                                                              lst_sim_game_dat["Rating_Home"]),
                                                     losing_score))
    elif method == "Colley":
        lst_sim_game_dat["Home Score"] = (np.where(lst_sim_game_dat["Rank_Home"] < lst_sim_game_dat["Rank_Away"],
                                                     losing_score + 
                                                     np.floor(lst_sim_game_dat["Rank_Away"] - 
                                                              lst_sim_game_dat["Rank_Home"]),
                                                     losing_score))
        lst_sim_game_dat["Away Score"] = (np.where(lst_sim_game_dat["Rank_Away"] < lst_sim_game_dat["Rank_Home"],
                                                     losing_score + 
                                                     np.floor(lst_sim_game_dat["Rank_Home"] - 
                                                              lst_sim_game_dat["Rank_Away"]),
                                                     losing_score))
    else:
        pass
    lst_sim_game_dat["Margin"] = lst_sim_game_dat["Home Score"] - lst_sim_game_dat["Away Score"]
    return lst_sim_game_dat

In [10]:
# aggregate_lists() aggregates rating/ranking lists into one combind superior list
# Inputs: df of standard game data, lsts of rating/ranking lists, combiner_method (rating algorithm of our choice)
# Output: one superior rating/ranking list
def aggregate_lists(df, lsts, combiner_method):
    # Average losing score for simulated game data
    losing_scores = np.where(df["Margin"] > 0, df["Away Score"], df["Home Score"])
    avg_losing_score = np.floor(np.mean(losing_scores))
    # Get simulated game data
    massey_sim_dat = get_sim_game_dat(lsts[0], "Massey", avg_losing_score)
    colley_sim_dat = get_sim_game_dat(lsts[1], "Colley", avg_losing_score)
    # Store each data frame of simulated game data
    sim_game_data = [massey_sim_dat, colley_sim_dat]
    # Concat simulated game data into one data frame of simulated game data
    sim_game_data = pd.concat(sim_game_data, axis = 0, sort = False)
    # Make look like standard data frame of just game information
    sim_game_data = sim_game_data.loc[:, ["Home Team", "Away Team", "Home Score", "Away Score", "Margin"]]
    # Run the given algorithm on the simulated game data. This is known as the "combiner method"
    if combiner_method == get_colley:  # If the combiner method is Colley we need Massey Matrix M
        Massey_matrix = get_massey(sim_game_data)[0]
        res = combiner_method(sim_game_data, Massey_matrix)
        mat = res[0]
        ratings = res[1]
        # Arrange ratings properly and add Rank column
        ratings = ratings.sort_values(by = "Rating", ascending = False)
        ratings["Rank"] = ratings["Rating"].rank(ascending = False)
        return mat, ratings.reset_index(drop = True)
    else:
        res = combiner_method(sim_game_data)
        if len(res) > 1:  # If true then this is the Massey Method and so we need the matrix
            mat = res[0]
            ratings = res[1]
            # Arrange ratings properly and add Rank column
            ratings = ratings.sort_values(by = "Rating", ascending = False)
            ratings["Rank"] = ratings["Rating"].rank(ascending = False)
            return mat, ratings.reset_index(drop = True)
        else:
            # Arrange ratings properly and add Rank column
            ratings = ratings.sort_values(by = "Rating", ascending = False)
            ratings["Rank"] = ratings["Rating"].rank(ascending = False)
            return ratings.reset_index(drop = True)

In our 2020 tests, we had a tie in the Colley ratings between Atlanta and Carolina! Oh noooooooooo! Let's try breaking it by doing the following:
1. Check for H2H matchup winner (if there's a winner, we break the tie and stop)
2. Use the original combiner method ratings to break the tie

In this case, if there is no H2H winner, then since the tie has arisen with Colley used as the combiner method, we will look at the original Colley ratings to break the tie.

In [11]:
# Breaks a tie by checking for H2H matchup results, and then, if there are none, selecting the top team of an original
# rating list, i.e. a "partial dictator" list
# Input: df with game info, rating list with a tie, and "dictator" rating list that will break the tie
# Output: rating list with no ties
def break_tie(df, rating_lst_tie, rating_lst_dictator):
    # Get tied teams
    tied_teams = rating_lst_tie.groupby("Rank").filter(lambda sf: sf["Team"].count() > 1)["Team"].tolist()
    team1 = tied_teams[0]
    team2 = tied_teams[1]
    # Get data frame of game info for tie teams
    h2h = df[((df["Home Team"] == team1) & (df["Away Team"] == team2)) 
              | ((df["Home Team"] == team2) & (df["Away Team"] == team1))]
    # Put # wins for each team into dict
    num_wins = dict(h2h.groupby("Winner").count()["Loser"])
    # Check for H2H winner
    if max(num_wins.values()) - min(num_wins.values()) > 0:  # True if there is a H2H winner
        h2h_winner = max(num_wins, key = num_wins.get)
        h2h_loser = min(num_wins, key = num_wins.get)
    else:  # Find H2H winner in the dictator rating list
        tied_frame = rating_lst_dictator[rating_lst_dictator["Team"].isin(tied_teams)]
        h2h_winner = (tied_frame.loc[tied_frame["Rank"] == np.min(tied_frame["Rank"]), "Team"]).values[0]
        h2h_loser = (tied_frame.loc[tied_frame["Rank"] == np.max(tied_frame["Rank"]), "Team"]).values[0]
    # Change ranks in the list with the tie
    rating_lst_tie["Rank"] = np.where(rating_lst_tie["Team"] == h2h_winner,
                                      np.floor(rating_lst_tie["Rank"]), 
                                      rating_lst_tie["Rank"])
    rating_lst_tie["Rank"] = np.where(rating_lst_tie["Team"] == h2h_loser,
                                      np.ceil(rating_lst_tie["Rank"]),
                                      rating_lst_tie["Rank"])
    # Re-rank list
    rating_lst_tie["Rank"] = rating_lst_tie["Rank"].rank(ascending = True)
    # Return final rating list
    return rating_lst_tie.sort_values(by = "Rank", ascending = True)

Great! The Panthers are ahead of the Falcons because they were ahead in `full_2020_colley_ratings`, the "dictator" rating list.

### 2021 Ratings

We need 3 weeks of data to find a unique solution to the Masey and Colley algorithms. Hence, we need some of 2020's data to help us at the beginning of the 2021 season.

**Preseason**: We will regress 2020's final ratings by one-third, inspired by the function $y = \frac{1}{1.5^{w + 1}}$, which equals two-thirds when the input is zero. Hence this gives the desired regression percentage.

**In Season Idea 1**: $\text{Week} \ w_i \ \text{2021 Rating} = \frac{1}{1.5^{w_i + 1}}*\text{2020 Rating} + (1 - (\frac{1}{1.5^{w_i + 1}}))*\text{2021 rating with} \ w_i \ \text{2020 games replaced}$, for $w_i \in [1, 8]$.

This is a weighted average where the weight on 2020 ratings decreases exponentially. Now, to be able to still find a unique solution for our ratings, for ever week played I will replace that week's 2020 games with the corresponding week's 2021 games. For instance, after Week 1, I will remove all 2020 Week 1 games and replace them with all 2021 Week 1 games. This ensures we still have enough data to find unique solutions while also serving as its own weighting system that gives weight 0 to old 2020 games and weight 1 to new 2021 games.

The post-Week 8 ratings will be the last that consider 2020 data, and so post-Week 9 we will rely on 2021 data only. Thus for half the season we get (quickly diminishing) help from 2020 data, and after Week 9, when we should have a good idea who is good and who is bad, we use 2021 data only.

Note the function $y_i = \frac{1}{1.5^{w_i + 1}}$ where $y_i$ is the weight for week i $w_i$ is nice because, for the preseason $w_0$, we effectively regress the 2020 ratings the mean by one-third, a convention used by other popular forecasters like [538](https://fivethirtyeight.com/methodology/how-our-nfl-predictions-work/).

**Idea 2**: Just use $\text{Week} \ w_i \ \text{2021 Rating} = \frac{1}{1.5^{w_i + 1}}*\text{2020 Rating} + (1 - (\frac{1}{1.5^{w_i + 1}}))*\text{2021 rating with} \ {w_i} \ \text{early games replaced}$, for $w_i \in [1, 8]$, ignoring the replacement idea.

These ideas are fully implemented below in `do_ratings()`.

In [12]:
# replace_weeks() takes past game data and replaces it with corresponding new game data
# Will be used post-Weeks 1-8
def replace_weeks(datold, datnew):
    to_replace_df = datold.copy()  # Don't want to modify old game data
    #print(datold[0:33])
    to_replace_df.loc[to_replace_df["Week"].isin(datnew["Week"])] = np.nan  # Sets games to replace with NaN
    #print(to_replace_df[0:33])
    # Check if we have more games to replace (since especially for 2020, some games were canceled by COVID)
    num_to_concat = datnew.shape[0] - (to_replace_df.shape[0] - to_replace_df.dropna().shape[0])
    while num_to_concat > 0: 
        print(num_to_concat)
        to_replace_df = to_replace_df.append(pd.Series(), ignore_index = True)
        num_to_concat -= 1
    # Drop NaN rows and concat new 2021 data
    replaced = pd.concat([to_replace_df.dropna(axis = 0), datnew])
    # Clean replaced and return it
    replaced["Week"] = replaced["Week"].astype('int')
    return replaced.sort_values(by = "Game #", ascending = True).reset_index(drop = True)

In [13]:
# Given a week w, an old rating and new rating, returns weighted rating according to theory above
def get_weighted_rating(week, oldrating, newrating):
    y = 1 / (1.5 ** (week + 1))  # The weight
    rating = y * oldrating + (1 - y) * newrating
    return rating

### Rating Flow

In [14]:
# Takes two data frames: one of old data and one of new data, a week, a boolean replace for
# if we want to replace old data or not, and runs the Massey/Colling/Aggregating process
# Returns Massey-aggregated ratings
def do_ratings(datold, datnew, week, replace = True):
    # Check if preseason or in season
    if week == 0:  # True if in preseason
        # Get standard Massey and Colley ratings
        Mold, masseyold = get_massey(datold)
        Cold, colleyold = get_colley(datold, Mold)
        masseyold.to_csv("../data/nfl_2021/week" + str(week) + "/2020standard_massey_ratings.csv", index = False)
        colleyold.to_csv("../data/nfl_2021/week" + str(week) + "/2020standard_colley_ratings.csv", index = False)
    
        # Aggregate the above ratings with Massey as dictator
        massey_agg = aggregate_lists(datold, [masseyold, colleyold], get_massey)
        massey_agg_ratings = massey_agg[1]
    
        # Check for ties
        massey_tie = not len(massey_agg_ratings["Rank"].unique()) == 32
        print("Is there a tie in the Massey-combined list?", massey_tie)
        while massey_tie:
            # Call break_tie()
            massey_agg_ratings = break_tie(datold, massey_agg_ratings, masseyold)
            # Check again for a tie
            massey_tie = not len(massey_agg_ratings["Rank"].unique()) == 32
        massey_agg_ratings.to_csv("../data/nfl_2021/week" + str(week) + "/2020aggregated_massey_ratings.csv", index = False)
    
        # Weight the resulting list
        massey_agg_ratings["Weighted Rating"] = get_weighted_rating(week, massey_agg_ratings["Rating"], 0)
        massey_agg_ratings["Weighted O Rating"] = get_weighted_rating(week, massey_agg_ratings["O Rating"], 0)
        massey_agg_ratings["Weighted D Rating"] = get_weighted_rating(week, massey_agg_ratings["D Rating"], 0)
    
        # Rank ratings
        massey_agg_ratings["Rank"] = massey_agg_ratings["Weighted Rating"].rank(ascending = False)
        massey_agg_ratings = massey_agg_ratings.sort_values(by = "Rank", ascending = True).reset_index(drop = True)
    
    elif (week >= 1) and (week <= 8):  # True if in season and still need help from old data
        # Get standard Massey and Colley ratings for old data
        Mold, masseyold = get_massey(datold)
        Cold, colleyold = get_colley(datold, Mold)
        masseyold.to_csv("../data/nfl_2021/week" + str(week) + "/2020standard_massey_ratings.csv", index = False)
        colleyold.to_csv("../data/nfl_2021/week" + str(week) + "/2020standard_colley_ratings.csv", index = False)
    
        # Aggregate the above ratings with Massey as dictator
        massey_agg = aggregate_lists(datold, [masseyold, colleyold], get_massey)
        massey_agg_ratings = massey_agg[1]
    
        # Check for ties
        massey_tie = not len(massey_agg_ratings["Rank"].unique()) == 32
        print("Is there a tie in the Massey-combined list?", massey_tie)
        while massey_tie:
            # Call break_tie()
            massey_agg_ratings = break_tie(datold, massey_agg_ratings, masseyold)
            # Check again for a tie
            massey_tie = not len(massey_agg_ratings["Rank"].unique()) == 32
        massey_agg_ratings.to_csv("../data/nfl_2021/week" + str(week) + "/2020aggregated_massey_ratings.csv", index = False)
        
        # Combine old data with new data
        if replace:  # TRUE if we want to replace when we combine
            datold_combined = replace_weeks(datold, datnew)
        else:  # If we just want to combine
            datold_combined = pd.concat([datold, datnew], axis = 0)
    
        # Get Massey and Colley with combined data
        Mold_combined, masseyold_combined = get_massey(datold_combined)
        Cold_combined, colleyold_combined = get_colley(datold_combined, Mold_combined)
        masseyold_combined.to_csv("../data/nfl_2021/week" + str(week) + "/20and21combined_massey_ratings.csv", index = False)
        colleyold_combined.to_csv("../data/nfl_2021/week" + str(week) + "/20and21combined_colley_ratings.csv", index = False)
    
        # Aggregate the above ratings with Massey as dictator
        massey_agg_combined = aggregate_lists(datold_combined, [masseyold_combined, colleyold_combined], get_massey)
        massey_agg_combined_ratings = massey_agg_combined[1]
    
        # Check for ties
        massey_combined_tie = not len(massey_agg_combined_ratings["Rank"].unique()) == 32
        print("Is there a tie in the Massey-combined combined list?", massey_combined_tie)
        while massey_combined_tie:
            # Call break_tie()
            massey_agg_combined_ratings = break_tie(datold_combined, massey_agg_combined_ratings, masseyold_combined)
            # Check again for a tie
            massey_combined_tie = not len(massey_agg_combined_ratings["Rank"].unique()) == 32
        massey_agg_combined_ratings.to_csv("../data/nfl_2021/week" + str(week) + "/20and21aggregated_combined_massey_ratings.csv", index = False)
    
        # Merge old ratings with new, replaced ratings
        massey_agg_ratings = massey_agg_ratings.merge(massey_agg_combined_ratings, on = "Team", suffixes = ("_old", "_new"))
       
        # Regress old ratings to the mean by 1/3
        massey_agg_ratings["Rating_old"] = get_weighted_rating(0, massey_agg_ratings["Rating_old"], 0)
        massey_agg_ratings["O Rating_old"] = get_weighted_rating(0, massey_agg_ratings["O Rating_old"], 0)
        massey_agg_ratings["D Rating_old"] = get_weighted_rating(0, massey_agg_ratings["D Rating_old"], 0)
    
        # Get weighted ratings
        massey_agg_ratings["Weighted Rating"] = massey_agg_ratings.apply(lambda x: get_weighted_rating(week, 
                                                                                               x["Rating_old"], 
                                                                                               x["Rating_new"]), 
                                                                         axis = 1)
        massey_agg_ratings["Weighted O Rating"] = massey_agg_ratings.apply(lambda x: get_weighted_rating(week, 
                                                                                               x["O Rating_old"], 
                                                                                               x["O Rating_new"]), 
                                                                         axis = 1)
        massey_agg_ratings["Weighted D Rating"] = massey_agg_ratings.apply(lambda x: get_weighted_rating(week, 
                                                                                               x["D Rating_old"], 
                                                                                               x["D Rating_new"]), 
                                                                         axis = 1)
        # Rank ratings
        massey_agg_ratings["Rank"] = massey_agg_ratings["Weighted Rating"].rank(ascending = False)
        massey_agg_ratings = massey_agg_ratings.sort_values(by = "Rank", ascending = True).reset_index(drop = True)
    
    else:  # True if through 9 weeks or more
        # Get standard Massey and Colley ratings for new data
        Mnew, masseynew = get_massey(datnew)
        Cnew, colleynew = get_colley(datnew, Mnew)
        masseynew.to_csv("../data/nfl_2021/week" + str(week) + "/2021standard_massey_ratings.csv", index = False)
        colleynew.to_csv("../data/nfl_2021/week" + str(week) + "/2021standard_colley_ratings.csv", index = False)
    
        # Aggregate the above ratings with Massey as dictator
        massey_agg = aggregate_lists(datnew, [masseynew, colleynew], get_massey)
        massey_agg_ratings = massey_agg[1]
    
        # Check for ties
        massey_tie = not len(massey_agg_ratings["Rank"].unique()) == 32
        print("Is there a tie in the Massey-combined list?", massey_tie)
        while massey_tie:
            # Call break_tie()
            massey_agg_ratings = break_tie(datnew, massey_agg_ratings, masseynew)
            # Check again for a tie
            massey_tie = not len(massey_agg_ratings["Rank"].unique()) == 32
        massey_agg_ratings.to_csv("../data/nfl_2021/week" + str(week) + "/2021aggregated_massey_ratings.csv", index = False)

    # Write final ratings to csv
    massey_agg_ratings.to_csv("../data/nfl_2021/week" + str(week) + "/week" + str(week) + "final_massey_ratings.csv", index = False) 

    # Return ratings!
    return massey_agg_ratings

In [15]:
# Rate!

## CHANGE AS NECESSARY 

# The data
dat_old = load_data(2020)
dat_new = load_data(2021)

# How many weeks into the season we are
w = 13  # Through week 13

# Whether to replace
replace = False
    
final_ratings = do_ratings(dat_old, dat_new, w, replace)
final_ratings

Is there a tie in the Massey-combined list? False


Unnamed: 0,Team,Rating,O Rating,D Rating,Rank
0,Arizona Cardinals,11.875,17.943683,-6.068683,1.0
1,Tampa Bay Buccaneers,10.921875,17.008787,-6.086912,2.0
2,New England Patriots,10.125,16.352016,-6.227016,3.0
3,Dallas Cowboys,9.671875,15.850454,-6.178579,4.0
4,Green Bay Packers,8.9375,15.358266,-6.420766,5.0
5,Kansas City Chiefs,8.65625,14.984308,-6.328058,6.0
6,Buffalo Bills,7.046875,14.021287,-6.974412,7.0
7,Tennessee Titans,5.59375,12.453058,-6.859308,8.0
8,Los Angeles Rams,5.359375,12.277537,-6.918162,9.0
9,Baltimore Ravens,5.140625,12.268162,-7.127537,10.0


### How did our ratings do?

In [16]:
# Make WoW data frame
rating_lst = []

for i in range(0, w + 1):
    sub_new = dat_new[dat_new["Week"] <= i]
    rating_w = do_ratings(dat_old, sub_new, i, replace)
    if i >= 0 and i <= 8:
        rating_w = rating_w.rename(columns = {"Weighted Rating": "Rating Week " + str(i)})
    else:
        rating_w = rating_w.rename(columns = {"Rating": "Rating Week " + str(i)})
    rating_lst.append(rating_w)

Is there a tie in the Massey-combined list? False
Is there a tie in the Massey-combined list? False
Is there a tie in the Massey-combined combined list? False
Is there a tie in the Massey-combined list? False
Is there a tie in the Massey-combined combined list? False
Is there a tie in the Massey-combined list? False
Is there a tie in the Massey-combined combined list? False
Is there a tie in the Massey-combined list? False
Is there a tie in the Massey-combined combined list? False
Is there a tie in the Massey-combined list? False
Is there a tie in the Massey-combined combined list? False
Is there a tie in the Massey-combined list? False
Is there a tie in the Massey-combined combined list? False
Is there a tie in the Massey-combined list? False
Is there a tie in the Massey-combined combined list? False
Is there a tie in the Massey-combined list? False
Is there a tie in the Massey-combined combined list? False


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


Is there a tie in the Massey-combined list? False
Is there a tie in the Massey-combined list? False
Is there a tie in the Massey-combined list? False
Is there a tie in the Massey-combined list? False
Is there a tie in the Massey-combined list? False


In [17]:
# Join
## DONE FOR WEEKS 0 - 13
#wow_df = reduce(lambda x, y: pd.merge(x, y, on = 'Team', how = "inner"), rating_lst)

# Read in the data
wow_df = pd.read_csv('../data/nfl_wow_df.csv').iloc[:, 1:]

# Join with latest ratings
wow_df = wow_df.merge(final_ratings, on = "Team", how = "inner")

# Write data
wow_df.to_csv("../data/nfl_wow_df.csv")

In [18]:
wow_df.tail()

Unnamed: 0,Team,Rating,O Rating_x,D Rating_x,Rank_x,Rating Week 0,Weighted O Rating_x,Weighted D Rating_x,Rating_old_x,O Rating_old_x,...,D Rating_y.1,Rank_y.5,Rating Week 12,O Rating_x.2,D Rating_x.2,Rank_x.6,Rating Week 13,O Rating_y.2,D Rating_y.2,Rank_y.6
27,Detroit Lions,-8.734375,7.866952,-16.601327,28.0,-5.822917,5.244635,-11.067552,-5.822917,5.244635,...,-19.260433,32.0,-12.796875,6.080595,-18.87747,32.0,-11.359375,6.134829,-17.494204,29.0
28,Houston Texans,-8.84375,7.837265,-16.681015,29.0,-5.895833,5.224843,-11.120677,-5.895833,5.224843,...,-17.504183,29.0,-11.796875,6.097261,-17.894136,31.0,-12.875,6.085349,-18.960349,31.0
29,Philadelphia Eagles,-8.859375,7.887786,-16.747161,30.0,-5.90625,5.258524,-11.164774,-5.90625,5.258524,...,-9.668767,18.0,-1.625,8.441532,-10.066532,20.0,-1.046875,8.674412,-9.721287,20.0
30,New York Jets,-12.84375,7.637265,-20.481015,31.0,-8.5625,5.09151,-13.65401,-8.5625,5.09151,...,-19.261475,31.0,-11.453125,6.135803,-17.588928,29.0,-11.515625,6.12337,-17.638995,30.0
31,Jacksonville Jaguars,-13.40625,7.639348,-21.045598,32.0,-8.9375,5.092899,-14.030399,-8.9375,5.092899,...,-18.035954,30.0,-11.546875,6.105595,-17.65247,30.0,-12.890625,6.08587,-18.976495,32.0


In [21]:
# Takes game data, a set of ratings, a week >= 1, and returns foresight accuracy for given week
def foresight_acc(game_dat, ratings, week):
    week_games = game_dat[game_dat['Week'] == week]
    prev_week_ratings = ratings.loc[:, ['Team', 'Rating Week ' + str(week - 1)]]
    merged = (week_games.merge(prev_week_ratings, left_on = 'Winner', right_on = 'Team')
         .merge(prev_week_ratings, left_on = 'Loser', right_on = 'Team', suffixes=('_W', '_L')))
    #print(merged.head())
    # If tie in ratings, predict home team
    conditions = [merged['Rating Week ' + str(week - 1) + '_W'] > merged['Rating Week ' + str(week - 1) + '_L'], 
                 #(merged['Rating Week ' + str(week - 1) + '_W'] == merged['Rating Week ' + str(week - 1) + '_L'])
                  #& (merged['Winner'] == merged['Home Team']),
                  #(merged['Rating Week ' + str(week - 1) + '_W'] == merged['Rating Week ' + str(week - 1) + '_L'])
                  #& (merged['Winner'] != merged['Home Team']),
                 merged['Rating Week ' + str(week - 1) + '_W'] < merged['Rating Week ' + str(week - 1) + '_L']]
    choices =  [1, 0]#[1, 1, 0, 0]
    merged['Correct?'] = np.select(conditions, choices)
    n_correct = sum(merged['Correct?'])
    n_games = merged.shape[0]
    week_acc = n_correct / n_games
    return n_correct, n_games, week_acc

In [22]:
# Find foresight accuracy week over week
foresight_accs = []
correct = 0
total = 0
for i in range(1, w + 1):
    res = foresight_acc(dat_new, wow_df, i)
    foresight_accs.append(res[2])
    correct += res[0]
    total += res[1]
foresight_accs

[0.6875,
 0.6875,
 0.625,
 0.625,
 0.8125,
 0.7142857142857143,
 0.6923076923076923,
 0.6666666666666666,
 0.6428571428571429,
 0.42857142857142855,
 0.6,
 0.4666666666666667,
 0.5714285714285714]

In [23]:
print(correct)
print(total)

123
194


In [24]:
# Average foresight accuracy week over week
np.mean(foresight_accs)

0.6323295294449142

In [25]:
# Total foresight accuracy
correct / total

0.634020618556701

In [26]:
# Takes game data, the LATEST set of ratings, and returns hindsight accuracy week over week
def hindsight_acc(game_dat, ratings):
    #game_dat = game_dat[~game_dat['Pts_W'].isna()]
    latest_ratings = ratings.loc[:, ['Team', 'Rating Week ' + str(w)]]  # week w is globally defined
    #print(latest_ratings)
    merged = (game_dat.merge(latest_ratings, left_on = 'Winner', right_on = 'Team',)
         .merge(latest_ratings, left_on = 'Loser', right_on = 'Team', suffixes = ('_W', '_L')))
    merged['Correct?'] = np.where(merged['Rating Week ' + str(w) + '_W'] > merged['Rating Week ' + str(w) + '_L'], 1, 0)
    hind_acc = np.mean(merged['Correct?'])
    return hind_acc

In [27]:
# Hindsight accuracy
hindsight_acc(dat_new, wow_df)

0.711340206185567