In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.ticker as plticker
import numpy as np

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

plt.style.use('seaborn-talk')
plt.style.use('ggplot')

pd.set_option('display.max_columns', 7)

  plt.style.use('seaborn-talk')


In [2]:
YEARS = range(2010,2023)

data = pd.DataFrame()

for i in YEARS:  
    i_data = pd.read_csv('https://github.com/nflverse/nflverse-data/releases/download/pbp/' \
                   'play_by_play_' + str(i) + '.csv.gz',
                   compression= 'gzip', low_memory= False)

    data = pd.concat([data, i_data], sort=True)

In [3]:
print(data.columns)

Index(['aborted_play', 'air_epa', 'air_wpa', 'air_yards', 'assist_tackle',
       'assist_tackle_1_player_id', 'assist_tackle_1_player_name',
       'assist_tackle_1_team', 'assist_tackle_2_player_id',
       'assist_tackle_2_player_name',
       ...
       'xyac_median_yardage', 'xyac_success', 'yac_epa', 'yac_wpa',
       'yardline_100', 'yards_after_catch', 'yards_gained', 'ydsnet',
       'ydstogo', 'yrdln'],
      dtype='object', length=372)


In [234]:
import functools
def dynamic_window_ewma(x):
    """
    Calculate rolling exponentially weighted EPA with a dynamic window size
    """
    values = np.zeros(len(x))
    for i, (_, row) in enumerate(x.iterrows()):
        epa = x.epa_shifted[:i+1]
        if row.week > 10:
            values[i] = epa.ewm(min_periods=1, span=row.week).mean().values[-1]
        else:
            values[i] = epa.ewm(min_periods=1, span=10).mean().values[-1]
            
    return pd.Series(values, index=x.index)

offense_columns_epas = {}
defense_columns_epas = {}

feats = [
    "rush_attempt",
    "pass_attempt",
    "extra_point_attempt",
    "two_point_attempt",
    "field_goal_attempt",
    #"complete_pass",
    "qb_hit",
    #"tackled_for_loss",
    #"penalty",
    #"sack",
    #"fumble",

]

for column in feats:
    df = data.loc[data[column] == 1, :]\
    .groupby(['posteam', 'season', 'week'], as_index=False)['epa'].mean()

    if not df.empty:
        offense_columns_epas[column] = df

    df = data.loc[data[column] == 1, :]\
    .groupby(['defteam', 'season', 'week'], as_index=False)['epa'].mean()
    
    if not df.empty:
        defense_columns_epas[column] = df



In [235]:

#print(defense_columns_epas)
for column_epa in offense_columns_epas.keys():
    #print(column_epa)
    offense_columns_epas[column_epa]['epa_shifted'] = offense_columns_epas[column_epa].groupby('posteam')['epa'].shift()
    offense_columns_epas[column_epa]['ewma'] = offense_columns_epas[column_epa].groupby('posteam')['epa_shifted']\
    .transform(lambda x: x.ewm(min_periods=1, span=10).mean())
    offense_columns_epas[column_epa]['ewma_dynamic_window'] = offense_columns_epas[column_epa].groupby('posteam')\
    .apply(dynamic_window_ewma).values

for column_epa in defense_columns_epas.keys():
    #print(column_epa)
    defense_columns_epas[column_epa]['epa_shifted'] = defense_columns_epas[column_epa].groupby('defteam')['epa'].shift()
    defense_columns_epas[column_epa]['ewma'] = defense_columns_epas[column_epa].groupby('defteam')['epa_shifted']\
    .transform(lambda x: x.ewm(min_periods=1, span=10).mean())
    defense_columns_epas[column_epa]['ewma_dynamic_window'] = defense_columns_epas[column_epa].groupby('defteam')\
    .apply(dynamic_window_ewma).values


#print(offense_columns_epas)

offense_epa_stats = list(offense_columns_epas.values())
defense_epa_stats = list(defense_columns_epas.values())
suffixes = ['_'+suffix for suffix in list(offense_columns_epas.keys())]

offense_epa = offense_epa_stats[0]
defense_epa = defense_epa_stats[0]

for i in range(1,len(offense_epa_stats)):
    
    offense_epa = offense_epa.merge(offense_epa_stats[i], on=['posteam', 'season', 'week'], suffixes=(suffixes[i-1],suffixes[i]))
    defense_epa = defense_epa.merge(defense_epa_stats[i], on=['defteam', 'season', 'week'], suffixes=(suffixes[i-1],suffixes[i]))

offense_epa = offense_epa.rename(columns={'posteam':'team'})
defense_epa = defense_epa.rename(columns={'defteam':'team'})

print(offense_epa)

#offense_epa = functools.reduce(lambda  left,right: pd.merge(left,right,on=['posteam', 'season', 'week']), offense_epa_stats).rename(columns={'posteam':'team'})
#defense_epa = functools.reduce(lambda  left,right: pd.merge(left,right,on=['defteam', 'season', 'week']), defense_epa_stats).rename(columns={'defteam':'team'})


#Merge all the data together
'''offense_epa = rushing_offense_epa.merge(passing_offense_epa, on=['posteam', 'season', 'week'], suffixes=('_rushing', '_passing'))\
.rename(columns={'posteam': 'team'})
defense_epa = rushing_defense_epa.merge(passing_defense_epa, on=['defteam', 'season', 'week'], suffixes=('_rushing', '_passing'))\
.rename(columns={'defteam': 'team'})'''
epa = offense_epa.merge(defense_epa, on=['team', 'season', 'week'], suffixes=('_offense', '_defense'))

#remove the first season of data
epa = epa.loc[epa['season'] != epa['season'].unique()[0], :]

epa = epa.reset_index(drop=True)

epa.head()

     team  season  week  ...  epa_shifted      ewma  ewma_dynamic_window
0     ARI    2010     1  ...          NaN       NaN                  NaN
1     ARI    2010     2  ...     0.180073  0.180073             0.180073
2     ARI    2010     3  ...    -0.879389 -0.402631            -0.402631
3     ARI    2010     4  ...    -1.193457 -0.720538            -0.720538
4     ARI    2010     5  ...    -1.527638 -0.986442            -0.986442
...   ...     ...   ...  ...          ...       ...                  ...
5504  WAS    2022    12  ...    -0.067660 -1.166918            -1.177798
5505  WAS    2022    13  ...     0.627167 -0.840721            -0.922467
5506  WAS    2022    15  ...    -0.683689 -0.812169            -0.923327
5507  WAS    2022    17  ...    -1.575660 -1.186319            -1.162555
5508  WAS    2022    18  ...    -1.371051 -1.219907            -1.183529

[5509 rows x 23 columns]


Unnamed: 0,team,season,week,...,epa_shifted_defense,ewma_defense,ewma_dynamic_window_defense
0,ARI,2011,3,...,0.041251,-0.812416,-0.812416
1,ARI,2011,4,...,-0.702975,-0.792068,-0.792068
2,ARI,2011,5,...,-0.743171,-0.783014,-0.783014
3,ARI,2011,8,...,-0.495798,-0.767411,-0.767411
4,ARI,2011,10,...,-1.340605,-0.763054,-0.763054


In [236]:
schedule = data[['season', 'week', 'home_team', 'away_team', 'home_score', 'away_score']]\
.drop_duplicates().reset_index(drop=True)\
.assign(home_team_win = lambda x: (x.home_score > x.away_score).astype(int))

df = schedule.merge(epa.rename(columns={'team': 'home_team'}), on=['home_team', 'season', 'week'])\
.merge(epa.rename(columns={'team': 'away_team'}), on=['away_team', 'season', 'week'], suffixes=('_home', '_away'))

df.head()

Unnamed: 0,season,week,home_team,...,epa_shifted_defense_away,ewma_defense_away,ewma_dynamic_window_defense_away
0,2011,1,KC,...,-2.52124,-1.463558,-1.463558
1,2011,1,CLE,...,-1.864505,-0.854676,-0.854676
2,2011,1,NYJ,...,-0.802257,-0.626768,-0.626768
3,2011,1,HOU,...,-1.755864,-1.086451,-1.086451
4,2011,1,LAC,...,-5.247881,-1.706266,-1.706266


In [237]:
target = 'home_team_win'
print(df.columns)
features = [column for column in df.columns if 'ewma' in column and 'dynamic' in column]
for feature in features:
  print(feature)

Index(['season', 'week', 'home_team', 'away_team', 'home_score', 'away_score',
       'home_team_win', 'epa_rush_attempt_offense_home',
       'epa_shifted_rush_attempt_offense_home',
       'ewma_rush_attempt_offense_home',
       'ewma_dynamic_window_rush_attempt_offense_home',
       'epa_pass_attempt_offense_home',
       'epa_shifted_pass_attempt_offense_home',
       'ewma_pass_attempt_offense_home',
       'ewma_dynamic_window_pass_attempt_offense_home',
       'epa_extra_point_attempt_offense_home',
       'epa_shifted_extra_point_attempt_offense_home',
       'ewma_extra_point_attempt_offense_home',
       'ewma_dynamic_window_extra_point_attempt_offense_home',
       'epa_field_goal_attempt_offense_home',
       'epa_shifted_field_goal_attempt_offense_home',
       'ewma_field_goal_attempt_offense_home',
       'ewma_dynamic_window_field_goal_attempt_offense_home',
       'epa_offense_home', 'epa_shifted_offense_home', 'ewma_offense_home',
       'ewma_dynamic_window_offense_

In [238]:
df = df.dropna()

X = df.loc[df['season'] != 2023, features].values
y = df.loc[df['season'] != 2023, target].values

print(df.loc[df['season'] != 2023])

clf = LogisticRegression()
clf.fit(X, y)

      season  week home_team  ... epa_shifted_defense_away  ewma_defense_away  \
0       2011     1        KC  ...                -2.521240          -1.463558   
1       2011     1       CLE  ...                -1.864505          -0.854676   
2       2011     1       NYJ  ...                -0.802257          -0.626768   
3       2011     1       HOU  ...                -1.755864          -1.086451   
4       2011     1       LAC  ...                -5.247881          -1.706266   
...      ...   ...       ...  ...                      ...                ...   
1978    2022    20       BUF  ...                -0.652381          -0.851103   
1979    2022    20        SF  ...                -1.253254          -1.322795   
1980    2022    20        KC  ...                -0.181864          -0.673056   
1981    2022    21        KC  ...                -0.048040          -0.705092   
1982    2022    22       PHI  ...                -1.024704          -1.086291   

      ewma_dynamic_window_d

In [239]:
accuracy_scores = cross_val_score(clf, X, y, cv=14)
log_losses = cross_val_score(clf, X, y, cv=10, scoring='neg_log_loss')

print('Model Accuracy:', np.mean(accuracy_scores))

Model Accuracy: 0.6071535596558072


In [240]:
print('Neg log loss:', np.mean(log_losses))

Neg log loss: -0.6487224658681197


In [221]:
df_2020 = df.loc[(df['season'] == 2022)].assign(
    predicted_winner = lambda x: clf.predict(x[features]),
    home_team_win_probability = lambda x: clf.predict_proba(x[features])[:, 1]
)\
[['home_team', 'away_team', 'week', 'predicted_winner', 'home_team_win_probability', 'home_team_win']]

df_2020['actual_winner'] = df_2020.apply(lambda x: x.home_team if x.home_team_win else x.away_team, axis=1)
df_2020['predicted_winner'] = df_2020.apply(lambda x: x.home_team if x.predicted_winner == 1 else x.away_team, axis=1)
df_2020['win_probability'] = df_2020.apply(lambda x: x.home_team_win_probability if x.predicted_winner == x.home_team else 1 - x.home_team_win_probability, axis=1)
df_2020['correct_prediction'] = (df_2020['predicted_winner'] == df_2020['actual_winner']).astype(int)

df_2020 = df_2020.drop(columns=['home_team_win_probability', 'home_team_win'])

df_2020.sort_values(by='win_probability', ascending=False).reset_index(drop=True).head(10)



Unnamed: 0,home_team,away_team,week,predicted_winner,actual_winner,win_probability,correct_prediction
0,KC,LV,5,KC,KC,0.7824,1
1,BUF,CLE,11,BUF,BUF,0.733102,1
2,MIN,CHI,5,MIN,MIN,0.719605,1
3,SF,SEA,19,SF,SF,0.716005,1
4,TB,ATL,5,TB,TB,0.711579,1
5,JAX,BAL,12,JAX,JAX,0.68536,1
6,NO,SEA,5,SEA,NO,0.596017,0
7,CAR,ARI,4,ARI,ARI,0.57684,1
8,TB,CIN,15,CIN,CIN,0.562458,1
9,BUF,MIA,15,BUF,BUF,0.548883,1


In [222]:
correct = df_2020.loc[df_2020['correct_prediction'] == 1].groupby('week')['correct_prediction'].sum()

num_games = df_2020.groupby('week')['correct_prediction'].size()

results = correct / num_games

results

week
4     1.00
5     0.75
6     1.00
11    1.00
12    1.00
15    1.00
19    1.00
Name: correct_prediction, dtype: float64

In [223]:
print(df_2020.loc[df_2020['week'] == results.idxmax()].sort_values(by='win_probability', ascending=False))

   home_team away_team  week predicted_winner actual_winner  win_probability  \
42       CAR       ARI     4              ARI           ARI          0.57684   

    correct_prediction  
42                   1  


In [224]:
df_2020.loc[df_2020['week'] > 17]

Unnamed: 0,home_team,away_team,week,predicted_winner,actual_winner,win_probability,correct_prediction
52,SF,SEA,19,SF,SF,0.716005,1


In [225]:
import itertools

def ewma(data, window):
    """
    Calculate the most recent value for EWMA given an array of data and a window size
    """
    alpha = 2 / (window + 1.0)
    alpha_rev = 1 - alpha
    scale = 1 / alpha_rev
    n = data.shape[0]
    r = np.arange(n)
    scale_arr = scale**r
    offset = data[0] * alpha_rev**(r+1)
    pw0 = alpha * alpha_rev**(n-1)
    mult = data * pw0 * scale_arr
    cumsums = mult.cumsum()
    out = offset + cumsums * scale_arr[::-1]
    return out[-1]

data_2020 = data.loc[(data['season'] == 2022)]
offense = data_2020.loc[(data_2020['posteam'] == 'KC') | (data_2020['posteam'] == 'PHI')]
defense = data_2020.loc[(data_2020['defteam'] == 'KC') | (data_2020['defteam'] == 'PHI')]



offense_dic = {}
defense_dic = {}

for f in feats:
    offense_dic[f] = offense.loc[offense[f] == 1]\
    .groupby(['posteam', 'week'], as_index=False)['epa'].mean().rename(columns={'posteam': 'team'})
    defense_dic[f] = defense.loc[defense[f] == 1]\
    .groupby(['defteam', 'week'], as_index=False)['epa'].mean().rename(columns={'defteam': 'team'})
    

super_bowl_X = np.zeros(len(feats) * 4)

for i, (tm, stat_df) in enumerate(itertools.product(['KC', 'PHI'], (list(offense_dic.values()) + list(defense_dic.values())))):
    #print(stat_df)
    ewma_value = ewma(stat_df.loc[stat_df['team'] == tm]['epa'].values, len(feats) * 4)
    super_bowl_X[i] = ewma_value

predicted_winner = clf.predict(super_bowl_X.reshape(1, len(feats) * 4))[0]
predicted_proba = clf.predict_proba(super_bowl_X.reshape(1, len(feats) * 4))[0]

winner = 'KC' if predicted_winner else 'PHI'
win_prob = predicted_proba[-1] if predicted_winner else predicted_proba[0]

print(f'Model predicts {winner} will win the Super Bowl and has a {round(win_prob*100, 2)}% win probability')

Model predicts KC will win the Super Bowl and has a 90.11% win probability
