In [3]:
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.ticker as plticker
import numpy as np

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

plt.style.use('seaborn-talk')
plt.style.use('ggplot')

pd.set_option('display.max_columns', 7)

  plt.style.use('seaborn-talk')


In [6]:
YEARS = range(2010,2023)

data = pd.DataFrame()

for i in YEARS:  
    i_data = pd.read_csv('https://github.com/nflverse/nflverse-data/releases/download/pbp/' \
                   'play_by_play_' + str(i) + '.csv.gz',
                   compression= 'gzip', low_memory= False)

    data = pd.concat([data, i_data], sort=True)

In [42]:
print(data.columns)

Index(['aborted_play', 'air_epa', 'air_wpa', 'air_yards', 'assist_tackle',
       'assist_tackle_1_player_id', 'assist_tackle_1_player_name',
       'assist_tackle_1_team', 'assist_tackle_2_player_id',
       'assist_tackle_2_player_name',
       ...
       'xyac_median_yardage', 'xyac_success', 'yac_epa', 'yac_wpa',
       'yardline_100', 'yards_after_catch', 'yards_gained', 'ydsnet',
       'ydstogo', 'yrdln'],
      dtype='object', length=372)


In [62]:
import functools
def dynamic_window_ewma(x):
    """
    Calculate rolling exponentially weighted EPA with a dynamic window size
    """
    values = np.zeros(len(x))
    for i, (_, row) in enumerate(x.iterrows()):
        epa = x.epa_shifted[:i+1]
        if row.week > 10:
            values[i] = epa.ewm(min_periods=1, span=row.week).mean().values[-1]
        else:
            values[i] = epa.ewm(min_periods=1, span=10).mean().values[-1]
            
    return pd.Series(values, index=x.index)

# seperate EPA in to rushing offense, rushing defense, passing offense, passing defense for each team
rushing_offense_epa = data.loc[data['rush_attempt'] == 1, :]\
.groupby(['posteam', 'season', 'week'], as_index=False)['epa'].mean()

rushing_defense_epa = data.loc[data['rush_attempt'] == 1, :]\
.groupby(['defteam', 'season', 'week'], as_index=False)['epa'].mean()

passing_offense_epa = data.loc[data['pass_attempt'] == 1, :]\
.groupby(['posteam', 'season', 'week'], as_index=False)['epa'].mean()

passing_defense_epa = data.loc[data['pass_attempt'] == 1, :]\
.groupby(['defteam', 'season', 'week'], as_index=False)['epa'].mean()

extra_point_offense_epa = data.loc[data['extra_point_attempt'] == 1, :]\
.groupby(['posteam', 'season', 'week'], as_index=False)['epa'].mean()

extra_point_defense_epa = data.loc[data['extra_point_attempt'] == 1, :]\
.groupby(['defteam', 'season', 'week'], as_index=False)['epa'].mean()

two_point_offense_epa = data.loc[data['two_point_attempt'] == 1, :]\
.groupby(['posteam', 'season', 'week'], as_index=False)['epa'].mean()

two_point_defense_epa = data.loc[data['two_point_attempt'] == 1, :]\
.groupby(['defteam', 'season', 'week'], as_index=False)['epa'].mean()

field_goal_offense_epa = data.loc[data['field_goal_attempt'] == 1, :]\
.groupby(['posteam', 'season', 'week'], as_index=False)['epa'].mean()

field_goal_defense_epa = data.loc[data['field_goal_attempt'] == 1, :]\
.groupby(['defteam', 'season', 'week'], as_index=False)['epa'].mean()

# lag EPA one period back
rushing_offense_epa['epa_shifted'] = rushing_offense_epa.groupby('posteam')['epa'].shift()
rushing_defense_epa['epa_shifted'] = rushing_defense_epa.groupby('defteam')['epa'].shift()
passing_offense_epa['epa_shifted'] = passing_offense_epa.groupby('posteam')['epa'].shift()
passing_defense_epa['epa_shifted'] = passing_defense_epa.groupby('defteam')['epa'].shift()

extra_point_offense_epa['epa_shifted'] = rushing_offense_epa.groupby('posteam')['epa'].shift()
extra_point_defense_epa['epa_shifted'] = rushing_defense_epa.groupby('defteam')['epa'].shift()
two_point_offense_epa['epa_shifted'] = passing_offense_epa.groupby('posteam')['epa'].shift()
two_point_defense_epa['epa_shifted'] = passing_defense_epa.groupby('defteam')['epa'].shift()
field_goal_offense_epa['epa_shifted'] = passing_offense_epa.groupby('posteam')['epa'].shift()
field_goal_defense_epa['epa_shifted'] = passing_defense_epa.groupby('defteam')['epa'].shift()

# In each case, calculate EWMA with a static window and dynamic window and assign it as a column 
rushing_offense_epa['ewma'] = rushing_offense_epa.groupby('posteam')['epa_shifted']\
.transform(lambda x: x.ewm(min_periods=1, span=10).mean())

rushing_offense_epa['ewma_dynamic_window'] = rushing_offense_epa.groupby('posteam')\
.apply(dynamic_window_ewma).values

rushing_defense_epa['ewma'] = rushing_defense_epa.groupby('defteam')['epa_shifted']\
.transform(lambda x: x.ewm(min_periods=1, span=10).mean())

rushing_defense_epa['ewma_dynamic_window'] = rushing_defense_epa.groupby('defteam')\
.apply(dynamic_window_ewma).values

passing_offense_epa['ewma'] = passing_offense_epa.groupby('posteam')['epa_shifted']\
.transform(lambda x: x.ewm(min_periods=1, span=10).mean())

passing_offense_epa['ewma_dynamic_window'] = passing_offense_epa.groupby('posteam')\
.apply(dynamic_window_ewma).values

passing_defense_epa['ewma'] = passing_defense_epa.groupby('defteam')['epa_shifted']\
.transform(lambda x: x.ewm(min_periods=1, span=10).mean())

passing_defense_epa['ewma_dynamic_window'] = passing_defense_epa.groupby('defteam')\
.apply(dynamic_window_ewma).values

#-----------------------------------------------------------------------------------
extra_point_offense_epa['ewma'] = extra_point_offense_epa.groupby('posteam')['epa_shifted']\
.transform(lambda x: x.ewm(min_periods=1, span=10).mean())

extra_point_offense_epa['ewma_dynamic_window'] = extra_point_offense_epa.groupby('posteam')\
.apply(dynamic_window_ewma).values

extra_point_defense_epa['ewma'] = extra_point_defense_epa.groupby('defteam')['epa_shifted']\
.transform(lambda x: x.ewm(min_periods=1, span=10).mean())

extra_point_defense_epa['ewma_dynamic_window'] = extra_point_defense_epa.groupby('defteam')\
.apply(dynamic_window_ewma).values

two_point_offense_epa['ewma'] = two_point_offense_epa.groupby('posteam')['epa_shifted']\
.transform(lambda x: x.ewm(min_periods=1, span=10).mean())

two_point_offense_epa['ewma_dynamic_window'] = two_point_offense_epa.groupby('posteam')\
.apply(dynamic_window_ewma).values

two_point_defense_epa['ewma'] = two_point_defense_epa.groupby('defteam')['epa_shifted']\
.transform(lambda x: x.ewm(min_periods=1, span=10).mean())

two_point_defense_epa['ewma_dynamic_window'] = two_point_defense_epa.groupby('defteam')\
.apply(dynamic_window_ewma).values

field_goal_offense_epa['ewma'] = field_goal_offense_epa.groupby('posteam')['epa_shifted']\
.transform(lambda x: x.ewm(min_periods=1, span=10).mean())

field_goal_offense_epa['ewma_dynamic_window'] = field_goal_offense_epa.groupby('posteam')\
.apply(dynamic_window_ewma).values

field_goal_defense_epa['ewma'] = field_goal_defense_epa.groupby('defteam')['epa_shifted']\
.transform(lambda x: x.ewm(min_periods=1, span=10).mean())

field_goal_defense_epa['ewma_dynamic_window'] = field_goal_defense_epa.groupby('defteam')\
.apply(dynamic_window_ewma).values

offense_epa_stats = [rushing_offense_epa ,passing_offense_epa, extra_point_offense_epa, two_point_offense_epa, field_goal_offense_epa]
defense_epa_stats = [rushing_defense_epa ,passing_defense_epa, extra_point_defense_epa, two_point_defense_epa, field_goal_defense_epa]



offense_epa = functools.reduce(lambda  left,right: pd.merge(left,right,on=['posteam', 'season', 'week']), offense_epa_stats).rename(columns={'posteam':'team'})
defense_epa = functools.reduce(lambda  left,right: pd.merge(left,right,on=['defteam', 'season', 'week']), defense_epa_stats).rename(columns={'defteam':'team'})


#Merge all the data together
'''offense_epa = rushing_offense_epa.merge(passing_offense_epa, on=['posteam', 'season', 'week'], suffixes=('_rushing', '_passing'))\
.rename(columns={'posteam': 'team'})
defense_epa = rushing_defense_epa.merge(passing_defense_epa, on=['defteam', 'season', 'week'], suffixes=('_rushing', '_passing'))\
.rename(columns={'defteam': 'team'})'''
epa = offense_epa.merge(defense_epa, on=['team', 'season', 'week'], suffixes=('_offense', '_defense'))

#remove the first season of data
epa = epa.loc[epa['season'] != epa['season'].unique()[0], :]

epa = epa.reset_index(drop=True)

epa.head()

  offense_epa = functools.reduce(lambda  left,right: pd.merge(left,right,on=['posteam', 'season', 'week']), offense_epa_stats).rename(columns={'posteam':'team'})
  defense_epa = functools.reduce(lambda  left,right: pd.merge(left,right,on=['defteam', 'season', 'week']), defense_epa_stats).rename(columns={'defteam':'team'})


Unnamed: 0,team,season,week,...,epa_shifted_defense,ewma_defense,ewma_dynamic_window_defense
0,ARI,2021,17,...,0.111737,-0.012883,0.02123
1,ARI,2022,4,...,0.058177,-0.060248,-0.060248
2,ATL,2010,7,...,0.098798,0.168019,0.168019
3,ATL,2015,5,...,-0.058651,0.182314,0.182314
4,ATL,2016,8,...,0.221841,0.246001,0.246001


In [63]:
schedule = data[['season', 'week', 'home_team', 'away_team', 'home_score', 'away_score']]\
.drop_duplicates().reset_index(drop=True)\
.assign(home_team_win = lambda x: (x.home_score > x.away_score).astype(int))

df = schedule.merge(epa.rename(columns={'team': 'home_team'}), on=['home_team', 'season', 'week'])\
.merge(epa.rename(columns={'team': 'away_team'}), on=['away_team', 'season', 'week'], suffixes=('_home', '_away'))

df.head()

Unnamed: 0,season,week,home_team,...,epa_shifted_defense_away,ewma_defense_away,ewma_dynamic_window_defense_away
0,2010,7,ATL,...,-0.137667,-0.064572,-0.064572
1,2010,8,DET,...,-0.042498,-0.237979,-0.237979
2,2010,10,PIT,...,-0.179258,0.130517,0.130517
3,2011,19,SF,...,0.399189,0.174513,0.153334
4,2012,1,DEN,...,0.409467,-0.04681,-0.04681


In [64]:
target = 'home_team_win'
features = [column for column in df.columns if 'ewma' in column and 'dynamic' in column]
for feature in features:
  print(feature)

ewma_dynamic_window_x_offense_home
ewma_dynamic_window_y_offense_home
ewma_dynamic_window_x_offense_home
ewma_dynamic_window_y_offense_home
ewma_dynamic_window_offense_home
ewma_dynamic_window_x_defense_home
ewma_dynamic_window_y_defense_home
ewma_dynamic_window_x_defense_home
ewma_dynamic_window_y_defense_home
ewma_dynamic_window_defense_home
ewma_dynamic_window_x_offense_away
ewma_dynamic_window_y_offense_away
ewma_dynamic_window_x_offense_away
ewma_dynamic_window_y_offense_away
ewma_dynamic_window_offense_away
ewma_dynamic_window_x_defense_away
ewma_dynamic_window_y_defense_away
ewma_dynamic_window_x_defense_away
ewma_dynamic_window_y_defense_away
ewma_dynamic_window_defense_away


In [65]:
df = df.dropna()

X = df.loc[df['season'] != 2023, features].values
y = df.loc[df['season'] != 2023, target].values

clf = LogisticRegression()
clf.fit(X, y)

In [75]:
accuracy_scores = cross_val_score(clf, X, y, cv=10)
log_losses = cross_val_score(clf, X, y, cv=10, scoring='neg_log_loss')

print('Model Accuracy:', np.mean(accuracy_scores))

Model Accuracy: 0.5166666666666666


In [76]:
print('Neg log loss:', np.mean(log_losses))

Neg log loss: -0.6652042321073346


In [74]:
df_2020 = df.loc[(df['season'] == 2021)].assign(
    predicted_winner = lambda x: clf.predict(x[features]),
    home_team_win_probability = lambda x: clf.predict_proba(x[features])[:, 1]
)\
[['home_team', 'away_team', 'week', 'predicted_winner', 'home_team_win_probability', 'home_team_win']]

df_2020['actual_winner'] = df_2020.apply(lambda x: x.home_team if x.home_team_win else x.away_team, axis=1)
df_2020['predicted_winner'] = df_2020.apply(lambda x: x.home_team if x.predicted_winner == 1 else x.away_team, axis=1)
df_2020['win_probability'] = df_2020.apply(lambda x: x.home_team_win_probability if x.predicted_winner == x.home_team else 1 - x.home_team_win_probability, axis=1)
df_2020['correct_prediction'] = (df_2020['predicted_winner'] == df_2020['actual_winner']).astype(int)

df_2020 = df_2020.drop(columns=['home_team_win_probability', 'home_team_win'])

df_2020.sort_values(by='win_probability', ascending=False).reset_index(drop=True).head(10)



Unnamed: 0,home_team,away_team,week,predicted_winner,actual_winner,win_probability,correct_prediction
0,GB,LA,12,GB,GB,0.643699,1
1,LV,LAC,18,LV,LV,0.58465,1
2,CIN,LAC,13,CIN,LAC,0.56488,0
3,ATL,WAS,4,ATL,WAS,0.555662,0
4,SF,IND,7,SF,IND,0.55505,0
5,CAR,MIN,6,MIN,MIN,0.523035,1
6,DAL,ARI,17,ARI,ARI,0.507623,1
7,PIT,BAL,13,PIT,PIT,0.501544,1


In [69]:
correct = df_2020.loc[df_2020['correct_prediction'] == 1].groupby('week')['correct_prediction'].sum()

num_games = df_2020.groupby('week')['correct_prediction'].size()

results = correct / num_games

results

week
4     1.0
5     1.0
6     NaN
11    1.0
12    1.0
15    0.5
19    1.0
Name: correct_prediction, dtype: float64

In [70]:
print(df_2020.loc[df_2020['week'] == results.idxmax()].sort_values(by='win_probability', ascending=False))

   home_team away_team  week predicted_winner actual_winner  win_probability  \
42       CAR       ARI     4              ARI           ARI         0.586249   

    correct_prediction  
42                   1  


In [71]:
df_2020.loc[df_2020['week'] > 17]

Unnamed: 0,home_team,away_team,week,predicted_winner,actual_winner,win_probability,correct_prediction
52,SF,SEA,19,SF,SF,0.602343,1


In [72]:
import itertools

def ewma(data, window):
    """
    Calculate the most recent value for EWMA given an array of data and a window size
    """
    alpha = 2 / (window + 1.0)
    alpha_rev = 1 - alpha
    scale = 1 / alpha_rev
    n = data.shape[0]
    r = np.arange(n)
    scale_arr = scale**r
    offset = data[0] * alpha_rev**(r+1)
    pw0 = alpha * alpha_rev**(n-1)
    mult = data * pw0 * scale_arr
    cumsums = mult.cumsum()
    out = offset + cumsums * scale_arr[::-1]
    return out[-1]

data_2020 = data.loc[(data['season'] == 2020)]
offense = data_2020.loc[(data_2020['posteam'] == 'KC') | (data_2020['posteam'] == 'TB')]
defense = data_2020.loc[(data_2020['defteam'] == 'KC') | (data_2020['defteam'] == 'TB')]

rushing_offense = offense.loc[offense['rush_attempt'] == 1]\
.groupby(['posteam', 'week'], as_index=False)['epa'].mean().rename(columns={'posteam': 'team'})
passing_offense = offense.loc[offense['pass_attempt'] == 1]\
.groupby(['posteam', 'week'], as_index=False)['epa'].mean().rename(columns={'posteam': 'team'})
rushing_defense = defense.loc[defense['rush_attempt'] == 1]\
.groupby(['defteam', 'week'], as_index=False)['epa'].mean().rename(columns={'defteam': 'team'})
passing_defense = defense.loc[defense['pass_attempt'] == 1]\
.groupby(['defteam', 'week'], as_index=False)['epa'].mean().rename(columns={'defteam': 'team'})

super_bowl_X = np.zeros(8)

for i, (tm, stat_df) in enumerate(itertools.product(['KC', 'TB'], [rushing_offense, passing_offense, rushing_defense, passing_defense])):
    ewma_value = ewma(stat_df.loc[stat_df['team'] == tm]['epa'].values, 20)
    super_bowl_X[i] = ewma_value

predicted_winner = clf.predict(super_bowl_X.reshape(1, 36))[0]
predicted_proba = clf.predict_proba(super_bowl_X.reshape(1, 36))[0]

winner = 'KC' if predicted_winner else 'TB'
win_prob = predicted_proba[-1] if predicted_winner else predicted_proba[0]

print(f'Model predicts {winner} will win the Super Bowl and has a {round(win_prob*100, 2)}% win probability')

ValueError: X has 8 features, but LogisticRegression is expecting 36 features as input.