In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.ticker as plticker
import numpy as np

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

plt.style.use('seaborn-talk')
plt.style.use('ggplot')

pd.set_option('display.max_columns', 7)

  plt.style.use('seaborn-talk')


In [2]:
YEARS = range(2010,2023)

data = pd.DataFrame()

for i in YEARS:  
    i_data = pd.read_csv('https://github.com/nflverse/nflverse-data/releases/download/pbp/' \
                   'play_by_play_' + str(i) + '.csv.gz',
                   compression= 'gzip', low_memory= False)

    data = pd.concat([data, i_data], sort=True)

In [3]:
print(data.columns)

Index(['aborted_play', 'air_epa', 'air_wpa', 'air_yards', 'assist_tackle',
       'assist_tackle_1_player_id', 'assist_tackle_1_player_name',
       'assist_tackle_1_team', 'assist_tackle_2_player_id',
       'assist_tackle_2_player_name',
       ...
       'xyac_median_yardage', 'xyac_success', 'yac_epa', 'yac_wpa',
       'yardline_100', 'yards_after_catch', 'yards_gained', 'ydsnet',
       'ydstogo', 'yrdln'],
      dtype='object', length=372)


In [64]:
import functools
def dynamic_window_ewma(x):
    """
    Calculate rolling exponentially weighted EPA with a dynamic window size
    """
    values = np.zeros(len(x))
    for i, (_, row) in enumerate(x.iterrows()):
        epa = x.epa_shifted[:i+1]
        if row.week > 10:
            values[i] = epa.ewm(min_periods=1, span=row.week).mean().values[-1]
        else:
            values[i] = epa.ewm(min_periods=1, span=10).mean().values[-1]
            
    return pd.Series(values, index=x.index)

offense_columns_epas = {}
defense_columns_epas = {}

feats = [
    "rush_attempt",
    "pass_attempt",
    "extra_point_attempt",
    "two_point_attempt",
    "field_goal_attempt"
]

for column in feats:
    df = data.loc[data[column] == 1, :]\
    .groupby(['posteam', 'season', 'week'], as_index=False)['epa'].mean()

    if not df.empty:
        offense_columns_epas[column] = df

    df = data.loc[data[column] == 1, :]\
    .groupby(['defteam', 'season', 'week'], as_index=False)['epa'].mean()
    
    if not df.empty:
        defense_columns_epas[column] = df



In [80]:

#print(defense_columns_epas)
for column_epa in offense_columns_epas.keys():
    #print(column_epa)
    offense_columns_epas[column_epa]['epa_shifted'] = offense_columns_epas[column_epa].groupby('posteam')['epa'].shift()
    offense_columns_epas[column_epa]['ewma'] = offense_columns_epas[column_epa].groupby('posteam')['epa_shifted']\
    .transform(lambda x: x.ewm(min_periods=1, span=10).mean())
    offense_columns_epas[column_epa]['ewma_dynamic_window'] = offense_columns_epas[column_epa].groupby('posteam')\
    .apply(dynamic_window_ewma).values

for column_epa in defense_columns_epas.keys():
    #print(column_epa)
    defense_columns_epas[column_epa]['epa_shifted'] = defense_columns_epas[column_epa].groupby('defteam')['epa'].shift()
    defense_columns_epas[column_epa]['ewma'] = defense_columns_epas[column_epa].groupby('defteam')['epa_shifted']\
    .transform(lambda x: x.ewm(min_periods=1, span=10).mean())
    defense_columns_epas[column_epa]['ewma_dynamic_window'] = defense_columns_epas[column_epa].groupby('defteam')\
    .apply(dynamic_window_ewma).values


#print(offense_columns_epas)

offense_epa_stats = list(offense_columns_epas.values())
defense_epa_stats = list(defense_columns_epas.values())
suffixes = ['_'+suffix for suffix in list(offense_columns_epas.keys())]

offense_epa = offense_epa_stats[0]
defense_epa = defense_epa_stats[0]

for i in range(1,len(offense_epa_stats)):
    
    offense_epa = offense_epa.merge(offense_epa_stats[i], on=['posteam', 'season', 'week'], suffixes=(suffixes[i-1],suffixes[i]))
    defense_epa = defense_epa.merge(defense_epa_stats[i], on=['defteam', 'season', 'week'], suffixes=(suffixes[i-1],suffixes[i]))

offense_epa = offense_epa.rename(columns={'posteam':'team'})
defense_epa = defense_epa.rename(columns={'defteam':'team'})

print(offense_epa)

#offense_epa = functools.reduce(lambda  left,right: pd.merge(left,right,on=['posteam', 'season', 'week']), offense_epa_stats).rename(columns={'posteam':'team'})
#defense_epa = functools.reduce(lambda  left,right: pd.merge(left,right,on=['defteam', 'season', 'week']), defense_epa_stats).rename(columns={'defteam':'team'})


#Merge all the data together
'''offense_epa = rushing_offense_epa.merge(passing_offense_epa, on=['posteam', 'season', 'week'], suffixes=('_rushing', '_passing'))\
.rename(columns={'posteam': 'team'})
defense_epa = rushing_defense_epa.merge(passing_defense_epa, on=['defteam', 'season', 'week'], suffixes=('_rushing', '_passing'))\
.rename(columns={'defteam': 'team'})'''
epa = offense_epa.merge(defense_epa, on=['team', 'season', 'week'], suffixes=('_offense', '_defense'))

#remove the first season of data
epa = epa.loc[epa['season'] != epa['season'].unique()[0], :]

epa = epa.reset_index(drop=True)

epa.head()

    team  season  week  ...  epa_shifted      ewma  ewma_dynamic_window
0    ARI    2010    10  ...     0.064865  0.264402             0.264402
1    ARI    2011    16  ...     0.231351 -0.170829            -0.176848
2    ARI    2013     2  ...    -0.958428  0.218319             0.218319
3    ARI    2013     7  ...     1.108243  0.477642             0.477642
4    ARI    2014     2  ...     0.308515 -0.097081            -0.097081
..   ...     ...   ...  ...          ...       ...                  ...
767  WAS    2021     4  ...     0.914183  0.361677             0.361677
768  WAS    2021     5  ...     0.033577  0.302022             0.302022
769  WAS    2021    10  ...    -1.779663 -0.394075            -0.394075
770  WAS    2021    12  ...     0.231268 -0.143799            -0.118722
771  WAS    2021    18  ...     0.745096  0.217358             0.134104

[772 rows x 23 columns]


Unnamed: 0,team,season,week,...,epa_shifted_defense,ewma_defense,ewma_dynamic_window_defense
0,ARI,2021,17,...,-0.96388,0.131658,0.058044
1,ARI,2022,4,...,0.314635,0.165397,0.165397
2,ATL,2010,7,...,-2.241343,-1.415517,-1.415517
3,ATL,2015,5,...,-1.902007,-0.221148,-0.221148
4,ATL,2016,8,...,0.525372,0.371669,0.371669


In [81]:
schedule = data[['season', 'week', 'home_team', 'away_team', 'home_score', 'away_score']]\
.drop_duplicates().reset_index(drop=True)\
.assign(home_team_win = lambda x: (x.home_score > x.away_score).astype(int))

df = schedule.merge(epa.rename(columns={'team': 'home_team'}), on=['home_team', 'season', 'week'])\
.merge(epa.rename(columns={'team': 'away_team'}), on=['away_team', 'season', 'week'], suffixes=('_home', '_away'))

df.head()

Unnamed: 0,season,week,home_team,...,epa_shifted_defense_away,ewma_defense_away,ewma_dynamic_window_defense_away
0,2010,7,ATL,...,0.24456,0.016897,0.016897
1,2010,8,DET,...,-1.086833,-0.714112,-0.714112
2,2010,10,PIT,...,0.750993,-0.529411,-0.529411
3,2011,19,SF,...,0.609457,0.336604,0.219642
4,2012,1,DEN,...,0.256308,-0.221363,-0.221363


In [82]:
target = 'home_team_win'
print(df.columns)
features = [column for column in df.columns if 'ewma' in column and 'dynamic' in column]
for feature in features:
  print(feature)

Index(['season', 'week', 'home_team', 'away_team', 'home_score', 'away_score',
       'home_team_win', 'epa_rush_attempt_offense_home',
       'epa_shifted_rush_attempt_offense_home',
       'ewma_rush_attempt_offense_home',
       'ewma_dynamic_window_rush_attempt_offense_home',
       'epa_pass_attempt_offense_home',
       'epa_shifted_pass_attempt_offense_home',
       'ewma_pass_attempt_offense_home',
       'ewma_dynamic_window_pass_attempt_offense_home',
       'epa_extra_point_attempt_offense_home',
       'epa_shifted_extra_point_attempt_offense_home',
       'ewma_extra_point_attempt_offense_home',
       'ewma_dynamic_window_extra_point_attempt_offense_home',
       'epa_two_point_attempt_offense_home',
       'epa_shifted_two_point_attempt_offense_home',
       'ewma_two_point_attempt_offense_home',
       'ewma_dynamic_window_two_point_attempt_offense_home',
       'epa_offense_home', 'epa_shifted_offense_home', 'ewma_offense_home',
       'ewma_dynamic_window_offense_home

In [83]:
df = df.dropna()

X = df.loc[df['season'] != 2023, features].values
y = df.loc[df['season'] != 2023, target].values

print(df.loc[df['season'] != 2023])

clf = LogisticRegression()
clf.fit(X, y)

    season  week home_team  ... epa_shifted_defense_away  ewma_defense_away  \
3     2011    19        SF  ...                 0.609457           0.336604   
4     2012     1       DEN  ...                 0.256308          -0.221363   
5     2012     5       IND  ...                -0.911678          -0.225120   
6     2013    14        NE  ...                 0.467044           0.555306   
7     2015     5       BAL  ...                 0.871014           0.277530   
8     2015     5       ATL  ...                -2.797281          -0.323745   
9     2015    10       BAL  ...                 0.210393           0.357911   
10    2016     3        TB  ...                 0.047168           0.181137   
11    2016     7       MIA  ...                 0.939417           0.261813   
12    2016     8       ATL  ...                 0.540644           0.404419   
13    2016    10       TEN  ...                 0.114637           0.376762   
14    2016    14       PHI  ...                -1.51

In [84]:
accuracy_scores = cross_val_score(clf, X, y, cv=10)
log_losses = cross_val_score(clf, X, y, cv=10, scoring='neg_log_loss')

print('Model Accuracy:', np.mean(accuracy_scores))

Model Accuracy: 0.62


In [85]:
print('Neg log loss:', np.mean(log_losses))

Neg log loss: -0.6329380689939968


In [86]:
df_2020 = df.loc[(df['season'] == 2022)].assign(
    predicted_winner = lambda x: clf.predict(x[features]),
    home_team_win_probability = lambda x: clf.predict_proba(x[features])[:, 1]
)\
[['home_team', 'away_team', 'week', 'predicted_winner', 'home_team_win_probability', 'home_team_win']]

df_2020['actual_winner'] = df_2020.apply(lambda x: x.home_team if x.home_team_win else x.away_team, axis=1)
df_2020['predicted_winner'] = df_2020.apply(lambda x: x.home_team if x.predicted_winner == 1 else x.away_team, axis=1)
df_2020['win_probability'] = df_2020.apply(lambda x: x.home_team_win_probability if x.predicted_winner == x.home_team else 1 - x.home_team_win_probability, axis=1)
df_2020['correct_prediction'] = (df_2020['predicted_winner'] == df_2020['actual_winner']).astype(int)

df_2020 = df_2020.drop(columns=['home_team_win_probability', 'home_team_win'])

df_2020.sort_values(by='win_probability', ascending=False).reset_index(drop=True).head(10)



Unnamed: 0,home_team,away_team,week,predicted_winner,actual_winner,win_probability,correct_prediction
0,KC,LV,5,KC,KC,0.805575,1
1,MIN,CHI,5,MIN,MIN,0.719535,1
2,JAX,BAL,12,JAX,JAX,0.705429,1
3,SF,SEA,19,SF,SF,0.697367,1
4,BUF,CLE,11,BUF,BUF,0.69027,1
5,BUF,MIA,15,BUF,BUF,0.632519,1
6,TB,ATL,5,TB,TB,0.609699,1
7,CAR,ARI,4,CAR,ARI,0.565338,0
8,NO,SEA,5,SEA,NO,0.553041,0
9,MIA,MIN,6,MIN,MIN,0.543061,1


In [87]:
correct = df_2020.loc[df_2020['correct_prediction'] == 1].groupby('week')['correct_prediction'].sum()

num_games = df_2020.groupby('week')['correct_prediction'].size()

results = correct / num_games

results

week
4      NaN
5     0.75
6     1.00
11    1.00
12    1.00
15    0.50
19    1.00
Name: correct_prediction, dtype: float64

In [88]:
print(df_2020.loc[df_2020['week'] == results.idxmax()].sort_values(by='win_probability', ascending=False))

   home_team away_team  week predicted_winner actual_winner  win_probability  \
47       MIA       MIN     6              MIN           MIN         0.543061   

    correct_prediction  
47                   1  


In [89]:
df_2020.loc[df_2020['week'] > 17]

Unnamed: 0,home_team,away_team,week,predicted_winner,actual_winner,win_probability,correct_prediction
52,SF,SEA,19,SF,SF,0.697367,1


In [90]:
import itertools

def ewma(data, window):
    """
    Calculate the most recent value for EWMA given an array of data and a window size
    """
    alpha = 2 / (window + 1.0)
    alpha_rev = 1 - alpha
    scale = 1 / alpha_rev
    n = data.shape[0]
    r = np.arange(n)
    scale_arr = scale**r
    offset = data[0] * alpha_rev**(r+1)
    pw0 = alpha * alpha_rev**(n-1)
    mult = data * pw0 * scale_arr
    cumsums = mult.cumsum()
    out = offset + cumsums * scale_arr[::-1]
    return out[-1]

data_2020 = data.loc[(data['season'] == 2022)]
offense = data_2020.loc[(data_2020['posteam'] == 'KC') | (data_2020['posteam'] == 'TB')]
defense = data_2020.loc[(data_2020['defteam'] == 'KC') | (data_2020['defteam'] == 'TB')]

rushing_offense = offense.loc[offense['rush_attempt'] == 1]\
.groupby(['posteam', 'week'], as_index=False)['epa'].mean().rename(columns={'posteam': 'team'})
passing_offense = offense.loc[offense['pass_attempt'] == 1]\
.groupby(['posteam', 'week'], as_index=False)['epa'].mean().rename(columns={'posteam': 'team'})

extra_point_offense = passing_offense = offense.loc[offense['extra_point_attempt'] == 1]\
.groupby(['posteam', 'week'], as_index=False)['epa'].mean().rename(columns={'posteam': 'team'})
two_point_offense = passing_offense = offense.loc[offense['two_point_attempt'] == 1]\
.groupby(['posteam', 'week'], as_index=False)['epa'].mean().rename(columns={'posteam': 'team'})
field_goal_offense = passing_offense = offense.loc[offense['field_goal_attempt'] == 1]\
.groupby(['posteam', 'week'], as_index=False)['epa'].mean().rename(columns={'posteam': 'team'})

rushing_defense = defense.loc[defense['rush_attempt'] == 1]\
.groupby(['defteam', 'week'], as_index=False)['epa'].mean().rename(columns={'defteam': 'team'})
passing_defense = defense.loc[defense['pass_attempt'] == 1]\
.groupby(['defteam', 'week'], as_index=False)['epa'].mean().rename(columns={'defteam': 'team'})

extra_point_defense = defense.loc[defense['extra_point_attempt'] == 1]\
.groupby(['defteam', 'week'], as_index=False)['epa'].mean().rename(columns={'defteam': 'team'})
two_point_defense = defense.loc[defense['two_point_attempt'] == 1]\
.groupby(['defteam', 'week'], as_index=False)['epa'].mean().rename(columns={'defteam': 'team'})
field_goal_defense = defense.loc[defense['field_goal_attempt'] == 1]\
.groupby(['defteam', 'week'], as_index=False)['epa'].mean().rename(columns={'defteam': 'team'})



super_bowl_X = np.zeros(20)

for i, (tm, stat_df) in enumerate(itertools.product(['KC', 'TB'], [rushing_offense, passing_offense, extra_point_offense, two_point_offense, field_goal_offense ,rushing_defense, passing_defense, extra_point_defense, two_point_defense, field_goal_defense])):
    ewma_value = ewma(stat_df.loc[stat_df['team'] == tm]['epa'].values, 20)
    super_bowl_X[i] = ewma_value

predicted_winner = clf.predict(super_bowl_X.reshape(1, 20))[0]
predicted_proba = clf.predict_proba(super_bowl_X.reshape(1, 20))[0]

winner = 'KC' if predicted_winner else 'TB'
win_prob = predicted_proba[-1] if predicted_winner else predicted_proba[0]

print(f'Model predicts {winner} will win the Super Bowl and has a {round(win_prob*100, 2)}% win probability')

Model predicts TB will win the Super Bowl and has a 63.15% win probability
