In [1]:
import os
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import LeaveOneGroupOut
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [2]:
# Season to predict: rows with this season_n are split off from the training data below.
CURRENT_YEAR = 2022

In [3]:
# Load the per-team season table from the current working directory; the
# CSV's first column becomes the row index.
nfl_season_df = pd.read_csv(
    os.path.join(os.getcwd(), 'nfl_season_data.csv'),
    index_col=0,
)
nfl_season_df

Unnamed: 0,season_n,team_name,season_n-1_epa,season_n-1_opposing_win_pct,season_n_opposing_exp_win_pct,season_n_wins
0,2010,ARI,0.232340,0.445312,0.482422,5.0
1,2010,ATL,0.320527,0.496094,0.488281,13.0
2,2010,BAL,1.245432,0.507812,0.488281,12.0
3,2010,BUF,-0.587669,0.500000,0.517578,4.0
4,2010,CAR,0.069489,0.476562,0.503906,2.0
...,...,...,...,...,...,...
404,2022,SEA,0.231417,0.517301,0.513841,
405,2022,SF,0.709444,0.532872,0.496540,
406,2022,TB,1.710975,0.534602,0.449827,
407,2022,TEN,0.503382,0.470588,0.506920,


In [43]:
# Rows for CURRENT_YEAR only, copied and re-indexed from 0 so they can later
# be lined up positionally with the prediction array.
current_season_df = (
    nfl_season_df
    .loc[nfl_season_df['season_n'] == CURRENT_YEAR]
    .copy()
    .reset_index(drop=True)
)

In [44]:
# All rows from seasons other than CURRENT_YEAR; this is the frame the model
# is cross-validated and fit on.
season_df = nfl_season_df.loc[nfl_season_df['season_n'].ne(CURRENT_YEAR)]

In [45]:
def makeXy(season_df):
    """Split a season table into model features and the wins target.

    Parameters
    ----------
    season_df : pandas.DataFrame
        Must contain the four feature columns selected below plus
        ``season_n_wins``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``X`` holds the feature columns in a fixed order; ``y`` is a
        one-column frame (not a Series) holding ``season_n_wins``.
    """
    feature_cols = [
        'season_n',
        'season_n-1_epa',
        'season_n-1_opposing_win_pct',
        'season_n_opposing_exp_win_pct',
    ]
    target_cols = ['season_n_wins']
    return season_df[feature_cols], season_df[target_cols]

In [46]:
# The season column doubles as the cross-validation group label; X and y are
# the training features and target taken from the same frame.
groups = season_df['season_n']
X, y = makeXy(season_df)

In [47]:
# Leave-one-group-out splitter; the call below reports how many splits the
# current groups produce.
logo = LeaveOneGroupOut()
logo.get_n_splits(X=X, y=y, groups=groups)

12

In [48]:
# Print the splitter's repr as a quick check of how it is configured.
print(logo)

LeaveOneGroupOut()


In [49]:
# Depth-limited random forest with a fixed random_state.
regr = RandomForestRegressor(random_state=42, max_depth=3)

In [50]:
# Score regr with the leave-one-group-out splitter using negative RMSE.
# y is flattened to 1-D because makeXy returns it as a one-column frame.
cross_val_score(
    regr,
    X,
    np.ravel(y),
    groups=groups,
    scoring='neg_root_mean_squared_error',
    cv=logo,
)

array([-2.99980707, -2.75795282, -2.98747886, -3.00987722, -2.67594841,
       -3.00495491, -3.08340602, -2.9227122 , -2.49766768, -2.73669043,
       -3.05794404, -2.40056871])

In [51]:
# Fit the forest on the X and y currently bound (the training seasons at this
# point in the notebook); y is flattened to 1-D first.
regr.fit(X=X, y=np.ravel(y))

In [52]:
# NOTE(review): this rebinds X and y, which until now held the training
# features/target built from season_df. Re-running the cross-validation or
# fit cells after this one would silently use the current-season rows.
# y for the current season looks all-NaN in the displayed data (wins not yet
# recorded) and is not used in the cells that follow — confirm.
X, y = makeXy(current_season_df)

In [53]:
# Inspect the current-season feature frame that will be passed to predict.
X

Unnamed: 0,season_n,season_n-1_epa,season_n-1_opposing_win_pct,season_n_opposing_exp_win_pct
0,2022,0.689629,0.543253,0.50692
1,2022,-1.454943,0.524221,0.474048
2,2022,-0.117606,0.474048,0.423875
3,2022,2.164865,0.512111,0.463668
4,2022,-0.979561,0.512111,0.467128
5,2022,-0.963155,0.470588,0.475779
6,2022,0.890307,0.536332,0.444637
7,2022,-0.316385,0.49481,0.482699
8,2022,1.632073,0.461938,0.486159
9,2022,0.10666,0.508651,0.5


In [54]:
# Predict wins for each current-season row; X is the current-season feature
# frame at this point, not the training features.
predict_wins = regr.predict(X)

In [55]:
# Wrap the raw prediction array in a Series to view it with a positional index.
pd.Series(predict_wins)

0      9.237722
1      7.364873
2      8.645025
3     11.114890
4      7.499532
5      7.706402
6      9.504673
7      7.675924
8     11.050577
9      8.782864
10     7.599265
11     9.354028
12     7.577182
13     9.437521
14     6.909578
15     8.824848
16     9.259933
17     8.759587
18     6.232090
19     7.954378
20     8.713760
21    10.496387
22     9.144526
23     7.866512
24     7.353236
25     9.178561
26     7.177859
27     8.625381
28     9.343408
29    11.107127
30     9.115438
31     8.021411
dtype: float64

In [65]:
# Attach the predictions as a new column (row i of the frame gets prediction i)
# and list teams from fewest to most predicted wins.
current_season_df.assign(predicted_wins=predict_wins).sort_values('predicted_wins')

Unnamed: 0,season_n,team_name,season_n-1_epa,season_n-1_opposing_win_pct,season_n_opposing_exp_win_pct,season_n_wins,predicted_wins
18,2022,LV,-0.700951,0.527682,0.527682,,6.23209
14,2022,JAX,-2.085084,0.468858,0.49308,,6.909578
26,2022,PIT,-0.6324,0.512111,0.444637,,7.177859
24,2022,NYJ,-2.027713,0.49481,0.477509,,7.353236
1,2022,ATL,-1.454943,0.524221,0.474048,,7.364873
4,2022,CAR,-0.979561,0.512111,0.467128,,7.499532
12,2022,HOU,-1.73077,0.487889,0.484429,,7.577182
10,2022,DET,-1.448667,0.467128,0.472318,,7.599265
7,2022,CLE,-0.316385,0.49481,0.482699,,7.675924
5,2022,CHI,-0.963155,0.470588,0.475779,,7.706402
