In [1]:
# Import packages

import pandas as pd 
import numpy as np 
import os

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

import xgboost as xgb

from joblib import dump, load

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Load the game schedule and the aggregated adjusted-value stats.
# Rows of the stats table that contain any missing value are discarded up front.
# Alternative stats input kept for reference:
#   '../../data/adjusted_value_models_combined_6_game_rolling.csv'
schedule = pd.read_csv('../../data/schedule_final.csv')
stats = pd.read_csv('../../pipeline/data/07_adjusted_value_models_aggregated.csv').dropna()

In [3]:
# Attach the stats table to the schedule twice: once for the home side and once
# for the away side. Each pass suffixes every stats column with the side name and
# left-joins on season, week, team and quarterback, so every scheduled game is
# kept and games without a matching stats row get NaN in the stats columns.
nfl = schedule.copy()
for side in ('home', 'away'):
    nfl = nfl.merge(
        stats.add_suffix(f'_{side}'),
        how='left',
        left_on=['season', 'week', side, f'{side}_qb_abv'],
        right_on=[f'season_{side}', f'week_{side}', f'team_full_{side}', f'qb_{side}'],
    )

# Keep only the game descriptors plus the five value columns per side.
# (A previous column selection used qb_adjusted_value_* / qb_rushing_value_pct_* style names.)
nfl = nfl[[
    'date', 'season', 'week', 'season_type',
    'home', 'away', 'home_score', 'away_score', 'home_qb', 'away_qb',
    'passing_value_adjusted_home',
    'rushing_value_adjusted_home',
    'pass_def_value_adjusted_home',
    'rush_def_value_adjusted_home',
    'special_teams_value_home',
    'passing_value_adjusted_away',
    'rushing_value_adjusted_away',
    'pass_def_value_adjusted_away',
    'rush_def_value_adjusted_away',
    'special_teams_value_away',
]]
nfl.head()

Unnamed: 0,date,season,week,season_type,home,away,home_score,away_score,home_qb,away_qb,passing_value_adjusted_home,rushing_value_adjusted_home,pass_def_value_adjusted_home,rush_def_value_adjusted_home,special_teams_value_home,passing_value_adjusted_away,rushing_value_adjusted_away,pass_def_value_adjusted_away,rush_def_value_adjusted_away,special_teams_value_away
0,2014-09-04,2014,1,REG,Seattle Seahawks,Green Bay Packers,36,16,Russell Wilson,Aaron Rodgers,,,,,,,,,,
1,2014-09-07,2014,1,REG,Baltimore Ravens,Cincinnati Bengals,16,23,Joe Flacco,Andy Dalton,,,,,,,,,,
2,2014-09-07,2014,1,REG,Houston Texans,Washington Football Team,17,6,Ryan Fitzpatrick,Robert Griffin,,,,,,,,,,
3,2014-09-07,2014,1,REG,Chicago Bears,Buffalo Bills,20,23,Jay Cutler,EJ Manuel,,,,,,,,,,
4,2014-09-07,2014,1,REG,Pittsburgh Steelers,Cleveland Browns,30,27,Ben Roethlisberger,Brian Hoyer,,,,,,,,,,


In [4]:
# Remove any ties, add binary label for home win

# Filter first and then copy, so nfl_cleaned is an independent frame and the
# column assignment below cannot trigger pandas' SettingWithCopyWarning.
nfl_cleaned = nfl.loc[nfl['home_score'] != nfl['away_score']].copy()

# Vectorized label: 1 when the home team outscored the away team, else 0.
# This replaces a row-wise apply, which is much slower and misbehaves on an empty frame.
nfl_cleaned['home_win'] = (nfl_cleaned['home_score'] > nfl_cleaned['away_score']).astype('int64')
nfl_cleaned

Unnamed: 0,date,season,week,season_type,home,away,home_score,away_score,home_qb,away_qb,passing_value_adjusted_home,rushing_value_adjusted_home,pass_def_value_adjusted_home,rush_def_value_adjusted_home,special_teams_value_home,passing_value_adjusted_away,rushing_value_adjusted_away,pass_def_value_adjusted_away,rush_def_value_adjusted_away,special_teams_value_away,home_win
0,2014-09-04,2014,1,REG,Seattle Seahawks,Green Bay Packers,36,16,Russell Wilson,Aaron Rodgers,,,,,,,,,,,1
1,2014-09-07,2014,1,REG,Baltimore Ravens,Cincinnati Bengals,16,23,Joe Flacco,Andy Dalton,,,,,,,,,,,0
2,2014-09-07,2014,1,REG,Houston Texans,Washington Football Team,17,6,Ryan Fitzpatrick,Robert Griffin,,,,,,,,,,,1
3,2014-09-07,2014,1,REG,Chicago Bears,Buffalo Bills,20,23,Jay Cutler,EJ Manuel,,,,,,,,,,,0
4,2014-09-07,2014,1,REG,Pittsburgh Steelers,Cleveland Browns,30,27,Ben Roethlisberger,Brian Hoyer,,,,,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2433,2023-01-22,2022,20,POST,Buffalo Bills,Cincinnati Bengals,10,27,Josh Allen,Joe Burrow,-0.151187,0.650933,0.483813,0.589479,-0.215688,0.002339,-0.152032,-0.140611,1.049708,0.204819,0
2434,2023-01-22,2022,20,POST,San Francisco 49ers,Dallas Cowboys,19,12,Brock Purdy,Dak Prescott,1.271880,0.634522,0.303955,0.348498,0.331705,0.563132,-0.743069,-0.104332,0.297954,-0.500316,1
2435,2023-01-29,2022,21,POST,Philadelphia Eagles,San Francisco 49ers,31,7,Jalen Hurts,Brock Purdy,0.131127,1.616761,0.742787,-0.131826,0.329906,,,,,,1
2436,2023-01-29,2022,21,POST,Kansas City Chiefs,Cincinnati Bengals,23,20,Patrick Mahomes,Joe Burrow,0.474375,0.512161,0.781146,-0.771093,-0.504759,0.363199,0.164351,-0.080416,0.555210,0.215284,1


In [5]:
# Persist the labelled modeling table; the row index is written as the first (unnamed) CSV column.
nfl_cleaned.to_csv(path_or_buf='../../data/adjusted_stats_modeling_base_df.csv', index=True)

In [6]:
# Split data into train, validation, test sets
#
# The split is by season: everything up to and including 2020 trains the model,
# 2021 is the validation set and 2022 is the test set. Rows with any missing
# value are dropped from each split.

train_df = nfl_cleaned.loc[nfl_cleaned['season'] <= 2020].dropna()
val_df = nfl_cleaned.loc[nfl_cleaned['season'] == 2021].dropna()
test_df = nfl_cleaned.loc[nfl_cleaned['season'] == 2022].dropna()

In [7]:
# Balance training data set

# Note: Can't balance data set and then do Cross Validation due to data leakage

# Fully balancing the data set doesn't work as well for prediction as the unbalanced, likely because the
# home team does actually have an important advantage. But, maybe balancing the data set slightly will help

# Count each class with a vectorized sum instead of iterating the Series in Python.
n_home_wins = int((train_df['home_win'] == 1).sum())
n_home_losses = int((train_df['home_win'] == 0).sum())

# Oversample home losses by gap / 1.5, i.e. close two thirds of the class gap.
# Clamp at zero so a training set with more home losses than home wins does not
# pass a negative n to DataFrame.sample (which raises ValueError).
balance_n = max(int(np.round((n_home_wins - n_home_losses) / 1.5)), 0)
to_add = train_df[train_df['home_win'] == 0].sample(n=balance_n, random_state=57, replace=True)

# The resampled rows keep their original index labels, so the combined frame has duplicate labels.
train_df_balanced = pd.concat([train_df, to_add])
train_df_balanced

Unnamed: 0,date,season,week,season_type,home,away,home_score,away_score,home_qb,away_qb,passing_value_adjusted_home,rushing_value_adjusted_home,pass_def_value_adjusted_home,rush_def_value_adjusted_home,special_teams_value_home,passing_value_adjusted_away,rushing_value_adjusted_away,pass_def_value_adjusted_away,rush_def_value_adjusted_away,special_teams_value_away,home_win
33,2014-09-21,2014,3,REG,New England Patriots,Las Vegas Raiders,16,9,Tom Brady,Derek Carr,0.387129,-0.120190,1.178166,0.163060,1.176841,-0.702046,0.774141,-0.954522,-0.448204,-0.351268,1
36,2014-09-21,2014,3,REG,Cleveland Browns,Baltimore Ravens,21,23,Brian Hoyer,Joe Flacco,-0.137203,0.141640,0.192438,-0.869159,-0.162922,0.388264,0.048372,0.786102,-0.208942,0.137405,0
37,2014-09-21,2014,3,REG,Carolina Panthers,Pittsburgh Steelers,19,37,Cam Newton,Ben Roethlisberger,0.604152,-0.878958,0.309284,0.344995,1.301579,-0.782914,0.355480,-0.482694,-0.196212,0.017013,0
38,2014-09-21,2014,3,REG,Arizona Cardinals,San Francisco 49ers,23,14,Drew Stanton,Colin Kaepernick,-0.857878,0.359969,-0.160789,0.607663,1.173452,-0.287031,-0.471071,-0.239934,0.452426,0.402579,1
39,2014-09-21,2014,3,REG,Buffalo Bills,Los Angeles Chargers,10,22,EJ Manuel,Philip Rivers,0.531544,-0.912222,0.256764,0.606452,1.562575,0.804905,-0.212663,-0.100618,-0.538650,1.051236,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
412,2015-11-22,2015,11,REG,Atlanta Falcons,Indianapolis Colts,21,24,Matt Ryan,Matt Hasselbeck,0.026796,-1.132059,0.050628,0.340127,0.375407,0.628201,-1.181470,-0.071411,0.349349,-0.163109,0
83,2014-10-12,2014,6,REG,Tampa Bay Buccaneers,Baltimore Ravens,17,48,Mike Glennon,Joe Flacco,0.333736,-0.929897,-0.754198,0.505290,0.016402,0.142389,0.138675,-0.461865,1.188170,0.501179,0
48,2014-09-25,2014,4,REG,Washington Football Team,New York Giants,14,45,Kirk Cousins,Eli Manning,1.117020,0.304661,0.232412,0.919063,-1.316196,0.089035,-0.247808,0.339505,0.162678,-0.322892,0
1814,2020-12-20,2020,15,REG,Minnesota Vikings,Chicago Bears,27,33,Kirk Cousins,Mitch Trubisky,0.037541,0.141796,-0.279284,-0.237743,-0.958314,0.194658,0.977991,-0.271941,0.131815,0.634002,0


In [8]:
# Model inputs: five value columns for the home side followed by the same five for the away side.
feature_list = [
    'passing_value_adjusted_home',
    'rushing_value_adjusted_home',
    'pass_def_value_adjusted_home',
    'rush_def_value_adjusted_home',
    'special_teams_value_home',
    'passing_value_adjusted_away',
    'rushing_value_adjusted_away',
    'pass_def_value_adjusted_away',
    'rush_def_value_adjusted_away',
    'special_teams_value_away',
]

In [9]:
# Get features, labels for train, val, and test sets
#
# Every split is converted to NumPy arrays. The training features come from the
# partially balanced frame; validation and test come from the untouched splits.
# test_x is converted like the others so that estimators fit on train_x (an array
# without column names) receive the same input type at prediction time.
# NOTE(review): if a later cell relies on test_x being a DataFrame, index
# test_df[feature_list] there instead.

train_x = train_df_balanced[feature_list].to_numpy()
train_y = train_df_balanced['home_win'].to_numpy()

val_x = val_df[feature_list].to_numpy()
val_y = val_df['home_win'].to_numpy()

test_x = test_df[feature_list].to_numpy()
test_y = test_df['home_win'].to_numpy()

In [16]:
# Logistic Regression Modeling
#
# Baseline: a LogisticRegression with default settings, fit on the training
# arrays and scored by accuracy on the training and validation sets.

model = LogisticRegression()
model.fit(train_x, train_y)

# Hard class predictions and class probabilities for both splits.
train_preds, train_probs = model.predict(train_x), model.predict_proba(train_x)
val_preds, val_probs = model.predict(val_x), model.predict_proba(val_x)

print(f'Training Accuracy: {np.round(accuracy_score(train_preds, train_y), 3)}')
print(f'Validation Accuracy: {np.round(accuracy_score(val_preds, val_y), 3)}')

Training Accuracy: 0.622
Validation Accuracy: 0.591


In [54]:
# Logistic Regression, with hyperparameter tuning

def logistic_regression_tuning(train_x, train_y, verbose=True):
    """Try a grid of LogisticRegression settings and return the best combination.

    Each compatible (penalty, C, solver) combination is fit on the training data
    and scored by accuracy on the validation data. The combination with the
    highest validation accuracy is selected; on ties the combination tried first
    (iterating penalty, then C, then solver) is kept.

    The validation data is not a parameter: it is read from the notebook-level
    variables ``val_x`` and ``val_y``, so the cell defining them must have run.

    Parameters
    ----------
    train_x
        Feature matrix passed to ``LogisticRegression.fit`` and ``predict``.
    train_y
        Labels matching ``train_x``.
    verbose : bool, default True
        When true, print the train/validation accuracy of every candidate.

    Returns
    -------
    list or None
        ``[penalty, C, solver]`` of the selected candidate, or ``None`` if no
        candidate was evaluated.
    """

    # Hyperparameters to tune
    # NOTE(review): scikit-learn deprecated the string 'none' in favor of None in
    # newer releases -- confirm against the installed version before upgrading.
    penalty_list = ['none', 'l2', 'l1']
    c_values = [100, 10, 1.0, 0.1, 0.01]
    solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

    # Penalties each solver cannot be combined with; solvers not listed accept all of them.
    unsupported_penalties = {
        'newton-cg': ('l1', 'elasticnet'),
        'lbfgs': ('l1', 'elasticnet'),
        'liblinear': ('none', 'elasticnet'),
        'sag': ('l1', 'elasticnet'),
    }

    # Start below any reachable accuracy so the first evaluated candidate is always
    # recorded; best_params is therefore bound even when every candidate scores 0.
    best_val_acc = float('-inf')
    train_acc_chosen_model = 0
    best_params = None

    for penalty in penalty_list:
        for c in c_values:
            for solver in solvers:

                if penalty in unsupported_penalties.get(solver, ()):
                    continue

                lr_model = LogisticRegression(penalty=penalty, solver=solver, C=c).fit(train_x, train_y)
                train_acc = accuracy_score(lr_model.predict(train_x), train_y)
                val_acc = accuracy_score(lr_model.predict(val_x), val_y)

                if verbose:
                    print('Train Accuracy: {}, Validation Accuracy:{}'.format(train_acc, val_acc))

                # Strict comparison: an equal score does not displace the earlier candidate.
                if val_acc > best_val_acc:
                    best_val_acc = val_acc
                    train_acc_chosen_model = train_acc
                    best_params = [penalty, c, solver]

    print('Best Model:')
    print('Training Accuracy: {}'.format(train_acc_chosen_model))
    print('Validation Accuracy: {}'.format(best_val_acc))
    print('Best Parameters: {}'.format(best_params))

    return best_params

In [55]:
# Run the grid search quietly; only the summary of the selected model is printed.
best_params_lr = logistic_regression_tuning(train_x=train_x, train_y=train_y, verbose=False)



Best Model:
Training Accuracy: 0.6155619596541787
Validation Accuracy: 0.5991902834008097
Best Parameters: ['l1', 0.1, 'liblinear']


In [56]:
# Save best logistic regression model

# best_params_lr is the [penalty, C, solver] list returned by the tuning function.
best_penalty, best_c, best_solver = best_params_lr

# Refit with the selected settings on the training arrays.
best_lr_model = LogisticRegression(penalty=best_penalty, C=best_c, solver=best_solver).fit(train_x, train_y)

# dump() does not create missing parent folders, so make sure the target directory
# exists; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs('saved_models', exist_ok=True)
dump(best_lr_model, 'saved_models/logistic_regression_av.joblib')

['saved_models/logistic_regression_av.joblib']