In [281]:
# Import packages

import pandas as pd 
import numpy as np 
import os

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

import xgboost as xgb

from joblib import dump, load

# Show every column when a DataFrame is displayed (None removes the column limit)
pd.set_option('display.max_columns', None)

In [282]:
# Adjusted-value stats: rows containing any missing value are discarded up front
stats = pd.read_csv('../../pipeline/data/07_adjusted_value_models_aggregated.csv').dropna()

# Game schedule, read as-is
schedule = pd.read_csv('../../data/schedule_final.csv')

# Betting lines/returns: remove the 'Unnamed: 0' column present in the CSV
betting = pd.read_csv('../../data/betting_data_cleaned_with_returns.csv').drop(columns=['Unnamed: 0'])

In [283]:
# Left-join the stats onto the schedule twice (once suffixed '_home', once '_away'),
# matching on season, week, team name and quarterback, then keep the modeling columns.
nfl = (
    schedule.copy()
    .merge(
        stats.add_suffix('_home'),
        how='left',
        left_on=['season', 'week', 'home', 'home_qb_abv'],
        right_on=['season_home', 'week_home', 'team_full_home', 'qb_home'],
    )
    .merge(
        stats.add_suffix('_away'),
        how='left',
        left_on=['season', 'week', 'away', 'away_qb_abv'],
        right_on=['season_away', 'week_away', 'team_full_away', 'qb_away'],
    )
    .loc[:, [
        # game identifiers and results
        'date', 'season', 'week', 'season_type', 'home', 'away',
        'home_score', 'away_score', 'home_qb', 'away_qb',
        # home-side value columns
        'passing_value_adjusted_home', 'rushing_value_adjusted_home',
        'pass_def_value_adjusted_home', 'rush_def_value_adjusted_home',
        'special_teams_value_home',
        # away-side value columns
        'passing_value_adjusted_away', 'rushing_value_adjusted_away',
        'pass_def_value_adjusted_away', 'rush_def_value_adjusted_away',
        'special_teams_value_away',
        # standardized possession time / plays / pass percentage
        'total_possession_time_standardized_home', 'total_possession_time_standardized_away',
        'total_plays_standardized_home', 'total_plays_standardized_away',
        'pass_percentage_standardized_home', 'pass_percentage_standardized_away',
    ]]
)
nfl.head()

Unnamed: 0,date,season,week,season_type,home,away,home_score,away_score,home_qb,away_qb,passing_value_adjusted_home,rushing_value_adjusted_home,pass_def_value_adjusted_home,rush_def_value_adjusted_home,special_teams_value_home,passing_value_adjusted_away,rushing_value_adjusted_away,pass_def_value_adjusted_away,rush_def_value_adjusted_away,special_teams_value_away,total_possession_time_standardized_home,total_possession_time_standardized_away,total_plays_standardized_home,total_plays_standardized_away,pass_percentage_standardized_home,pass_percentage_standardized_away
0,2014-09-04,2014,1,REG,Seattle Seahawks,Green Bay Packers,36,16,Russell Wilson,Aaron Rodgers,,,,,,,,,,,,,,,,
1,2014-09-07,2014,1,REG,Baltimore Ravens,Cincinnati Bengals,16,23,Joe Flacco,Andy Dalton,,,,,,,,,,,,,,,,
2,2014-09-07,2014,1,REG,Houston Texans,Washington Football Team,17,6,Ryan Fitzpatrick,Robert Griffin,,,,,,,,,,,,,,,,
3,2014-09-07,2014,1,REG,Chicago Bears,Buffalo Bills,20,23,Jay Cutler,EJ Manuel,,,,,,,,,,,,,,,,
4,2014-09-07,2014,1,REG,Pittsburgh Steelers,Cleveland Browns,30,27,Ben Roethlisberger,Brian Hoyer,,,,,,,,,,,,,,,,


In [284]:
# Bring in the betting rows (point spreads etc.) for each game.
# Default (inner) join: only games found in both frames are kept.

nfl_cleaned_with_betting = pd.merge(
    nfl,
    betting,
    left_on=['date', 'home', 'away'],
    right_on=['date', 'home_team', 'away_team'],
)

In [285]:
# Remove any pushes, add binary label for home cover

# Keep every game whose spread result is not a push; .copy() gives an independent
# frame so the new column can be assigned without touching the merged frame.
nfl_cleaned_with_betting_final = nfl_cleaned_with_betting.loc[
    nfl_cleaned_with_betting.spread_cover_result != 'push'
].copy()

# Vectorised label: 1 when the home side covered, 0 otherwise (missing results
# compare unequal to 'home' and therefore become 0, as with the row-wise version).
nfl_cleaned_with_betting_final['home_cover'] = (
    nfl_cleaned_with_betting_final.spread_cover_result == 'home'
).astype('int64')

nfl_cleaned_with_betting_final

Unnamed: 0,date,season,week,season_type,home,away,home_score_x,away_score_x,home_qb,away_qb,passing_value_adjusted_home,rushing_value_adjusted_home,pass_def_value_adjusted_home,rush_def_value_adjusted_home,special_teams_value_home,passing_value_adjusted_away,rushing_value_adjusted_away,pass_def_value_adjusted_away,rush_def_value_adjusted_away,special_teams_value_away,total_possession_time_standardized_home,total_possession_time_standardized_away,total_plays_standardized_home,total_plays_standardized_away,pass_percentage_standardized_home,pass_percentage_standardized_away,home_team,away_team,home_score_y,away_score_y,total_score_actual,home_moneyline,away_moneyline,home_spread,away_spread,total_score_line,over_under_result,home_team_actual_line,away_team_actual_line,spread_cover_result,home_implied_prob,away_implied_prob,game_winner,favorite_return,underdog_return,home_return,away_return,favorite_spread_return,underdog_spread_return,home_spread_return,away_spread_return,over_return,under_return,home_cover
0,2014-09-04,2014,1,REG,Seattle Seahawks,Green Bay Packers,36,16,Russell Wilson,Aaron Rodgers,,,,,,,,,,,,,,,,,Seattle Seahawks,Green Bay Packers,36,16,52,-230.0,205.0,-4.5,4.5,46.5,over,-20,20,home,0.696970,0.327869,home,43.48,-100.0,43.48,-100.0,90.91,-100.00,90.91,-100.00,90.91,-100.00,1
1,2014-09-07,2014,1,REG,Baltimore Ravens,Cincinnati Bengals,16,23,Joe Flacco,Andy Dalton,,,,,,,,,,,,,,,,,Baltimore Ravens,Cincinnati Bengals,16,23,39,-113.0,102.0,-1.0,1.0,43.5,under,7,-7,away,0.530516,0.495050,away,-100.00,102.0,-100.00,102.0,-100.00,90.91,-100.00,90.91,-100.00,90.91,0
2,2014-09-07,2014,1,REG,Houston Texans,Washington Football Team,17,6,Ryan Fitzpatrick,Robert Griffin,,,,,,,,,,,,,,,,,Houston Texans,Washington Football Team,17,6,23,-177.0,159.0,-3.0,3.0,43.5,under,-11,11,home,0.638989,0.386100,home,56.50,-100.0,56.50,-100.0,90.91,-100.00,90.91,-100.00,-100.00,90.91,1
3,2014-09-07,2014,1,REG,Chicago Bears,Buffalo Bills,20,23,Jay Cutler,EJ Manuel,,,,,,,,,,,,,,,,,Chicago Bears,Buffalo Bills,20,23,43,-270.0,239.0,-7.0,7.0,47.0,under,3,-3,away,0.729730,0.294985,away,-100.00,239.0,-100.00,239.0,-100.00,90.91,-100.00,90.91,-100.00,90.91,0
4,2014-09-07,2014,1,REG,Pittsburgh Steelers,Cleveland Browns,30,27,Ben Roethlisberger,Brian Hoyer,,,,,,,,,,,,,,,,,Pittsburgh Steelers,Cleveland Browns,30,27,57,-242.0,216.0,-5.5,5.5,41.5,over,-3,3,away,0.707602,0.316456,home,41.32,-100.0,41.32,-100.0,-100.00,90.91,-100.00,90.91,90.91,-100.00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2402,2023-01-22,2022,20,POST,Buffalo Bills,Cincinnati Bengals,10,27,Josh Allen,Joe Burrow,-0.151187,0.650933,0.483813,0.589479,-0.215688,0.002339,-0.152032,-0.140611,1.049708,0.204819,-0.153360,0.421317,-0.011858,0.274171,-0.007230,0.889427,Buffalo Bills,Cincinnati Bengals,10,27,37,-260.0,215.0,-6.0,6.0,48.5,under,17,-17,away,0.722222,0.317460,away,-100.00,215.0,-100.00,215.0,-100.00,90.91,-100.00,90.91,-100.00,90.91,0
2403,2023-01-22,2022,20,POST,San Francisco 49ers,Dallas Cowboys,19,12,Brock Purdy,Dak Prescott,1.271880,0.634522,0.303955,0.348498,0.331705,0.563132,-0.743069,-0.104332,0.297954,-0.500316,0.342732,0.301391,-0.247412,0.812579,-0.969892,-0.382391,San Francisco 49ers,Dallas Cowboys,19,12,31,-200.0,170.0,-3.5,3.5,46.5,under,-7,7,home,0.666667,0.370370,home,50.00,-100.0,50.00,-100.0,90.91,-100.00,90.91,-100.00,-100.00,90.91,1
2404,2023-01-29,2022,21,POST,Philadelphia Eagles,San Francisco 49ers,31,7,Jalen Hurts,Brock Purdy,0.131127,1.616761,0.742787,-0.131826,0.329906,,,,,,0.560610,,0.672368,,-0.941733,,Philadelphia Eagles,San Francisco 49ers,31,7,38,-155.0,135.0,-3.0,3.0,45.0,under,-24,24,home,0.607843,0.425532,home,64.52,-100.0,64.52,-100.0,90.91,-100.00,90.91,-100.00,-100.00,90.91,1
2405,2023-01-29,2022,21,POST,Kansas City Chiefs,Cincinnati Bengals,23,20,Patrick Mahomes,Joe Burrow,0.474375,0.512161,0.781146,-0.771093,-0.504759,0.363199,0.164351,-0.080416,0.555210,0.215284,-0.141815,0.625788,-0.449315,0.520941,-0.018159,0.405253,Kansas City Chiefs,Cincinnati Bengals,23,20,43,-130.0,110.0,-2.0,2.0,48.5,under,-3,3,home,0.565217,0.476190,home,76.92,-100.0,76.92,-100.0,90.91,-100.00,90.91,-100.00,-100.00,90.91,1


In [286]:
# Persist the labelled modeling frame. No index argument is passed, so pandas also
# writes the row index as an unnamed leading column of the CSV.
nfl_cleaned_with_betting_final.to_csv('../../data/adjusted_stats_point_spread_modeling_base_df.csv')

In [287]:
# Season-based split: <= 2020 for training, 2021 for validation, 2022 for testing.
# dropna() discards every row that has a missing value in any column.

train_df = nfl_cleaned_with_betting_final.loc[nfl_cleaned_with_betting_final.season <= 2020].dropna()

val_df = nfl_cleaned_with_betting_final.loc[nfl_cleaned_with_betting_final.season == 2021].dropna()

test_df = nfl_cleaned_with_betting_final.loc[nfl_cleaned_with_betting_final.season == 2022].dropna()

In [288]:
# Feature Selection

# Columns included in the correlation matrix: candidate features, the betting
# lines, and the home_cover label itself.
feature_selection_list = [
    'passing_value_adjusted_home',
    'rushing_value_adjusted_home',
    'pass_def_value_adjusted_home',
    'rush_def_value_adjusted_home',
    'special_teams_value_home',
    'passing_value_adjusted_away',
    'rushing_value_adjusted_away',
    'pass_def_value_adjusted_away',
    'rush_def_value_adjusted_away',
    'special_teams_value_away',
    'total_possession_time_standardized_home',
    'total_possession_time_standardized_away',
    'total_plays_standardized_home',
    'total_plays_standardized_away',
    'pass_percentage_standardized_home',
    'pass_percentage_standardized_away',
    'home_spread',
    'home_cover',
    'total_score_line',
]
train_df[feature_selection_list].corr()

Unnamed: 0,passing_value_adjusted_home,rushing_value_adjusted_home,pass_def_value_adjusted_home,rush_def_value_adjusted_home,special_teams_value_home,passing_value_adjusted_away,rushing_value_adjusted_away,pass_def_value_adjusted_away,rush_def_value_adjusted_away,special_teams_value_away,total_possession_time_standardized_home,total_possession_time_standardized_away,total_plays_standardized_home,total_plays_standardized_away,pass_percentage_standardized_home,pass_percentage_standardized_away,home_spread,home_cover,total_score_line
passing_value_adjusted_home,1.0,0.156529,0.073299,-0.078702,0.042031,0.01955,0.035015,0.031423,-0.034674,-0.004152,0.38549,0.003576,0.208186,0.038311,-0.197499,-0.005827,-0.375334,-0.036804,0.381442
rushing_value_adjusted_home,0.156529,1.0,0.041304,-0.079103,-0.001642,-0.002672,-0.034786,0.031916,0.006557,0.047439,0.177246,0.020678,0.068509,0.007773,-0.361668,-0.006779,-0.232185,0.033367,0.165522
pass_def_value_adjusted_home,0.073299,0.041304,1.0,0.087944,-0.014403,0.011351,0.011494,0.015483,0.001829,-0.016879,0.290015,-0.014612,0.147254,-0.04908,-0.343322,-0.016207,-0.2932,-0.015945,-0.146752
rush_def_value_adjusted_home,-0.078702,-0.079103,0.087944,1.0,-0.074807,-0.044363,-0.014961,0.083184,-0.018699,0.034611,0.097775,-0.003189,0.102699,0.000932,-0.052722,-0.01453,-0.070792,0.014147,-0.172015
special_teams_value_home,0.042031,-0.001642,-0.014403,-0.074807,1.0,0.039844,-0.0056,0.013268,0.014919,-4.9e-05,-0.0493,0.044632,-0.077613,0.020831,-0.084015,-0.018712,-0.033344,0.032572,0.070615
passing_value_adjusted_away,0.01955,-0.002672,0.011351,-0.044363,0.039844,1.0,0.124444,0.056451,-0.079233,0.041923,-0.015621,0.321683,-0.004109,0.155311,-0.003648,-0.20985,0.369219,0.016394,0.343823
rushing_value_adjusted_away,0.035015,-0.034786,0.011494,-0.014961,-0.0056,0.124444,1.0,-0.028091,-0.089545,-0.012108,0.036304,0.163807,0.021656,0.074214,0.000732,-0.31206,0.190494,-0.02149,0.16177
pass_def_value_adjusted_away,0.031423,0.031916,0.015483,0.083184,0.013268,0.056451,-0.028091,1.0,0.086563,0.025933,0.032008,0.299395,-0.00955,0.145638,-0.040561,-0.335235,0.236363,0.044345,-0.17414
rush_def_value_adjusted_away,-0.034674,0.006557,0.001829,-0.018699,0.014919,-0.079233,-0.089545,0.086563,1.0,-0.069973,-0.008624,0.10299,-0.00082,0.055615,-0.005572,-0.07347,0.067214,0.042288,-0.166029
special_teams_value_away,-0.004152,0.047439,-0.016879,0.034611,-4.9e-05,0.041923,-0.012108,0.025933,-0.069973,1.0,-0.004706,0.006248,0.013259,-0.060717,0.005172,-0.107985,0.083613,-0.003738,0.039592


In [290]:
# Model inputs: the five value columns for each side plus the home spread.
# The standardized possession-time, total-plays and pass-percentage columns
# (home and away) are deliberately left out of this list.
feature_list = [
    'passing_value_adjusted_home',
    'rushing_value_adjusted_home',
    'pass_def_value_adjusted_home',
    'rush_def_value_adjusted_home',
    'special_teams_value_home',
    'passing_value_adjusted_away',
    'rushing_value_adjusted_away',
    'pass_def_value_adjusted_away',
    'rush_def_value_adjusted_away',
    'special_teams_value_away',
    'home_spread',
]

In [291]:
# Convert each split into a NumPy feature matrix (feature_list columns)
# and a label vector (home_cover)

train_x, train_y = train_df[feature_list].to_numpy(), train_df['home_cover'].to_numpy()

val_x, val_y = val_df[feature_list].to_numpy(), val_df['home_cover'].to_numpy()

test_x, test_y = test_df[feature_list].to_numpy(), test_df['home_cover'].to_numpy()

In [292]:
# Logistic Regression Modeling

# Baseline: scikit-learn default settings, fitted on the training split
model = LogisticRegression()
model.fit(train_x, train_y)

# Hard predictions for both splits
train_preds = model.predict(train_x)
val_preds = model.predict(val_x)

# Class probabilities for both splits
train_probs = model.predict_proba(train_x)
val_probs = model.predict_proba(val_x)

print(f'Training Accuracy: {np.round(accuracy_score(train_preds, train_y), 3)}')
print(f'Validation Accuracy: {np.round(accuracy_score(val_preds, val_y), 3)}')

Training Accuracy: 0.56
Validation Accuracy: 0.547


In [293]:
# Logistic Regression, with hyperparameter tuning

def logistic_regression_tuning(train_x, train_y, verbose=True, val_x=None, val_y=None,
                               random_state=None):
    """Grid-search LogisticRegression settings and select by validation accuracy.

    Every supported (penalty, C, solver) combination is fitted on the training
    data and scored on the validation data; the first combination reaching the
    highest validation accuracy wins. A summary of the winner is printed.

    Args:
        train_x: Training feature matrix.
        train_y: Training labels.
        verbose: When True, print the train/validation accuracy of every fit.
        val_x: Validation feature matrix. When omitted, the notebook-level
            variable ``val_x`` is used so existing calls keep working.
        val_y: Validation labels. When omitted, the notebook-level variable
            ``val_y`` is used.
        random_state: Passed to every LogisticRegression; the default (None)
            leaves the estimator unseeded.

    Returns:
        ``[penalty, C, solver]`` for the selected model.

    Raises:
        NameError: If validation data is neither passed in nor defined at
            notebook level.
    """
    # Resolve validation data explicitly instead of silently reading globals.
    if val_x is None or val_y is None:
        try:
            val_x = globals()['val_x'] if val_x is None else val_x
            val_y = globals()['val_y'] if val_y is None else val_y
        except KeyError as exc:
            raise NameError(
                'validation data was not passed and {} is not defined'.format(exc)) from exc

    # Hyperparameters to tune
    # NOTE(review): the string 'none' is only accepted by some scikit-learn
    # releases -- confirm against the installed version.
    penalty_list = ['none', 'l2', 'l1']
    c_values = [100, 10, 1.0, 0.1, 0.01]
    solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

    # Penalties that are skipped for a given solver; 'saga' is tried with all of them.
    skipped_penalties = {
        'newton-cg': ('l1', 'elasticnet'),
        'lbfgs': ('l1', 'elasticnet'),
        'sag': ('l1', 'elasticnet'),
        'liblinear': ('none', 'elasticnet'),
    }

    train_acc_chosen_model = 0
    best_val_acc = 0
    best_params = None

    for penalty in penalty_list:
        for c in c_values:
            for solver in solvers:

                if penalty in skipped_penalties.get(solver, ()):
                    continue

                lr_model = LogisticRegression(
                    penalty=penalty, solver=solver, C=c, random_state=random_state
                ).fit(train_x, train_y)
                train_acc = accuracy_score(train_y, lr_model.predict(train_x))
                val_acc = accuracy_score(val_y, lr_model.predict(val_x))

                if verbose:
                    print('Train Accuracy: {}, Validation Accuracy:{}'.format(train_acc, val_acc))

                # Always record the first fit so a result exists even when every
                # validation accuracy is 0; afterwards require a strict improvement.
                if best_params is None or val_acc > best_val_acc:
                    best_val_acc = val_acc
                    train_acc_chosen_model = train_acc
                    best_params = [penalty, c, solver]

    print('Best Model:')
    print('Training Accuracy: {}'.format(train_acc_chosen_model))
    print('Validation Accuracy: {}'.format(best_val_acc))
    print('Best Parameters: {}'.format(best_params))

    return best_params

In [294]:
# Run the logistic-regression search quietly; only the summary of the best model is printed
best_params_lr = logistic_regression_tuning(train_x, train_y, verbose = False)



Best Model:
Training Accuracy: 0.5251469627694317
Validation Accuracy: 0.5510204081632653
Best Parameters: ['l1', 0.1, 'saga']


In [295]:
# Save best logistic regression model

# best_params_lr is [penalty, C, solver] as returned by logistic_regression_tuning
best_penalty, best_c, best_solver = best_params_lr

# Refit with the tuned hyperparameters and persist exactly that model. The tuned
# fit must not be replaced by a default LogisticRegression() before saving.
best_lr_model = LogisticRegression(penalty=best_penalty, C=best_c, solver=best_solver).fit(train_x, train_y)
dump(best_lr_model, 'saved_models/ps_logistic_regression_av.joblib')

['saved_models/ps_logistic_regression_av.joblib']

In [296]:
# Random Forest Modeling

# Baseline forest with scikit-learn defaults; no random_state is passed
model = RandomForestClassifier()
model.fit(train_x, train_y)

# Hard predictions for both splits
train_preds = model.predict(train_x)
val_preds = model.predict(val_x)

# Class probabilities for both splits
train_probs = model.predict_proba(train_x)
val_probs = model.predict_proba(val_x)

print(f'Training Accuracy: {np.round(accuracy_score(train_preds, train_y), 3)}')
print(f'Validation Accuracy: {np.round(accuracy_score(val_preds, val_y), 3)}')

Training Accuracy: 1.0
Validation Accuracy: 0.527


In [297]:
# Random Forest, with hyperparameter tuning

def random_forest_tuning(train_x, train_y, verbose=True, val_x=None, val_y=None,
                         random_state=None):
    """Grid-search RandomForestClassifier settings and select by validation accuracy.

    Every (n_estimators, criterion, max_depth) combination is fitted on the
    training data and scored on the validation data; the first combination
    reaching the highest validation accuracy wins. A summary of the winner is
    printed.

    Args:
        train_x: Training feature matrix.
        train_y: Training labels.
        verbose: When True, print accuracies and parameters of every fit.
        val_x: Validation feature matrix. When omitted, the notebook-level
            variable ``val_x`` is used so existing calls keep working.
        val_y: Validation labels. When omitted, the notebook-level variable
            ``val_y`` is used.
        random_state: Passed to every RandomForestClassifier; the default
            (None) leaves the forests unseeded.

    Returns:
        Tuple ``(best_params, best_model)`` where ``best_params`` is
        ``[n_estimators, criterion, max_depth]`` and ``best_model`` is the
        fitted classifier that achieved the selected validation accuracy.

    Raises:
        NameError: If validation data is neither passed in nor defined at
            notebook level.
    """
    # Resolve validation data explicitly instead of silently reading globals.
    if val_x is None or val_y is None:
        try:
            val_x = globals()['val_x'] if val_x is None else val_x
            val_y = globals()['val_y'] if val_y is None else val_y
        except KeyError as exc:
            raise NameError(
                'validation data was not passed and {} is not defined'.format(exc)) from exc

    # Hyperparameters to tune
    estimators_list = [50, 100, 200, 500]
    criterions = ['gini', 'entropy', 'log_loss']
    depths = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None]  # None = no depth limit passed

    train_acc_chosen_model = 0
    best_val_acc = 0
    best_params = None
    best_model = None

    for n_estimators in estimators_list:
        for criterion in criterions:
            for depth in depths:

                rf_model = RandomForestClassifier(
                    n_estimators=n_estimators, criterion=criterion, max_depth=depth,
                    random_state=random_state).fit(train_x, train_y)
                train_acc = accuracy_score(train_y, rf_model.predict(train_x))
                val_acc = accuracy_score(val_y, rf_model.predict(val_x))

                if verbose:
                    print('Train Accuracy: {}, Validation Accuracy:{}, Params: {}'.format(
                        train_acc, val_acc, [n_estimators, criterion, depth]))

                # Always record the first fit so a result exists even when every
                # validation accuracy is 0; afterwards require a strict improvement.
                if best_params is None or val_acc > best_val_acc:
                    best_val_acc = val_acc
                    train_acc_chosen_model = train_acc
                    best_params = [n_estimators, criterion, depth]
                    best_model = rf_model

    print('Best Model:')
    print('Training Accuracy: {}'.format(train_acc_chosen_model))
    print('Validation Accuracy: {}'.format(best_val_acc))
    print('Best Parameters: {}'.format(best_params))

    return best_params, best_model

In [298]:
# Run the random-forest search, printing every fit; keeps both the winning
# parameter list and the fitted model that produced it
best_params_rf, best_rf_model = random_forest_tuning(train_x, train_y, verbose = True)

Train Accuracy: 0.5453951665578054, Validation Accuracy:0.5224489795918368, Params: [50, 'gini', 1]
Train Accuracy: 0.6002612671456564, Validation Accuracy:0.5265306122448979, Params: [50, 'gini', 2]
Train Accuracy: 0.6329196603527106, Validation Accuracy:0.5183673469387755, Params: [50, 'gini', 3]
Train Accuracy: 0.6903984323971261, Validation Accuracy:0.5102040816326531, Params: [50, 'gini', 4]
Train Accuracy: 0.7491835401698237, Validation Accuracy:0.5224489795918368, Params: [50, 'gini', 5]
Train Accuracy: 0.8131939908556499, Validation Accuracy:0.5142857142857142, Params: [50, 'gini', 6]
Train Accuracy: 0.8634879163945134, Validation Accuracy:0.5428571428571428, Params: [50, 'gini', 7]
Train Accuracy: 0.9229261920313521, Validation Accuracy:0.49795918367346936, Params: [50, 'gini', 8]
Train Accuracy: 0.9451338994121489, Validation Accuracy:0.5020408163265306, Params: [50, 'gini', 9]
Train Accuracy: 0.9634225996080993, Validation Accuracy:0.5591836734693878, Params: [50, 'gini', 10

Train Accuracy: 0.7328543435662965, Validation Accuracy:0.5102040816326531, Params: [200, 'entropy', 5]
Train Accuracy: 0.8131939908556499, Validation Accuracy:0.5102040816326531, Params: [200, 'entropy', 6]
Train Accuracy: 0.8706727629000653, Validation Accuracy:0.5183673469387755, Params: [200, 'entropy', 7]
Train Accuracy: 0.9085564990202482, Validation Accuracy:0.5265306122448979, Params: [200, 'entropy', 8]
Train Accuracy: 0.9366427171783148, Validation Accuracy:0.5061224489795918, Params: [200, 'entropy', 9]
Train Accuracy: 0.9588504245591117, Validation Accuracy:0.5428571428571428, Params: [200, 'entropy', 10]
Train Accuracy: 1.0, Validation Accuracy:0.5061224489795918, Params: [200, 'entropy', None]
Train Accuracy: 0.5382103200522534, Validation Accuracy:0.5224489795918368, Params: [200, 'log_loss', 1]
Train Accuracy: 0.5826257348138472, Validation Accuracy:0.5061224489795918, Params: [200, 'log_loss', 2]
Train Accuracy: 0.6355323318092749, Validation Accuracy:0.493877551020408

In [299]:
# Save best random forest model

# Unpack the random-forest search result ([n_estimators, criterion, max_depth]).
# These must come from best_params_rf, not the logistic-regression result.
best_n_estimators, best_criterion, best_depth = best_params_rf

# Persist the fitted forest returned by the search (no refit is done here)
dump(best_rf_model, 'saved_models/ps_random_forest_av.joblib')

['saved_models/ps_random_forest_av.joblib']

In [300]:
# Total of the predicted labels on the test features (the number of 1 predictions for 0/1 labels)
best_rf_model.predict(test_x).sum()

77

In [301]:
# Display the number of trees configured on the selected forest
best_rf_model.n_estimators

50