In [1]:
# Import packages

import pandas as pd 
import numpy as np 
import os

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

import xgboost as xgb

from joblib import dump, load

pd.set_option('display.max_columns', None)

  from pandas import MultiIndex, Int64Index


In [2]:
# stats = pd.read_csv('../../data/adjusted_value_models_combined_6_game_rolling.csv')
# Load the aggregated stats (any row containing a missing value is discarded),
# the game schedule, and the cleaned betting data.
stats = pd.read_csv('../../pipeline/data/07_adjusted_value_models_aggregated.csv').dropna()
schedule = pd.read_csv('../../data/schedule_final.csv')

# The betting file carries an 'Unnamed: 0' column (presumably a saved index); drop it on load.
betting = pd.read_csv('../../data/betting_data_cleaned_with_returns.csv').drop(columns=['Unnamed: 0'])

In [3]:
# Suffix every stats column so the home and away copies can sit side by side.
home_stats = stats.add_suffix('_home')
away_stats = stats.add_suffix('_away')

# Left joins keep every scheduled game; games with no matching stats row
# end up with NaN in the stat columns.
nfl = schedule.copy().merge(
    home_stats,
    how='left',
    left_on=['season', 'week', 'home', 'home_qb_abv'],
    right_on=['season_home', 'week_home', 'team_full_home', 'qb_home'],
)
nfl = nfl.merge(
    away_stats,
    how='left',
    left_on=['season', 'week', 'away', 'away_qb_abv'],
    right_on=['season_away', 'week_away', 'team_full_away', 'qb_away'],
)

# Columns kept for modelling, in display order.
game_info_cols = [
    'date', 'season', 'week', 'season_type', 'home', 'away',
    'home_score', 'away_score', 'home_qb', 'away_qb',
]
home_value_cols = [
    'passing_value_adjusted_home',
    'rushing_value_adjusted_home',
    'pass_def_value_adjusted_home',
    'rush_def_value_adjusted_home',
    'special_teams_value_home',
]
away_value_cols = [
    'passing_value_adjusted_away',
    'rushing_value_adjusted_away',
    'pass_def_value_adjusted_away',
    'rush_def_value_adjusted_away',
    'special_teams_value_away',
]
pace_cols = [
    'total_possession_time_standardized_home', 'total_possession_time_standardized_away',
    'total_plays_standardized_home', 'total_plays_standardized_away',
    'pass_percentage_standardized_home', 'pass_percentage_standardized_away',
]

nfl = nfl[game_info_cols + home_value_cols + away_value_cols + pace_cols]
nfl.head()

Unnamed: 0,date,season,week,season_type,home,away,home_score,away_score,home_qb,away_qb,passing_value_adjusted_home,rushing_value_adjusted_home,pass_def_value_adjusted_home,rush_def_value_adjusted_home,special_teams_value_home,passing_value_adjusted_away,rushing_value_adjusted_away,pass_def_value_adjusted_away,rush_def_value_adjusted_away,special_teams_value_away,total_possession_time_standardized_home,total_possession_time_standardized_away,total_plays_standardized_home,total_plays_standardized_away,pass_percentage_standardized_home,pass_percentage_standardized_away
0,2014-09-04,2014,1,REG,Seattle Seahawks,Green Bay Packers,36,16,Russell Wilson,Aaron Rodgers,,,,,,,,,,,,,,,,
1,2014-09-07,2014,1,REG,Baltimore Ravens,Cincinnati Bengals,16,23,Joe Flacco,Andy Dalton,,,,,,,,,,,,,,,,
2,2014-09-07,2014,1,REG,Houston Texans,Washington Football Team,17,6,Ryan Fitzpatrick,Robert Griffin,,,,,,,,,,,,,,,,
3,2014-09-07,2014,1,REG,Chicago Bears,Buffalo Bills,20,23,Jay Cutler,EJ Manuel,,,,,,,,,,,,,,,,
4,2014-09-07,2014,1,REG,Pittsburgh Steelers,Cleveland Browns,30,27,Ben Roethlisberger,Brian Hoyer,,,,,,,,,,,,,,,,


In [4]:
# Attach the betting data to each game to get point spreads.
# This is pandas' default inner join, so games without a matching betting row are dropped.
nfl_cleaned_with_betting = pd.merge(
    nfl,
    betting,
    left_on=['date', 'home', 'away'],
    right_on=['date', 'home_team', 'away_team'],
)

In [5]:
# Remove any pushes, add binary label for home cover.
#
# The label is computed with a vectorised comparison rather than a row-wise
# apply(axis=1): same values (1 when spread_cover_result == 'home', else 0),
# without calling a Python lambda once per game.
is_push = nfl_cleaned_with_betting['spread_cover_result'] == 'push'

nfl_cleaned_with_betting_final = (
    nfl_cleaned_with_betting
    .loc[~is_push]  # rows with a missing result compare False to 'push' and are kept
    .assign(home_cover=lambda games: (games['spread_cover_result'] == 'home').astype('int64'))
)
nfl_cleaned_with_betting_final

Unnamed: 0,date,season,week,season_type,home,away,home_score_x,away_score_x,home_qb,away_qb,passing_value_adjusted_home,rushing_value_adjusted_home,pass_def_value_adjusted_home,rush_def_value_adjusted_home,special_teams_value_home,passing_value_adjusted_away,rushing_value_adjusted_away,pass_def_value_adjusted_away,rush_def_value_adjusted_away,special_teams_value_away,total_possession_time_standardized_home,total_possession_time_standardized_away,total_plays_standardized_home,total_plays_standardized_away,pass_percentage_standardized_home,pass_percentage_standardized_away,home_team,away_team,home_score_y,away_score_y,total_score_actual,home_moneyline,away_moneyline,home_spread,away_spread,total_score_line,over_under_result,home_team_actual_line,away_team_actual_line,spread_cover_result,home_implied_prob,away_implied_prob,game_winner,favorite_return,underdog_return,home_return,away_return,favorite_spread_return,underdog_spread_return,home_spread_return,away_spread_return,over_return,under_return,home_cover
0,2014-09-04,2014,1,REG,Seattle Seahawks,Green Bay Packers,36,16,Russell Wilson,Aaron Rodgers,,,,,,,,,,,,,,,,,Seattle Seahawks,Green Bay Packers,36,16,52,-230.0,205.0,-4.5,4.5,46.5,over,-20,20,home,0.696970,0.327869,home,43.48,-100.0,43.48,-100.0,90.91,-100.00,90.91,-100.00,90.91,-100.00,1
1,2014-09-07,2014,1,REG,Baltimore Ravens,Cincinnati Bengals,16,23,Joe Flacco,Andy Dalton,,,,,,,,,,,,,,,,,Baltimore Ravens,Cincinnati Bengals,16,23,39,-113.0,102.0,-1.0,1.0,43.5,under,7,-7,away,0.530516,0.495050,away,-100.00,102.0,-100.00,102.0,-100.00,90.91,-100.00,90.91,-100.00,90.91,0
2,2014-09-07,2014,1,REG,Houston Texans,Washington Football Team,17,6,Ryan Fitzpatrick,Robert Griffin,,,,,,,,,,,,,,,,,Houston Texans,Washington Football Team,17,6,23,-177.0,159.0,-3.0,3.0,43.5,under,-11,11,home,0.638989,0.386100,home,56.50,-100.0,56.50,-100.0,90.91,-100.00,90.91,-100.00,-100.00,90.91,1
3,2014-09-07,2014,1,REG,Chicago Bears,Buffalo Bills,20,23,Jay Cutler,EJ Manuel,,,,,,,,,,,,,,,,,Chicago Bears,Buffalo Bills,20,23,43,-270.0,239.0,-7.0,7.0,47.0,under,3,-3,away,0.729730,0.294985,away,-100.00,239.0,-100.00,239.0,-100.00,90.91,-100.00,90.91,-100.00,90.91,0
4,2014-09-07,2014,1,REG,Pittsburgh Steelers,Cleveland Browns,30,27,Ben Roethlisberger,Brian Hoyer,,,,,,,,,,,,,,,,,Pittsburgh Steelers,Cleveland Browns,30,27,57,-242.0,216.0,-5.5,5.5,41.5,over,-3,3,away,0.707602,0.316456,home,41.32,-100.0,41.32,-100.0,-100.00,90.91,-100.00,90.91,90.91,-100.00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2402,2023-01-22,2022,20,POST,Buffalo Bills,Cincinnati Bengals,10,27,Josh Allen,Joe Burrow,-0.151187,0.650933,0.483813,0.589479,-0.215688,0.002339,-0.152032,-0.140611,1.049708,0.204819,-0.153360,0.421317,-0.011858,0.274171,-0.007230,0.889427,Buffalo Bills,Cincinnati Bengals,10,27,37,-260.0,215.0,-6.0,6.0,48.5,under,17,-17,away,0.722222,0.317460,away,-100.00,215.0,-100.00,215.0,-100.00,90.91,-100.00,90.91,-100.00,90.91,0
2403,2023-01-22,2022,20,POST,San Francisco 49ers,Dallas Cowboys,19,12,Brock Purdy,Dak Prescott,1.271880,0.634522,0.303955,0.348498,0.331705,0.563132,-0.743069,-0.104332,0.297954,-0.500316,0.342732,0.301391,-0.247412,0.812579,-0.969892,-0.382391,San Francisco 49ers,Dallas Cowboys,19,12,31,-200.0,170.0,-3.5,3.5,46.5,under,-7,7,home,0.666667,0.370370,home,50.00,-100.0,50.00,-100.0,90.91,-100.00,90.91,-100.00,-100.00,90.91,1
2404,2023-01-29,2022,21,POST,Philadelphia Eagles,San Francisco 49ers,31,7,Jalen Hurts,Brock Purdy,0.131127,1.616761,0.742787,-0.131826,0.329906,,,,,,0.560610,,0.672368,,-0.941733,,Philadelphia Eagles,San Francisco 49ers,31,7,38,-155.0,135.0,-3.0,3.0,45.0,under,-24,24,home,0.607843,0.425532,home,64.52,-100.0,64.52,-100.0,90.91,-100.00,90.91,-100.00,-100.00,90.91,1
2405,2023-01-29,2022,21,POST,Kansas City Chiefs,Cincinnati Bengals,23,20,Patrick Mahomes,Joe Burrow,0.474375,0.512161,0.781146,-0.771093,-0.504759,0.363199,0.164351,-0.080416,0.555210,0.215284,-0.141815,0.625788,-0.449315,0.520941,-0.018159,0.405253,Kansas City Chiefs,Cincinnati Bengals,23,20,43,-130.0,110.0,-2.0,2.0,48.5,under,-3,3,home,0.565217,0.476190,home,76.92,-100.0,76.92,-100.0,90.91,-100.00,90.91,-100.00,-100.00,90.91,1


In [6]:
# Persist the labelled modelling frame (the DataFrame index is written as the first CSV column).
base_df_path = '../../data/adjusted_stats_point_spread_modeling_base_df.csv'
nfl_cleaned_with_betting_final.to_csv(base_df_path)

In [7]:
# Split by season: everything through 2021 is used for training (and cross validation
# for hyperparameter tuning); the 2022 season is held out as the test set.
# Rows with a missing value in any column are discarded from both sets.
season_col = nfl_cleaned_with_betting_final['season']

train_df = nfl_cleaned_with_betting_final.loc[season_col <= 2021].dropna()
test_df = nfl_cleaned_with_betting_final.loc[season_col == 2022].dropna()

In [8]:
# NOTE
# Don't know if I need to balance this data set at all; point spread betting should be pretty close to 50/50


# Balance training data set

# Note: Can't balance data set and then do Cross Validation due to data leakage

# Fully balancing the data set doesn't work as well for prediction as leaving it unbalanced, likely because the
# home team does actually have an important advantage. But balancing the data set slightly may help.

# balance_n = int(np.round((sum(train_df.home_win==1) - sum(train_df.home_win==0))/1.5))
# to_add = train_df[train_df.home_win==0].sample(n=balance_n, random_state=57, replace=True)

# train_df_balanced = pd.concat([train_df, to_add])
# train_df_balanced

In [9]:
# Randomise the row order of the training set before cross validation.
# The seed is fixed so the shuffle is the same on every run.
shuffle_seed = 42
train_df_shuffled = train_df.sample(frac=1, random_state=shuffle_seed)
train_df_shuffled.head()

Unnamed: 0,date,season,week,season_type,home,away,home_score_x,away_score_x,home_qb,away_qb,passing_value_adjusted_home,rushing_value_adjusted_home,pass_def_value_adjusted_home,rush_def_value_adjusted_home,special_teams_value_home,passing_value_adjusted_away,rushing_value_adjusted_away,pass_def_value_adjusted_away,rush_def_value_adjusted_away,special_teams_value_away,total_possession_time_standardized_home,total_possession_time_standardized_away,total_plays_standardized_home,total_plays_standardized_away,pass_percentage_standardized_home,pass_percentage_standardized_away,home_team,away_team,home_score_y,away_score_y,total_score_actual,home_moneyline,away_moneyline,home_spread,away_spread,total_score_line,over_under_result,home_team_actual_line,away_team_actual_line,spread_cover_result,home_implied_prob,away_implied_prob,game_winner,favorite_return,underdog_return,home_return,away_return,favorite_spread_return,underdog_spread_return,home_spread_return,away_spread_return,over_return,under_return,home_cover
109,2014-10-26,2014,8,REG,New England Patriots,Chicago Bears,51,23,Tom Brady,Jay Cutler,0.545729,-0.229171,-0.303151,-0.428371,0.468593,-0.325933,0.180907,-0.407935,-0.348726,-0.089622,-0.706809,0.025225,-0.079159,-0.286671,0.327396,0.507505,New England Patriots,Chicago Bears,51,23,74,-260.0,231.0,-6.0,6.0,51.5,over,-28,28,home,0.722222,0.302115,home,38.46,-100.0,38.46,-100.0,90.91,-100.0,90.91,-100.0,90.91,-100.0,1
1130,2018-10-21,2018,7,REG,Chicago Bears,New England Patriots,31,38,Mitchell Trubisky,Tom Brady,0.733866,0.223499,0.349366,0.309202,-0.143926,0.175294,0.60386,-0.025431,0.068549,-0.339206,0.556327,0.237331,-0.056726,0.52655,-0.555793,-0.262945,Chicago Bears,New England Patriots,31,38,69,105.0,-125.0,1.5,-1.5,48.5,over,7,-7,away,0.487805,0.555556,away,80.0,-100.0,-100.0,80.0,90.91,-100.0,-100.0,90.91,90.91,-100.0,0
2063,2021-12-23,2021,16,REG,Tennessee Titans,San Francisco 49ers,20,17,Ryan Tannehill,Jimmy Garoppolo,-0.866203,0.126611,0.376509,0.392349,-0.495853,0.682361,0.640189,0.131382,0.480717,-0.81373,1.10363,0.364706,0.851838,-0.292279,-0.864855,-0.724905,Tennessee Titans,San Francisco 49ers,20,17,37,150.0,-172.0,3.5,-3.5,45.5,under,-3,3,home,0.4,0.632353,home,-100.0,150.0,150.0,-100.0,-100.0,90.91,90.91,-100.0,-100.0,90.91,1
2036,2021-12-12,2021,14,REG,Carolina Panthers,Atlanta Falcons,21,29,Cam Newton,Matt Ryan,-0.859637,-0.263685,0.061029,0.370571,-0.222273,-0.536184,-0.187227,-0.367921,0.267332,-0.416394,-1.334745,-0.18539,-0.684868,-0.482965,0.589054,0.236843,Carolina Panthers,Atlanta Falcons,21,29,50,-147.0,125.0,-2.5,2.5,41.5,over,8,-8,away,0.595142,0.444444,away,-100.0,125.0,-100.0,125.0,-100.0,90.91,-100.0,90.91,90.91,-100.0,0
969,2017-12-10,2017,14,REG,Denver Broncos,New York Jets,23,0,Trevor Siemian,Josh McCown,-1.076691,-0.465095,0.015986,0.582976,-0.990004,0.375078,-0.079633,-0.395611,0.119256,-0.482455,-0.533996,0.990408,0.038617,1.059349,0.670828,-0.641916,Denver Broncos,New York Jets,23,0,23,-106.0,-104.0,1.0,-1.0,40.5,under,-23,23,home,0.514563,0.509804,home,94.34,-100.0,94.34,-100.0,-100.0,90.91,90.91,-100.0,-100.0,90.91,1


In [19]:
# Feature Selection
# Model inputs, in the column order used to build the feature matrices.

feature_list = [
    # Home team adjusted values
    'passing_value_adjusted_home',
    'rushing_value_adjusted_home',
    'pass_def_value_adjusted_home',
    'rush_def_value_adjusted_home',
    'special_teams_value_home',
    # Away team adjusted values
    'passing_value_adjusted_away',
    'rushing_value_adjusted_away',
    'pass_def_value_adjusted_away',
    'rush_def_value_adjusted_away',
    'special_teams_value_away',
    # Standardized possession time, play count and pass percentage
    'total_possession_time_standardized_home',
    'total_possession_time_standardized_away',
    'total_plays_standardized_home',
    'total_plays_standardized_away',
    'pass_percentage_standardized_home',
    'pass_percentage_standardized_away',
    # Betting line
    'home_spread',
]

In [20]:
# Get features, labels for train and test sets

# train_x = train_df_balanced[feature_list].to_numpy()
# train_y = train_df_balanced.home_win.to_numpy()

train_x = train_df_shuffled[feature_list].to_numpy()
train_y = train_df_shuffled.home_cover.to_numpy()

# Converted to a numpy array like train_x: the models are fitted on plain arrays,
# and predicting on a DataFrame afterwards makes scikit-learn warn that the input
# has feature names the estimator was fitted without.
test_x = test_df[feature_list].to_numpy()
test_y = test_df.home_cover.to_numpy()

In [21]:
# Quick look at the week column of the shuffled training set
train_df_shuffled['week']

109      8
1130     7
2063    16
2036    14
969     14
        ..
1353     4
1551    17
1036     1
1757    13
1349     4
Name: week, Length: 1776, dtype: int64

In [22]:
# Function to get predictions and probabilities for the train and test sets

def get_preds(model, train_x, test_x):
    """Run a fitted model over the train and test features.

    Args:
        model: object exposing ``predict`` and ``predict_proba``.
        train_x: training features, passed to the model unchanged.
        test_x: test features, passed to the model unchanged.

    Returns:
        Tuple ``(train_preds, train_probs, test_preds, test_probs)``.
    """
    outputs = []
    # Train set first, then test set; predictions before probabilities for each.
    for features in (train_x, test_x):
        outputs.append(model.predict(features))
        outputs.append(model.predict_proba(features))
    return tuple(outputs)

# Function to get accuracy scores for train, val sets

def print_cv_results(model, train_x, train_y, cv_folds=5, verbose=True):
    """Cross-validate ``model`` and print train/validation accuracy.

    Args:
        model: estimator handed to ``cross_validate``.
        train_x: training features.
        train_y: training labels.
        cv_folds: value forwarded as ``cv`` to ``cross_validate``. It was
            previously ignored in favour of a hard-coded 5; the default of 5
            keeps existing calls unchanged.
        verbose: when True, also print the scores of every individual fold.

    Returns:
        Tuple ``(mean_train_score, mean_validation_score)``.
    """
    cv_results = cross_validate(model, train_x, train_y, cv=cv_folds, return_train_score=True)

    train_scores = cv_results['train_score']
    # cross_validate reports the held-out fold under the 'test_score' key
    val_scores = cv_results['test_score']

    if verbose:
        for fold, (train_score, val_score) in enumerate(zip(train_scores, val_scores), start=1):
            print('Fold {}, Train Accuracy: {}, Validation Accuracy: {}'.format(fold, train_score, val_score))
        print()

    mean_train = np.mean(train_scores)
    mean_val = np.mean(val_scores)

    print('Average Training Accuracy: {}'.format(mean_train))
    print('Average Validation Accuracy: {}'.format(mean_val))

    return mean_train, mean_val

In [23]:
# Baseline: plain logistic regression, with the iteration cap raised to 100000
lr_model = LogisticRegression(max_iter=100000)

# 5-fold cross validation on the shuffled training data
print_cv_results(model=lr_model, train_x=train_x, train_y=train_y, cv_folds=5)

Fold 1, Train Accuracy: 0.543661971830986, Validation Accuracy: 0.49719101123595505
Fold 2, Train Accuracy: 0.5622800844475722, Validation Accuracy: 0.5098591549295775
Fold 3, Train Accuracy: 0.5404644616467277, Validation Accuracy: 0.5183098591549296
Fold 4, Train Accuracy: 0.5573539760731879, Validation Accuracy: 0.5070422535211268
Fold 5, Train Accuracy: 0.5503166783954961, Validation Accuracy: 0.5183098591549296

Average Training Accuracy: 0.550815434478794
Average Validation Accuracy: 0.5101424275993036


(0.550815434478794, 0.5101424275993036)

In [24]:
# Default Random Forest Classifier with no hyperparameter tuning.
# random_state is pinned so the fold scores are identical on every run;
# an unseeded forest gives different accuracies each time the cell executes.

rf_model = RandomForestClassifier(random_state=42)

print_cv_results(rf_model, train_x, train_y, 5)

Fold 1, Train Accuracy: 1.0, Validation Accuracy: 0.4887640449438202
Fold 2, Train Accuracy: 1.0, Validation Accuracy: 0.5014084507042254
Fold 3, Train Accuracy: 1.0, Validation Accuracy: 0.5352112676056338
Fold 4, Train Accuracy: 1.0, Validation Accuracy: 0.5295774647887324
Fold 5, Train Accuracy: 1.0, Validation Accuracy: 0.48169014084507045

Average Training Accuracy: 1.0
Average Validation Accuracy: 0.5073302737774965


(1.0, 0.5073302737774965)

In [26]:
# Default XGBoost Model with no hyperparameter tuning

import warnings

# Warnings are silenced only while the XGBoost model is built and cross-validated.
# A bare warnings.filterwarnings('ignore') would stay active for the rest of the
# kernel session and hide warnings raised by every later cell.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    xgb_model = xgb.XGBClassifier(verbosity=0)
    xgb_cv_scores = print_cv_results(xgb_model, train_x, train_y, 5)

# Display the (mean train accuracy, mean validation accuracy) tuple as the cell output
xgb_cv_scores

Fold 1, Train Accuracy: 1.0, Validation Accuracy: 0.49719101123595505
Fold 2, Train Accuracy: 1.0, Validation Accuracy: 0.5098591549295775
Fold 3, Train Accuracy: 1.0, Validation Accuracy: 0.5464788732394367
Fold 4, Train Accuracy: 1.0, Validation Accuracy: 0.5098591549295775
Fold 5, Train Accuracy: 1.0, Validation Accuracy: 0.5267605633802817

Average Training Accuracy: 1.0
Average Validation Accuracy: 0.5180297515429656


(1.0, 0.5180297515429656)

In [27]:
# Logistic Regression, with hyperparameter tuning

def logistic_regression_tuning(train_x, train_y, verbose=True):
    """Grid-search LogisticRegression hyperparameters and print the outcome.

    The grid is a list of sub-grids so that only solver/penalty pairs that
    LogisticRegression accepts are fitted. A single Cartesian product would
    also try unsupported pairs (e.g. lbfgs with l1) and elasticnet without an
    l1_ratio; those fits fail and surface as NaN scores.

    Args:
        train_x: training features.
        train_y: training labels.
        verbose: when True, print train/validation scores for every
            parameter combination that was tried.

    Returns:
        None. Results are printed.
    """
    # Raised iteration cap so the iterative solvers have room to converge.
    lr_model = LogisticRegression(max_iter=100000)

    c_values = [100, 10, 1.0, 0.1, 0.01]

    # Hyperparameters to tune, grouped by penalty so each group lists only the
    # solvers that support it.
    param_grid = [
        # Unpenalised fit: C is not used, and liblinear does not support it.
        # NOTE: scikit-learn >= 1.2 spells this penalty=None; adjust when upgrading.
        {'penalty': ['none'], 'solver': ['newton-cg', 'lbfgs', 'sag', 'saga']},
        {'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 'C': c_values},
        {'penalty': ['l1'], 'solver': ['liblinear', 'saga'], 'C': c_values},
        # elasticnet is only implemented by saga and needs an explicit l1_ratio
        {'penalty': ['elasticnet'], 'solver': ['saga'], 'C': c_values, 'l1_ratio': [0.25, 0.5, 0.75]},
    ]

    clf = GridSearchCV(lr_model, param_grid, return_train_score=True).fit(train_x, train_y)

    print('Best Result: {}'.format(clf.best_score_))
    print('Best Parameters: {}'.format(clf.best_params_))
    print()

    train_scores = clf.cv_results_['mean_train_score']
    val_scores = clf.cv_results_['mean_test_score']
    param_list = clf.cv_results_['params']

    if verbose:
        print('Parameter Combinations and Results:')
        # 'combo' rather than 'params', so the grid definition is never shadowed
        for train_score, val_score, combo in zip(train_scores, val_scores, param_list):
            print('Train Score: {}, Val Score: {}, Parameters: {}'.format(train_score, val_score, combo))

In [28]:
# Can uncomment and run the call below to see the results, but hyperparameter tuning didn't change much.
# Can probably just roll with the default logistic regression.

# logistic_regression_tuning(train_x, train_y)

In [29]:
# Save best logistic regression model.
# Fitted with max_iter=100000, the same setting as the cross-validated baseline,
# so the persisted model is the configuration that was actually evaluated
# (the default iteration cap can stop before the solver has converged).

os.makedirs('saved_models', exist_ok=True)  # dump() does not create missing directories

best_lr_model = LogisticRegression(max_iter=100000).fit(train_x, train_y)
dump(best_lr_model, 'saved_models/ps_logistic_regression_av.joblib')

['saved_models/ps_logistic_regression_av.joblib']

In [30]:
# Save best random forest model.
# random_state is pinned so re-running the notebook persists the same forest
# instead of a different random one each time.

os.makedirs('saved_models', exist_ok=True)  # dump() does not create missing directories

best_rf_model = RandomForestClassifier(random_state=42).fit(train_x, train_y)
dump(best_rf_model, 'saved_models/ps_random_forest_av.joblib')

['saved_models/ps_random_forest_av.joblib']

In [31]:
# Save best xg boost model.
# verbosity=0 matches the XGBoost model that was cross-validated earlier in the notebook.

os.makedirs('saved_models', exist_ok=True)  # dump() does not create missing directories

best_xgb_model = xgb.XGBClassifier(verbosity=0).fit(train_x, train_y)
dump(best_xgb_model, 'saved_models/ps_xg_boost_av.joblib')

['saved_models/ps_xg_boost_av.joblib']