In [1]:
# Import packages

import pandas as pd 
import numpy as np 
import os

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

import xgboost as xgb

from joblib import dump, load

pd.set_option('display.max_columns', None)

  from pandas import MultiIndex, Int64Index


In [2]:
# Read the three input tables used by this notebook: stats, schedule and betting.

# Alternative stats file, currently disabled:
# stats = pd.read_csv('../../data/adjusted_value_models_combined_6_game_rolling.csv')

# Stats rows containing any missing value are discarded right after reading.
stats = pd.read_csv('../../pipeline/data/07_adjusted_value_models_aggregated.csv').dropna()

schedule = pd.read_csv('../../data/schedule_final.csv')

# 'Unnamed: 0' is removed on load (presumably a saved index column -- TODO confirm).
betting = pd.read_csv('../../data/betting_data_cleaned_with_returns.csv').drop(columns=['Unnamed: 0'])

In [4]:
# Attach the home side's and the away side's stats to every scheduled game.
# Both joins are left joins keyed on season, week, team name and quarterback
# abbreviation, so games without a matching stats row are kept with NaN values.
nfl = (
    schedule.copy()
    .merge(
        stats.add_suffix('_home'),
        how='left',
        left_on=['season', 'week', 'home', 'home_qb_abv'],
        right_on=['season_home', 'week_home', 'team_full_home', 'qb_home'],
    )
    .merge(
        stats.add_suffix('_away'),
        how='left',
        left_on=['season', 'week', 'away', 'away_qb_abv'],
        right_on=['season_away', 'week_away', 'team_full_away', 'qb_away'],
    )
)

# Keep only the game identifiers, scores, quarterbacks and the model inputs.
nfl = nfl[[
    # game identifiers, final scores and starting quarterbacks
    'date', 'season', 'week', 'season_type', 'home', 'away',
    'home_score', 'away_score', 'home_qb', 'away_qb',
    # adjusted value columns, home side
    'passing_value_adjusted_home', 'rushing_value_adjusted_home',
    'pass_def_value_adjusted_home', 'rush_def_value_adjusted_home',
    'special_teams_value_home',
    # adjusted value columns, away side
    'passing_value_adjusted_away', 'rushing_value_adjusted_away',
    'pass_def_value_adjusted_away', 'rush_def_value_adjusted_away',
    'special_teams_value_away',
    # standardized possession time / play count / pass percentage columns
    'total_possession_time_standardized_home', 'total_possession_time_standardized_away',
    'total_plays_standardized_home', 'total_plays_standardized_away',
    'pass_percentage_standardized_home', 'pass_percentage_standardized_away',
]]
nfl.head()

Unnamed: 0,date,season,week,season_type,home,away,home_score,away_score,home_qb,away_qb,passing_value_adjusted_home,rushing_value_adjusted_home,pass_def_value_adjusted_home,rush_def_value_adjusted_home,special_teams_value_home,passing_value_adjusted_away,rushing_value_adjusted_away,pass_def_value_adjusted_away,rush_def_value_adjusted_away,special_teams_value_away,total_possession_time_standardized_home,total_possession_time_standardized_away,total_plays_standardized_home,total_plays_standardized_away,pass_percentage_standardized_home,pass_percentage_standardized_away
0,2014-09-04,2014,1,REG,Seattle Seahawks,Green Bay Packers,36,16,Russell Wilson,Aaron Rodgers,,,,,,,,,,,,,,,,
1,2014-09-07,2014,1,REG,Baltimore Ravens,Cincinnati Bengals,16,23,Joe Flacco,Andy Dalton,,,,,,,,,,,,,,,,
2,2014-09-07,2014,1,REG,Houston Texans,Washington Football Team,17,6,Ryan Fitzpatrick,Robert Griffin,,,,,,,,,,,,,,,,
3,2014-09-07,2014,1,REG,Chicago Bears,Buffalo Bills,20,23,Jay Cutler,EJ Manuel,,,,,,,,,,,,,,,,
4,2014-09-07,2014,1,REG,Pittsburgh Steelers,Cleveland Browns,30,27,Ben Roethlisberger,Brian Hoyer,,,,,,,,,,,,,,,,


In [5]:
# Attach the betting lines and results to each game.
# This is an inner join (the pandas default), so a game is kept only when the
# betting table has a row with the same date, home team and away team.
nfl_cleaned_with_betting = pd.merge(
    nfl,
    betting,
    how='inner',
    left_on=['date', 'home', 'away'],
    right_on=['date', 'home_team', 'away_team'],
)

In [6]:
# Remove any pushes, then add the binary label for the over/under model:
# over_hits is 1 when over_under_result == 'over' and 0 otherwise.
#
# The label is computed with a vectorized comparison instead of a row-wise
# apply, and .assign() returns a brand-new frame, so later cells can add
# columns to the result without writing into a filtered slice of the merged
# table (which is what raises pandas' SettingWithCopyWarning).
nfl_cleaned_with_betting_final = (
    nfl_cleaned_with_betting
    .loc[nfl_cleaned_with_betting.over_under_result != 'push']
    .assign(over_hits=lambda games: (games.over_under_result == 'over').astype('int64'))
)
nfl_cleaned_with_betting_final

Unnamed: 0,date,season,week,season_type,home,away,home_score_x,away_score_x,home_qb,away_qb,passing_value_adjusted_home,rushing_value_adjusted_home,pass_def_value_adjusted_home,rush_def_value_adjusted_home,special_teams_value_home,passing_value_adjusted_away,rushing_value_adjusted_away,pass_def_value_adjusted_away,rush_def_value_adjusted_away,special_teams_value_away,total_possession_time_standardized_home,total_possession_time_standardized_away,total_plays_standardized_home,total_plays_standardized_away,pass_percentage_standardized_home,pass_percentage_standardized_away,home_team,away_team,home_score_y,away_score_y,total_score_actual,home_moneyline,away_moneyline,home_spread,away_spread,total_score_line,over_under_result,home_team_actual_line,away_team_actual_line,spread_cover_result,home_implied_prob,away_implied_prob,game_winner,favorite_return,underdog_return,home_return,away_return,favorite_spread_return,underdog_spread_return,home_spread_return,away_spread_return,over_return,under_return,over_hits
0,2014-09-04,2014,1,REG,Seattle Seahawks,Green Bay Packers,36,16,Russell Wilson,Aaron Rodgers,,,,,,,,,,,,,,,,,Seattle Seahawks,Green Bay Packers,36,16,52,-230.0,205.0,-4.5,4.5,46.5,over,-20,20,home,0.696970,0.327869,home,43.48,-100.0,43.48,-100.0,90.91,-100.00,90.91,-100.00,90.91,-100.00,1
1,2014-09-07,2014,1,REG,Baltimore Ravens,Cincinnati Bengals,16,23,Joe Flacco,Andy Dalton,,,,,,,,,,,,,,,,,Baltimore Ravens,Cincinnati Bengals,16,23,39,-113.0,102.0,-1.0,1.0,43.5,under,7,-7,away,0.530516,0.495050,away,-100.00,102.0,-100.00,102.0,-100.00,90.91,-100.00,90.91,-100.00,90.91,0
2,2014-09-07,2014,1,REG,Houston Texans,Washington Football Team,17,6,Ryan Fitzpatrick,Robert Griffin,,,,,,,,,,,,,,,,,Houston Texans,Washington Football Team,17,6,23,-177.0,159.0,-3.0,3.0,43.5,under,-11,11,home,0.638989,0.386100,home,56.50,-100.0,56.50,-100.0,90.91,-100.00,90.91,-100.00,-100.00,90.91,0
3,2014-09-07,2014,1,REG,Chicago Bears,Buffalo Bills,20,23,Jay Cutler,EJ Manuel,,,,,,,,,,,,,,,,,Chicago Bears,Buffalo Bills,20,23,43,-270.0,239.0,-7.0,7.0,47.0,under,3,-3,away,0.729730,0.294985,away,-100.00,239.0,-100.00,239.0,-100.00,90.91,-100.00,90.91,-100.00,90.91,0
4,2014-09-07,2014,1,REG,Pittsburgh Steelers,Cleveland Browns,30,27,Ben Roethlisberger,Brian Hoyer,,,,,,,,,,,,,,,,,Pittsburgh Steelers,Cleveland Browns,30,27,57,-242.0,216.0,-5.5,5.5,41.5,over,-3,3,away,0.707602,0.316456,home,41.32,-100.0,41.32,-100.0,-100.00,90.91,-100.00,90.91,90.91,-100.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2402,2023-01-22,2022,20,POST,Buffalo Bills,Cincinnati Bengals,10,27,Josh Allen,Joe Burrow,-0.151187,0.650933,0.483813,0.589479,-0.215688,0.002339,-0.152032,-0.140611,1.049708,0.204819,-0.153360,0.421317,-0.011858,0.274171,-0.007230,0.889427,Buffalo Bills,Cincinnati Bengals,10,27,37,-260.0,215.0,-6.0,6.0,48.5,under,17,-17,away,0.722222,0.317460,away,-100.00,215.0,-100.00,215.0,-100.00,90.91,-100.00,90.91,-100.00,90.91,0
2403,2023-01-22,2022,20,POST,San Francisco 49ers,Dallas Cowboys,19,12,Brock Purdy,Dak Prescott,1.271880,0.634522,0.303955,0.348498,0.331705,0.563132,-0.743069,-0.104332,0.297954,-0.500316,0.342732,0.301391,-0.247412,0.812579,-0.969892,-0.382391,San Francisco 49ers,Dallas Cowboys,19,12,31,-200.0,170.0,-3.5,3.5,46.5,under,-7,7,home,0.666667,0.370370,home,50.00,-100.0,50.00,-100.0,90.91,-100.00,90.91,-100.00,-100.00,90.91,0
2404,2023-01-29,2022,21,POST,Philadelphia Eagles,San Francisco 49ers,31,7,Jalen Hurts,Brock Purdy,0.131127,1.616761,0.742787,-0.131826,0.329906,,,,,,0.560610,,0.672368,,-0.941733,,Philadelphia Eagles,San Francisco 49ers,31,7,38,-155.0,135.0,-3.0,3.0,45.0,under,-24,24,home,0.607843,0.425532,home,64.52,-100.0,64.52,-100.0,90.91,-100.00,90.91,-100.00,-100.00,90.91,0
2405,2023-01-29,2022,21,POST,Kansas City Chiefs,Cincinnati Bengals,23,20,Patrick Mahomes,Joe Burrow,0.474375,0.512161,0.781146,-0.771093,-0.504759,0.363199,0.164351,-0.080416,0.555210,0.215284,-0.141815,0.625788,-0.449315,0.520941,-0.018159,0.405253,Kansas City Chiefs,Cincinnati Bengals,23,20,43,-130.0,110.0,-2.0,2.0,48.5,under,-3,3,home,0.565217,0.476190,home,76.92,-100.0,76.92,-100.0,90.91,-100.00,90.91,-100.00,-100.00,90.91,0


In [7]:
# Standardize total score variable (z-score of the betting total line).
#
# The mean and the population standard deviation (np.std, ddof=0) are taken
# over every row of the table, i.e. over all seasons present in it.
total_score_mean = np.mean(nfl_cleaned_with_betting_final.total_score_line)
total_score_std = np.std(nfl_cleaned_with_betting_final.total_score_line)

# Vectorized column arithmetic replaces the row-wise apply; .assign() builds a
# new frame rather than writing a column into a possibly filtered slice.
nfl_cleaned_with_betting_final = nfl_cleaned_with_betting_final.assign(
    total_score_standardized=(
        (nfl_cleaned_with_betting_final.total_score_line - total_score_mean) / total_score_std
    )
)
nfl_cleaned_with_betting_final

Unnamed: 0,date,season,week,season_type,home,away,home_score_x,away_score_x,home_qb,away_qb,passing_value_adjusted_home,rushing_value_adjusted_home,pass_def_value_adjusted_home,rush_def_value_adjusted_home,special_teams_value_home,passing_value_adjusted_away,rushing_value_adjusted_away,pass_def_value_adjusted_away,rush_def_value_adjusted_away,special_teams_value_away,total_possession_time_standardized_home,total_possession_time_standardized_away,total_plays_standardized_home,total_plays_standardized_away,pass_percentage_standardized_home,pass_percentage_standardized_away,home_team,away_team,home_score_y,away_score_y,total_score_actual,home_moneyline,away_moneyline,home_spread,away_spread,total_score_line,over_under_result,home_team_actual_line,away_team_actual_line,spread_cover_result,home_implied_prob,away_implied_prob,game_winner,favorite_return,underdog_return,home_return,away_return,favorite_spread_return,underdog_spread_return,home_spread_return,away_spread_return,over_return,under_return,over_hits,total_score_standardized
0,2014-09-04,2014,1,REG,Seattle Seahawks,Green Bay Packers,36,16,Russell Wilson,Aaron Rodgers,,,,,,,,,,,,,,,,,Seattle Seahawks,Green Bay Packers,36,16,52,-230.0,205.0,-4.5,4.5,46.5,over,-20,20,home,0.696970,0.327869,home,43.48,-100.0,43.48,-100.0,90.91,-100.00,90.91,-100.00,90.91,-100.00,1,0.185545
1,2014-09-07,2014,1,REG,Baltimore Ravens,Cincinnati Bengals,16,23,Joe Flacco,Andy Dalton,,,,,,,,,,,,,,,,,Baltimore Ravens,Cincinnati Bengals,16,23,39,-113.0,102.0,-1.0,1.0,43.5,under,7,-7,away,0.530516,0.495050,away,-100.00,102.0,-100.00,102.0,-100.00,90.91,-100.00,90.91,-100.00,90.91,0,-0.514703
2,2014-09-07,2014,1,REG,Houston Texans,Washington Football Team,17,6,Ryan Fitzpatrick,Robert Griffin,,,,,,,,,,,,,,,,,Houston Texans,Washington Football Team,17,6,23,-177.0,159.0,-3.0,3.0,43.5,under,-11,11,home,0.638989,0.386100,home,56.50,-100.0,56.50,-100.0,90.91,-100.00,90.91,-100.00,-100.00,90.91,0,-0.514703
3,2014-09-07,2014,1,REG,Chicago Bears,Buffalo Bills,20,23,Jay Cutler,EJ Manuel,,,,,,,,,,,,,,,,,Chicago Bears,Buffalo Bills,20,23,43,-270.0,239.0,-7.0,7.0,47.0,under,3,-3,away,0.729730,0.294985,away,-100.00,239.0,-100.00,239.0,-100.00,90.91,-100.00,90.91,-100.00,90.91,0,0.302253
4,2014-09-07,2014,1,REG,Pittsburgh Steelers,Cleveland Browns,30,27,Ben Roethlisberger,Brian Hoyer,,,,,,,,,,,,,,,,,Pittsburgh Steelers,Cleveland Browns,30,27,57,-242.0,216.0,-5.5,5.5,41.5,over,-3,3,away,0.707602,0.316456,home,41.32,-100.0,41.32,-100.0,-100.00,90.91,-100.00,90.91,90.91,-100.00,1,-0.981535
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2402,2023-01-22,2022,20,POST,Buffalo Bills,Cincinnati Bengals,10,27,Josh Allen,Joe Burrow,-0.151187,0.650933,0.483813,0.589479,-0.215688,0.002339,-0.152032,-0.140611,1.049708,0.204819,-0.153360,0.421317,-0.011858,0.274171,-0.007230,0.889427,Buffalo Bills,Cincinnati Bengals,10,27,37,-260.0,215.0,-6.0,6.0,48.5,under,17,-17,away,0.722222,0.317460,away,-100.00,215.0,-100.00,215.0,-100.00,90.91,-100.00,90.91,-100.00,90.91,0,0.652377
2403,2023-01-22,2022,20,POST,San Francisco 49ers,Dallas Cowboys,19,12,Brock Purdy,Dak Prescott,1.271880,0.634522,0.303955,0.348498,0.331705,0.563132,-0.743069,-0.104332,0.297954,-0.500316,0.342732,0.301391,-0.247412,0.812579,-0.969892,-0.382391,San Francisco 49ers,Dallas Cowboys,19,12,31,-200.0,170.0,-3.5,3.5,46.5,under,-7,7,home,0.666667,0.370370,home,50.00,-100.0,50.00,-100.0,90.91,-100.00,90.91,-100.00,-100.00,90.91,0,0.185545
2404,2023-01-29,2022,21,POST,Philadelphia Eagles,San Francisco 49ers,31,7,Jalen Hurts,Brock Purdy,0.131127,1.616761,0.742787,-0.131826,0.329906,,,,,,0.560610,,0.672368,,-0.941733,,Philadelphia Eagles,San Francisco 49ers,31,7,38,-155.0,135.0,-3.0,3.0,45.0,under,-24,24,home,0.607843,0.425532,home,64.52,-100.0,64.52,-100.0,90.91,-100.00,90.91,-100.00,-100.00,90.91,0,-0.164579
2405,2023-01-29,2022,21,POST,Kansas City Chiefs,Cincinnati Bengals,23,20,Patrick Mahomes,Joe Burrow,0.474375,0.512161,0.781146,-0.771093,-0.504759,0.363199,0.164351,-0.080416,0.555210,0.215284,-0.141815,0.625788,-0.449315,0.520941,-0.018159,0.405253,Kansas City Chiefs,Cincinnati Bengals,23,20,43,-130.0,110.0,-2.0,2.0,48.5,under,-3,3,home,0.565217,0.476190,home,76.92,-100.0,76.92,-100.0,90.91,-100.00,90.91,-100.00,-100.00,90.91,0,0.652377


In [8]:
# Persist the modeling base table. The DataFrame index is written as well
# (to_csv default), so it appears as an extra leading column in the file.
nfl_cleaned_with_betting_final.to_csv(
    path_or_buf='../../data/adjusted_stats_over_under_modeling_base_df.csv'
)

In [9]:
# Season-based split: every season up to and including 2021 is the training
# set (also used for cross-validated hyperparameter tuning); the 2022 season
# is held out as the test set.
# dropna() discards any game that has a missing value in any column.
train_df = nfl_cleaned_with_betting_final.loc[
    nfl_cleaned_with_betting_final.season.le(2021)
].dropna()

test_df = nfl_cleaned_with_betting_final.loc[
    nfl_cleaned_with_betting_final.season.eq(2022)
].dropna()

In [10]:
# NOTE
# Don't know if I need to balance this data set at all; point spread betting should be pretty 50/50


# Balance training data set

# Note: Can't balance data set and then do Cross Validation due to data leakage

# Fully balancing the data set doesn't work as well for prediction as the unbalanced one, likely because the 
# home team does actually have an important advantage. But maybe balancing the data set slightly will help

# balance_n = int(np.round((sum(train_df.home_win==1) - sum(train_df.home_win==0))/1.5))
# to_add = train_df[train_df.home_win==0].sample(n=balance_n, random_state=57, replace=True)

# train_df_balanced = pd.concat([train_df, to_add])
# train_df_balanced

In [17]:
# Randomly reorder the training rows before cross validation.
# The seed is fixed, so the resulting order is the same on every run.
train_df_shuffled = train_df.sample(
    frac=1,  # keep every row, only the order changes
    random_state=57,
)
train_df_shuffled.head(5)

Unnamed: 0,date,season,week,season_type,home,away,home_score_x,away_score_x,home_qb,away_qb,passing_value_adjusted_home,rushing_value_adjusted_home,pass_def_value_adjusted_home,rush_def_value_adjusted_home,special_teams_value_home,passing_value_adjusted_away,rushing_value_adjusted_away,pass_def_value_adjusted_away,rush_def_value_adjusted_away,special_teams_value_away,total_possession_time_standardized_home,total_possession_time_standardized_away,total_plays_standardized_home,total_plays_standardized_away,pass_percentage_standardized_home,pass_percentage_standardized_away,home_team,away_team,home_score_y,away_score_y,total_score_actual,home_moneyline,away_moneyline,home_spread,away_spread,total_score_line,over_under_result,home_team_actual_line,away_team_actual_line,spread_cover_result,home_implied_prob,away_implied_prob,game_winner,favorite_return,underdog_return,home_return,away_return,favorite_spread_return,underdog_spread_return,home_spread_return,away_spread_return,over_return,under_return,over_hits,total_score_standardized
1748,2020-12-06,2020,13,REG,Atlanta Falcons,New Orleans Saints,16,21,Matt Ryan,Taysom Hill,-0.183432,-0.296139,0.916688,0.860224,0.421729,0.048574,0.457952,0.95007,0.665164,-0.068764,0.511634,0.749438,0.644326,-0.213761,0.179217,-1.607503,Atlanta Falcons,New Orleans Saints,16,21,37,130.0,-152.0,2.5,-2.5,46.0,under,5,-5,away,0.434783,0.603175,away,65.79,-100.0,-100.0,65.79,90.91,-100.0,-100.0,90.91,-100.0,90.91,0,0.068837
747,2016-12-26,2016,16,REG,Dallas Cowboys,Detroit Lions,42,21,Dak Prescott,Matthew Stafford,-0.398258,0.589516,0.391552,0.596753,0.316956,0.12474,-0.829658,-0.054792,0.101355,0.149473,0.154835,0.368244,-0.067942,-0.067942,-0.504604,0.585007,Dallas Cowboys,Detroit Lions,42,21,63,-279.0,246.0,-6.5,6.5,46.5,over,-21,21,home,0.736148,0.289017,home,35.84,-100.0,35.84,-100.0,90.91,-100.0,90.91,-100.0,90.91,-100.0,1,0.185545
1716,2020-11-16,2020,10,REG,Chicago Bears,Minnesota Vikings,13,19,Nick Foles,Kirk Cousins,-0.651132,-0.834892,0.216306,1.07583,0.243338,0.90195,0.397899,-0.347116,0.350242,-1.053845,0.309398,-0.478874,0.537766,-0.931638,1.057493,-1.270606,Chicago Bears,Minnesota Vikings,13,19,32,155.0,-175.0,3.0,-3.0,43.5,under,6,-6,away,0.392157,0.636364,away,57.14,-100.0,-100.0,57.14,90.91,-100.0,-100.0,90.91,-100.0,90.91,0,-0.514703
346,2015-10-25,2015,7,REG,Kansas City Chiefs,Pittsburgh Steelers,23,13,Alex Smith,Landry Jones,-0.035699,-0.592039,-0.553563,0.295488,-0.11279,2.853308,-0.14688,0.441935,0.752067,0.043853,-0.340512,-0.330643,-0.185719,-0.998939,0.61221,-1.647761,Kansas City Chiefs,Pittsburgh Steelers,23,13,36,-208.0,186.0,-4.0,4.0,41.5,under,-10,10,home,0.675325,0.34965,home,48.08,-100.0,48.08,-100.0,90.91,-100.0,90.91,-100.0,-100.0,90.91,0,-0.981535
439,2015-12-06,2015,13,REG,Pittsburgh Steelers,Indianapolis Colts,45,10,Ben Roethlisberger,Matt Hasselbeck,0.20989,0.867439,-0.60122,0.517635,-0.321733,0.418025,-1.302675,0.159478,0.545372,0.699043,0.054276,0.334166,0.790145,0.341472,1.277481,0.168159,Pittsburgh Steelers,Indianapolis Colts,45,10,55,-450.0,385.0,-10.0,10.0,50.5,over,-35,35,home,0.818182,0.206186,home,22.22,-100.0,22.22,-100.0,90.91,-100.0,90.91,-100.0,90.91,-100.0,1,1.119209


In [48]:
# Columns fed to the models, in the order they appear in the feature matrix.
feature_list = [
    # adjusted value columns, home side
    'passing_value_adjusted_home',
    'rushing_value_adjusted_home',
    'pass_def_value_adjusted_home',
    'rush_def_value_adjusted_home',
    'special_teams_value_home',
    # adjusted value columns, away side
    'passing_value_adjusted_away',
    'rushing_value_adjusted_away',
    'pass_def_value_adjusted_away',
    'rush_def_value_adjusted_away',
    'special_teams_value_away',
    # standardized possession time, play count and pass percentage
    'total_possession_time_standardized_home',
    'total_possession_time_standardized_away',
    'total_plays_standardized_home',
    'total_plays_standardized_away',
    'pass_percentage_standardized_home',
    'pass_percentage_standardized_away',
    # the raw (unstandardized) total line
    'total_score_line',
]

In [49]:
# Get features and labels for the train and test sets.
#
# Both feature matrices are converted to plain NumPy arrays. The models in
# this notebook are fitted on train_x (an array); handing them a DataFrame at
# prediction time would make scikit-learn warn that X has feature names the
# estimator was not fitted with, so test_x uses the same representation.
train_x = train_df_shuffled[feature_list].to_numpy()
train_y = train_df_shuffled.over_hits.to_numpy()

test_x = test_df[feature_list].to_numpy()
test_y = test_df.over_hits.to_numpy()

In [50]:
# Quick look at the week column of the shuffled training rows.
train_df_shuffled['week']

1748    13
747     16
1716    10
346      7
439     13
        ..
1293    18
505     18
1212    13
2068    16
1152     8
Name: week, Length: 1801, dtype: int64

In [51]:
# Function to get predictions and probabilities for the train and test sets

def get_preds(model, train_x, test_x):
    """Run a fitted model on the train and test features.

    For each feature set, ``model.predict`` is called first and
    ``model.predict_proba`` second; the train set is processed before the
    test set.

    Returns
    -------
    tuple
        ``(train_preds, train_probs, test_preds, test_probs)``.
    """
    outputs = []
    for features in (train_x, test_x):
        outputs.append(model.predict(features))
        outputs.append(model.predict_proba(features))
    return tuple(outputs)

# Function to get accuracy scores for train, val sets

def print_cv_results(model, train_x, train_y, cv_folds, verbose=True):
    """Cross-validate ``model`` and print per-fold and average scores.

    Parameters
    ----------
    model
        Estimator handed unchanged to ``cross_validate``.
    train_x, train_y
        Features and labels handed to ``cross_validate``.
    cv_folds
        Forwarded as the ``cv`` argument of ``cross_validate``.
    verbose : bool, default True
        When True, one line per fold is printed before the averages.
        The two average lines are always printed.

    Returns
    -------
    tuple
        ``(mean train score, mean validation score)`` across the folds.
        Scores come from ``cross_validate``'s default scoring; the printed
        messages label them as accuracy.
    """
    # Honor the caller's fold count; it was previously ignored in favor of a
    # hard-coded cv=5.
    cv_results = cross_validate(model, train_x, train_y, cv=cv_folds, return_train_score=True)

    train_scores = cv_results['train_score']
    val_scores = cv_results['test_score']

    if verbose:
        for i, scores in enumerate(zip(train_scores, val_scores)):
            print('Fold {}, Train Accuracy: {}, Validation Accuracy: {}'.format(i+1,scores[0], scores[1]))
        print()

    mean_train_score = np.mean(train_scores)
    mean_val_score = np.mean(val_scores)

    print('Average Training Accuracy: {}'.format(mean_train_score))
    print('Average Validation Accuracy: {}'.format(mean_val_score))

    return mean_train_score, mean_val_score

In [52]:
# Baseline: logistic regression with default settings, evaluated by
# cross validation on the training set (5 folds requested).
lr_model = LogisticRegression()

print_cv_results(model=lr_model, train_x=train_x, train_y=train_y, cv_folds=5)

Fold 1, Train Accuracy: 0.5416666666666666, Validation Accuracy: 0.5207756232686981
Fold 2, Train Accuracy: 0.5482303955586398, Validation Accuracy: 0.525
Fold 3, Train Accuracy: 0.546148507980569, Validation Accuracy: 0.49722222222222223
Fold 4, Train Accuracy: 0.5607217210270645, Validation Accuracy: 0.5
Fold 5, Train Accuracy: 0.5482303955586398, Validation Accuracy: 0.5

Average Training Accuracy: 0.548999537358316
Average Validation Accuracy: 0.5085995690981842


(0.548999537358316, 0.5085995690981842)

In [53]:
# Baseline: random forest with default settings (no hyperparameter tuning),
# evaluated by cross validation on the training set (5 folds requested).
rf_model = RandomForestClassifier()

print_cv_results(model=rf_model, train_x=train_x, train_y=train_y, cv_folds=5)

Fold 1, Train Accuracy: 1.0, Validation Accuracy: 0.5124653739612188
Fold 2, Train Accuracy: 1.0, Validation Accuracy: 0.525
Fold 3, Train Accuracy: 1.0, Validation Accuracy: 0.5527777777777778
Fold 4, Train Accuracy: 1.0, Validation Accuracy: 0.4888888888888889
Fold 5, Train Accuracy: 1.0, Validation Accuracy: 0.5194444444444445

Average Training Accuracy: 1.0
Average Validation Accuracy: 0.519715297014466


(1.0, 0.519715297014466)

In [54]:
# Baseline: XGBoost classifier with default settings (no hyperparameter tuning),
# evaluated by cross validation on the training set (5 folds requested).

# NOTE: this installs a global "ignore" filter, so every Python warning is
# silenced from this cell onwards, not only the ones raised here.
import warnings
warnings.filterwarnings(action='ignore')

xgb_model = xgb.XGBClassifier(verbosity=0)

print_cv_results(model=xgb_model, train_x=train_x, train_y=train_y, cv_folds=5)

Fold 1, Train Accuracy: 1.0, Validation Accuracy: 0.5734072022160664
Fold 2, Train Accuracy: 1.0, Validation Accuracy: 0.5444444444444444
Fold 3, Train Accuracy: 1.0, Validation Accuracy: 0.5416666666666666
Fold 4, Train Accuracy: 1.0, Validation Accuracy: 0.5083333333333333
Fold 5, Train Accuracy: 1.0, Validation Accuracy: 0.5583333333333333

Average Training Accuracy: 1.0
Average Validation Accuracy: 0.5452369959987688


(1.0, 0.5452369959987688)

In [28]:
# Logistic Regression, with hyperparameter tuning

def logistic_regression_tuning(train_x, train_y, verbose=True):
    """Grid-search LogisticRegression hyperparameters and print the results.

    The search covers solver, penalty and C (plus l1_ratio for the elastic-net
    penalty). Only solver/penalty pairs that LogisticRegression supports are
    searched; the previous single cross-product grid also contained
    unsupported pairs (for example lbfgs with l1) and elastic-net without an
    l1_ratio, whose fits can only fail.

    Parameters
    ----------
    train_x, train_y
        Features and labels passed to ``GridSearchCV.fit``.
    verbose : bool, default True
        When True, the mean train/validation score of every parameter
        combination is printed after the best result.

    Returns
    -------
    None
        Results are printed only.
    """
    lr_model = LogisticRegression()

    # Hyperparameters to tune
    c_values = [100, 10, 1.0, 0.1, 0.01]
    # Mixing ratios tried for the elastic-net penalty, which requires l1_ratio.
    l1_ratios = [0.25, 0.5, 0.75]

    # One sub-grid per penalty, listing only the solvers that support it.
    # NOTE(review): the 'none' spelling is kept from the original grid; newer
    # scikit-learn releases spell the unpenalized option as None -- confirm
    # against the installed version.
    params = [
        # C has no effect without a penalty, so it is not varied here.
        dict(solver=['newton-cg', 'lbfgs', 'sag', 'saga'], penalty=['none']),
        dict(solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], penalty=['l2'], C=c_values),
        dict(solver=['liblinear', 'saga'], penalty=['l1'], C=c_values),
        dict(solver=['saga'], penalty=['elasticnet'], l1_ratio=l1_ratios, C=c_values),
    ]

    clf = GridSearchCV(lr_model, params, return_train_score=True).fit(train_x, train_y)

    print('Best Result: {}'.format(clf.best_score_))
    print('Best Parameters: {}'.format(clf.best_params_))
    print()

    train_scores = clf.cv_results_['mean_train_score']
    val_scores = clf.cv_results_['mean_test_score']
    param_list = clf.cv_results_['params']

    if verbose:
        print('Parameter Combinations and Results:')
        for train_score, val_score, combo in zip(train_scores, val_scores, param_list):
            print('Train Score: {}, Val Score: {}, Parameters: {}'.format(train_score, val_score, combo))

In [29]:
# Can uncomment and run below to see, but hyperparameter tuning didn't change much. Can probably just roll with the 
# default logistic regression

# logistic_regression_tuning(train_x, train_y)

In [38]:
# Random Forest, with hyperparameter tuning

def random_forest_tuning(train_x, train_y, verbose=True):
    """Grid-search RandomForestClassifier hyperparameters and print the results.

    The grid is the full cross-product of n_estimators, criterion and
    max_depth. The best score and parameters are always printed; with
    ``verbose`` the mean train/validation score of every combination is
    printed as well. Nothing is returned.
    """
    # Hyperparameters to tune
    search_space = {
        'n_estimators': [25, 50, 75, 100, 125, 150, 200, 300, 400, 500, 1000],
        'criterion': ['gini', 'entropy', 'log_loss'],
        'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None],
    }

    search = GridSearchCV(RandomForestClassifier(), search_space, return_train_score=True)
    search = search.fit(train_x, train_y)

    print('Best Result: {}'.format(search.best_score_))
    print('Best Parameters: {}'.format(search.best_params_))
    print()

    outcome = search.cv_results_
    per_combination = zip(
        outcome['mean_train_score'],
        outcome['mean_test_score'],
        outcome['params'],
    )

    if not verbose:
        return

    print('Parameter Combinations and Results:')
    for fit_score, held_out_score, combo in per_combination:
        print('Train Score: {}, Val Score: {}, Parameters: {}'.format(fit_score, held_out_score, combo))

In [39]:
random_forest_tuning(train_x, train_y, verbose = True)

Best Result: 0.5291489689135118
Best Parameters: {'criterion': 'log_loss', 'max_depth': None, 'n_estimators': 1000}

Parameter Combinations and Results:
Train Score: 0.5664918266635824, Val Score: 0.4958248691905201, Parameters: {'criterion': 'gini', 'max_depth': 1, 'n_estimators': 25}
Train Score: 0.5655201827434653, Val Score: 0.5013942751615882, Parameters: {'criterion': 'gini', 'max_depth': 1, 'n_estimators': 50}
Train Score: 0.5791205952656334, Val Score: 0.48250692520775623, Parameters: {'criterion': 'gini', 'max_depth': 1, 'n_estimators': 75}
Train Score: 0.5706554090523556, Val Score: 0.4925115420129271, Parameters: {'criterion': 'gini', 'max_depth': 1, 'n_estimators': 100}
Train Score: 0.5707915028144036, Val Score: 0.5047260695598645, Parameters: {'criterion': 'gini', 'max_depth': 1, 'n_estimators': 125}
Train Score: 0.5688494679620634, Val Score: 0.4991582025238535, Parameters: {'criterion': 'gini', 'max_depth': 1, 'n_estimators': 150}
Train Score: 0.578289478757036, Val Sco

In [62]:
# XG Boost, with hyperparameter tuning

def xg_boost_tuning(train_x, train_y, verbose=True):
    """Grid-search XGBClassifier sampling hyperparameters and print the results.

    Only ``subsample`` and ``colsample_bytree`` are searched. The best score
    and parameters are always printed; with ``verbose`` the mean
    train/validation score of every combination is printed as well.
    Nothing is returned.
    """
    # Hyperparameters to tune.
    # Candidate values for eta, max_depth and gamma are kept for reference
    # but are not part of the grid:
    #   eta:       [.1, .2, .3, .5]
    #   max_depth: [0, 2, 3, 4, 5, 6, 7, 8, 10]
    #   gamma:     [0, 1, 2, 3, 5]
    search_space = {
        'subsample': [.25, .5, .75, 1],
        'colsample_bytree': [.25, .5, .75, 1],
    }

    search = GridSearchCV(xgb.XGBClassifier(), search_space, return_train_score=True)
    search = search.fit(train_x, train_y)

    print('Best Result: {}'.format(search.best_score_))
    print('Best Parameters: {}'.format(search.best_params_))
    print()

    outcome = search.cv_results_
    per_combination = zip(
        outcome['mean_train_score'],
        outcome['mean_test_score'],
        outcome['params'],
    )

    if not verbose:
        return

    print('Parameter Combinations and Results:')
    for fit_score, held_out_score, combo in per_combination:
        print('Train Score: {}, Val Score: {}, Parameters: {}'.format(fit_score, held_out_score, combo))

In [63]:
xg_boost_tuning(train_x, train_y, verbose = True)

Best Result: 0.5452369959987688
Best Parameters: {'colsample_bytree': 1, 'subsample': 1}

Parameter Combinations and Results:
Train Score: 0.9501662618551932, Val Score: 0.48915666358879656, Parameters: {'colsample_bytree': 0.25, 'subsample': 0.25}
Train Score: 1.0, Val Score: 0.5263527239150507, Parameters: {'colsample_bytree': 0.25, 'subsample': 0.5}
Train Score: 1.0, Val Score: 0.502488457987073, Parameters: {'colsample_bytree': 0.25, 'subsample': 0.75}
Train Score: 1.0, Val Score: 0.525831024930748, Parameters: {'colsample_bytree': 0.25, 'subsample': 1}
Train Score: 0.9655743503739688, Val Score: 0.5030563250230841, Parameters: {'colsample_bytree': 0.5, 'subsample': 0.25}
Train Score: 1.0, Val Score: 0.5252693136349645, Parameters: {'colsample_bytree': 0.5, 'subsample': 0.5}
Train Score: 1.0, Val Score: 0.5363604185903355, Parameters: {'colsample_bytree': 0.5, 'subsample': 0.75}
Train Score: 1.0, Val Score: 0.5402446906740536, Parameters: {'colsample_bytree': 0.5, 'subsample': 1}
T

In [30]:
# Fit the chosen logistic regression (default settings) on the whole
# training set and write it to disk with joblib.
best_lr_model = LogisticRegression()
best_lr_model.fit(train_x, train_y)

dump(best_lr_model, 'saved_models/ou_logistic_regression_av.joblib')

['saved_models/ou_logistic_regression_av.joblib']

In [40]:
# Fit the chosen random forest on the whole training set and write it to
# disk with joblib. No random_state is set, so each refit can differ.
best_rf_model = RandomForestClassifier(
    criterion='log_loss',
    max_depth=None,
    n_estimators=1000,
)
best_rf_model.fit(train_x, train_y)

dump(best_rf_model, 'saved_models/ou_random_forest_av.joblib')

['saved_models/ou_random_forest_av.joblib']

In [66]:
# Fit the chosen XGBoost classifier on the whole training set and write it
# to disk with joblib.
best_xgb_model = xgb.XGBClassifier(
    eta=.3,
    gamma=0,
    max_depth=6,
    colsample_bytree=1,
)
best_xgb_model.fit(train_x, train_y)

dump(best_xgb_model, 'saved_models/ou_xg_boost_av.joblib')

['saved_models/ou_xg_boost_av.joblib']