In [5]:
import os
import pandas as  pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import KFold
import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score


### File Retrieval and Preprocessing

In [3]:
files = os.listdir('../final_stats/preprocessed_3/')
files[:2]

def split_target(df, target_col='away_winner_wts'):
    df = df.copy()
    target = df.pop(target_col)
    return df, target

def normalize_df(df):
    scaler = StandardScaler()
    data, target = split_target(df)
    data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)
    return data, target

def get_data(file):
    df = pd.read_csv(f'../final_stats/preprocessed_3/{file}', index_col=[0])
    data, target = normalize_df(df)
    return data, target


In [4]:
print(files[0])
x, y = get_data(files[0])
x.head()

preprocessed_no_sent_last_1.csv


Unnamed: 0,home_score_team,home_score_opp,home_pass_cmp_off,home_pass_att_off,home_pass_yds_off,home_pass_tds_off,home_sacks_off,home_sacks_yds_off,home_pass_qb_rate,home_rush_att_off,...,away_ravens,away_saints,away_seahawks,away_steelers,away_texans,away_titans,away_vikings,day_Sat,day_Sun,day_Thu
0,0.50196,-0.536497,0.435998,-0.491097,1.289698,1.442867,0.334398,0.102167,1.927174,0.487826,...,-0.199007,-0.199007,-0.171499,-0.199007,-0.199007,-0.139347,-0.171499,-0.454859,0.603023,-0.223607
1,0.092595,0.965694,1.0947,1.071484,1.239249,1.442867,-0.798241,-0.577745,1.175481,-0.284361,...,-0.199007,-0.199007,-0.171499,-0.199007,-0.199007,-0.139347,-0.171499,-0.454859,0.603023,-0.223607
2,1.730057,-0.536497,0.765349,0.029763,0.987003,1.442867,-0.798241,-0.728836,1.99235,0.616523,...,-0.199007,-0.199007,-0.171499,-0.199007,5.024938,-0.139347,-0.171499,-0.454859,0.603023,-0.223607
3,0.092595,-1.287593,-1.046082,-0.751527,-0.337289,-1.362708,-1.364561,-1.18211,-0.458258,1.517408,...,-0.199007,-0.199007,-0.171499,-0.199007,-0.199007,-0.139347,-0.171499,-0.454859,0.603023,-0.223607
4,-1.954234,-0.965694,-0.058029,0.029763,-0.61476,-1.362708,0.900718,0.857624,-1.618386,-1.44264,...,-0.199007,-0.199007,-0.171499,-0.199007,-0.199007,-0.139347,-0.171499,-0.454859,0.603023,-0.223607


### Repeated K-Fold Cross Validation
Limited in dataset so want to run a few times
Grid search of the learning rate and the n-estimators perameter
Hold out test set of 0.2 

In [52]:
def get_xgb_clf(depth, gamma, eta, l=1, a=0, s=1):
    return XGBClassifier(
        objective='binary:logistic',
        max_depth=depth, 
        gamma=gamma, 
        eta=eta, 
        reg_alpha=a, 
        reg_lambda=l,
        subsample=s,
        random_state=99
    )

def grid_search(f, etas, gammas, max_depths):
    X,y = get_data(f)
    max_score = {
        'scores_mean': 0

    }
    
    all_scores = []
    total_iter = len(etas)*len(gammas)*len(max_depths)
    curr_iter = 0
    for e in etas:
        for g in gammas:
            for depth in max_depths:
                curr_iter = curr_iter +1

                clf = get_xgb_clf(depth, g, e)

                cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=1)

                scores = cross_val_score(
                    clf, 
                    X, 
                    y, 
                    scoring='accuracy', 
                    cv=cv, 
                    n_jobs=-1
                )
                print(f'Iteration {curr_iter}/{total_iter} for {f}: {scores.mean()}')

                scores_dict = {
                    'scores_mean': scores.mean(),
                    'scores_std': scores.std(),
                    'eta': e,
                    'depth': depth,
                    'gamma': g,
                    'scores': scores,
                    'file': f
                }
                all_scores.append(scores_dict)

                if scores.mean() > max_score['scores_mean']:
                    max_score = scores_dict
    return all_scores, max_score


def grid_search2(f, etas, gammas, max_depths, lambdas, alphas, subsamples):
    X,y = get_data(f)
    max_score = {
        'scores_mean': 0

    }
    
    all_scores = []
    total_iter = len(etas)*len(gammas)*len(max_depths)*len(lambdas)*len(alphas)*len(subsamples)
    curr_iter = 0
    for l in lambdas:
        for a in alphas:
            for s in subsamples: 
                for e in etas:
                    for g in gammas:
                        for depth in max_depths:
                            curr_iter = curr_iter +1

                            clf = get_xgb_clf(depth, g, e, l, a, s)

                            cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=1)

                            scores = cross_val_score(
                                clf, 
                                X, 
                                y, 
                                scoring='accuracy', 
                                cv=cv, 
                                n_jobs=-1
                            )
                            print(f'Iteration {curr_iter}/{total_iter} for {f}: {scores.mean()}')

                            scores_dict = {
                                'scores_mean': scores.mean(),
                                'scores_std': scores.std(),
                                'eta': e,
                                'depth': depth,
                                'gamma': g,
                                'alpha': a,
                                'lambda': l,
                                'subsample': s,
                                'scores': scores,
                                'file': f
                            }
                            all_scores.append(scores_dict)

                            if scores.mean() > max_score['scores_mean']:
                                max_score = scores_dict
    return all_scores, max_score


In [31]:
gammas = [0, 0.3, 0.4, 0.5, 0.6, 0.75]
etas = [0.1, 0.25, 0.4, 0.5, 0.75, 0.8, 1]
max_depths = list(range(1,3,1))

x,y = get_data(files[0])

all_scores, max_score = grid_search(files[0], etas, gammas, max_depths)

Iteration 1/120 for preprocessed_no_sent_last_1.csv: 0.4613636363636364
Iteration 2/120 for preprocessed_no_sent_last_1.csv: 0.47872727272727283
Iteration 3/120 for preprocessed_no_sent_last_1.csv: 0.48745454545454536
Iteration 4/120 for preprocessed_no_sent_last_1.csv: 0.4818181818181817
Iteration 5/120 for preprocessed_no_sent_last_1.csv: 0.4613636363636364
Iteration 6/120 for preprocessed_no_sent_last_1.csv: 0.47872727272727283
Iteration 7/120 for preprocessed_no_sent_last_1.csv: 0.48490909090909085
Iteration 8/120 for preprocessed_no_sent_last_1.csv: 0.48954545454545445
Iteration 9/120 for preprocessed_no_sent_last_1.csv: 0.4613636363636364
Iteration 10/120 for preprocessed_no_sent_last_1.csv: 0.48254545454545467
Iteration 11/120 for preprocessed_no_sent_last_1.csv: 0.4846363636363636
Iteration 12/120 for preprocessed_no_sent_last_1.csv: 0.48754545454545456
Iteration 13/120 for preprocessed_no_sent_last_1.csv: 0.4613636363636364
Iteration 14/120 for preprocessed_no_sent_last_1.csv:

In [32]:
def get_best_params(all_scores):
    ms = max(all_scores, key=lambda x:x['scores_mean'])
    print(f"Accuracy: {round(ms['scores_mean']*100,2)}%")
    print(f"Best Paramaters: \neta: {ms['eta']} \t gamma: {ms['gamma']} \t depth: {ms['depth']}")

In [33]:
get_best_params(all_scores)

Accuracy: 53.77%
Best Paramaters: 
eta: 0.5 	 gamma: 3 	 depth: 1


Best results prelim grid search for no sent last 1 game  
Accuracy: 54.02%  
Best Paramaters:   
eta: 0.5  
gamma: 2   
depth: 1

In [34]:
no_sent_files = files[:7]
no_sent_files

['preprocessed_no_sent_last_1.csv',
 'preprocessed_no_sent_last_3.csv',
 'preprocessed_no_sent_last_3_wt.csv',
 'preprocessed_no_sent_last_7.csv',
 'preprocessed_no_sent_last_7_wt.csv',
 'preprocessed_no_sent_last_ssn.csv',
 'preprocessed_no_sent_last_ssn_wt.csv']

In [54]:

def all_datasets(files, gammas, etas, depths):
    results = []
    for f in files:
        x,y = get_data(f)
        all_scores, max_score = grid_search(f, etas, gammas, depths)
        results.append(max_score)
        df = pd.DataFrame.from_records([max_score], index = 'file')
        df.to_csv(f'../results/xgb/results_{f}')
    return results

def all_datasets_2(files, gammas, etas, depths, alphas, lambdas, subsamples):
    results = []
    for f in files:
        x,y = get_data(f)
        all_scores, max_score = grid_search2(f, etas, gammas, depths,lambdas, alphas, subsamples )
        results.append(max_score)
        df = pd.DataFrame.from_records([max_score], index = 'file')
        df.to_csv(f'../results/xgb/results_sent{f}')
    return results

In [44]:
gammas = [0, 0.5, 0.75, 1, 3, 5, 10]
etas = [0.01, 0.5, 0.1, 0.25, 0.5, 0.75, 1]
max_depths = list(range(1,3,1))
reg_lambdas = [0, 0.5, 1, 1.5, 2]
reg_alphas = [0, 0.5, 1, 1.5, 2]
subsample =[0.5, 0.8, 0.9, 1]

no_sent_results = all_datasets(no_sent_files, gammas, etas, max_depths)

Iteration 1/98 for preprocessed_no_sent_last_1.csv: 0.4613636363636364
Iteration 2/98 for preprocessed_no_sent_last_1.csv: 0.47872727272727283
Iteration 3/98 for preprocessed_no_sent_last_1.csv: 0.4613636363636364
Iteration 4/98 for preprocessed_no_sent_last_1.csv: 0.47872727272727283
Iteration 5/98 for preprocessed_no_sent_last_1.csv: 0.4613636363636364
Iteration 6/98 for preprocessed_no_sent_last_1.csv: 0.48163636363636364
Iteration 7/98 for preprocessed_no_sent_last_1.csv: 0.4613636363636364
Iteration 8/98 for preprocessed_no_sent_last_1.csv: 0.48254545454545467
Iteration 9/98 for preprocessed_no_sent_last_1.csv: 0.4613636363636364
Iteration 10/98 for preprocessed_no_sent_last_1.csv: 0.4740909090909091
Iteration 11/98 for preprocessed_no_sent_last_1.csv: 0.46218181818181825
Iteration 12/98 for preprocessed_no_sent_last_1.csv: 0.4385454545454545
Iteration 13/98 for preprocessed_no_sent_last_1.csv: 0.3758181818181818
Iteration 14/98 for preprocessed_no_sent_last_1.csv: 0.374
Iteration

In [45]:
df_no_sent_results = pd.DataFrame.from_records(no_sent_results, index ='file').sort_values('scores_mean', ascending=False)
df_no_sent_results

Unnamed: 0_level_0,scores_mean,scores_std,eta,depth,gamma,scores
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
preprocessed_no_sent_last_1.csv,0.539455,0.13747,0.75,1,3.0,"[0.45454545454545453, 0.7272727272727273, 0.54..."
preprocessed_no_sent_last_7.csv,0.521636,0.133558,1.0,1,0.5,"[0.45454545454545453, 0.5454545454545454, 0.63..."
preprocessed_no_sent_last_7_wt.csv,0.521636,0.133558,1.0,1,0.5,"[0.45454545454545453, 0.5454545454545454, 0.63..."
preprocessed_no_sent_last_ssn.csv,0.514909,0.132948,1.0,2,0.0,"[0.5454545454545454, 0.45454545454545453, 0.72..."
preprocessed_no_sent_last_ssn_wt.csv,0.514909,0.132948,1.0,2,0.0,"[0.5454545454545454, 0.45454545454545453, 0.72..."
preprocessed_no_sent_last_3.csv,0.510545,0.134531,0.5,2,1.0,"[0.5454545454545454, 0.45454545454545453, 0.54..."
preprocessed_no_sent_last_3_wt.csv,0.510545,0.134531,0.5,2,1.0,"[0.5454545454545454, 0.45454545454545453, 0.54..."


In [46]:
df_no_sent_results.to_csv('../results/xgb/xgb_result_no_sent.csv')

In [8]:
param_grid = {
    "max_depth": [1, 2, 3],
    "n_estimators": [100, 200, 300, 500],
    "learning_rate": [0.05, 0.01],
    "gamma": [0],
    "reg_lambda": [0, 5, 10, 20],
    "scale_pos_weight": [10, 12, 15],
    "colsample_bytree": [0.5, 0.8, 1.0],
}

In [47]:
files_test = [f for f in files if 'last_1' in f]
files_test

['preprocessed_no_sent_last_1.csv',
 'preprocessed_sent_24_last_1.csv',
 'preprocessed_sent_96_last_1.csv',
 'preprocessed_sent_cross_last_1.csv']

In [60]:
gammas = [0, 0.1, 0.3, 0.5]
etas = [0.01, 0.05, 0.1, 0.3]
max_depths = list(range(1,3,1))
reg_lambdas = [0, 0.25]
reg_alphas = [0, 0.25 ]
subsamples =[0.9, 1]

sent_results = all_datasets_2(files_test, gammas, etas, max_depths, reg_alphas, reg_lambdas, subsamples)

Iteration 1/256 for preprocessed_no_sent_last_1.csv: 0.4689090909090909
Iteration 2/256 for preprocessed_no_sent_last_1.csv: 0.46490909090909094
Iteration 3/256 for preprocessed_no_sent_last_1.csv: 0.4689090909090909
Iteration 4/256 for preprocessed_no_sent_last_1.csv: 0.46590909090909094
Iteration 5/256 for preprocessed_no_sent_last_1.csv: 0.4689090909090909
Iteration 6/256 for preprocessed_no_sent_last_1.csv: 0.46590909090909094
Iteration 7/256 for preprocessed_no_sent_last_1.csv: 0.4689090909090909
Iteration 8/256 for preprocessed_no_sent_last_1.csv: 0.4638181818181819
Iteration 9/256 for preprocessed_no_sent_last_1.csv: 0.5114545454545454
Iteration 10/256 for preprocessed_no_sent_last_1.csv: 0.5066363636363637
Iteration 11/256 for preprocessed_no_sent_last_1.csv: 0.5114545454545454
Iteration 12/256 for preprocessed_no_sent_last_1.csv: 0.5087272727272727
Iteration 13/256 for preprocessed_no_sent_last_1.csv: 0.5114545454545454
Iteration 14/256 for preprocessed_no_sent_last_1.csv: 0.5

In [61]:
df_sent_results = pd.DataFrame.from_records(sent_results, index ='file').sort_values('scores_mean', ascending=False)
df_sent_results

Unnamed: 0_level_0,scores_mean,scores_std,eta,depth,gamma,alpha,lambda,subsample,scores
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
preprocessed_sent_cross_last_1.csv,0.578818,0.149794,0.1,1,0.0,0.0,0.0,1,"[0.5454545454545454, 0.7272727272727273, 0.727..."
preprocessed_no_sent_last_1.csv,0.566909,0.142507,0.1,1,0.0,0.0,0.0,1,"[0.5454545454545454, 0.7272727272727273, 0.454..."
preprocessed_sent_24_last_1.csv,0.566,0.153916,0.05,2,0.3,0.25,0.0,1,"[0.45454545454545453, 0.7272727272727273, 0.81..."
preprocessed_sent_96_last_1.csv,0.544909,0.135591,0.1,1,0.0,0.0,0.25,1,"[0.5454545454545454, 0.6363636363636364, 0.454..."


In [None]:
df_sent_results.to_csv('../results/xgb/xgb_resust_test.csv')

### Colab Resurts
Best Score: 0.5643518518518519
Best Hyperparameters: {'scale_pos_weight': 8, 'reg_lambda': 0, 'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.6}

In [None]:
final_cl = XGBClassifier(
    **grid_cv.best_params_,
    objective='binary_logistic',
    colsample_bytree=0.5
)