In [168]:
import pickle
from typing import List, Dict
import pandas as pd
import numpy as np
import xgboost
pd.set_option('display.max_columns', None)

### 1. Load dataset, create some new variables and inspect

In [207]:
""" 
==============
A. Prep dataset
==============
"""

# 1. Load the saved dataset.
# NOTE: pickle.load can execute arbitrary code; only point it at trusted files.
with open('../data/my_data/ds1.p', 'rb') as handle:
    df = pickle.load(handle)

# 2. Create extra fields
rs = ['DOK', 'MAS', 'MOR', 'PGH', 'POM', 'SAG']
# Binary target: 1 when team 1 outscored team 2, otherwise 0 (a tie maps to 0).
df['tm1win'] = np.where(df['Tm1Score'].gt(df['Tm2Score']), 1, 0)
# Numeric encoding of the location code: H -> 1, N -> 0.5, A -> 0.
df['tm1home'] = df['Tm1Loc'].replace({'H' : 1, 'N' : 0.5, 'A' : 0})

# 3. Quickly look at a subset of fields
viewme = df[[
    'Season', 'DayNum', 'Tm1ID', 'Tm2ID', 'TeamName_x', 'TeamName_y', 'gmCity', 'gmState',
    'dist_tm1', 'dist_tm2', 'tm1home',
    *(name + '_tm1' for name in rs),
    *(name + '_tm2' for name in rs),
    'Tm1Score', 'Tm2Score', 'tm1win',
]]
# Only the last expression of a cell is rendered, so the sample below is not shown.
viewme.sample(10)
viewme.columns

Index(['Season', 'DayNum', 'Tm1ID', 'Tm2ID', 'TeamName_x', 'TeamName_y',
       'gmCity', 'gmState', 'dist_tm1', 'dist_tm2', 'tm1home', 'DOK_tm1',
       'MAS_tm1', 'MOR_tm1', 'PGH_tm1', 'POM_tm1', 'SAG_tm1', 'DOK_tm2',
       'MAS_tm2', 'MOR_tm2', 'PGH_tm2', 'POM_tm2', 'SAG_tm2', 'Tm1Score',
       'Tm2Score', 'tm1win'],
      dtype='object')

### 2. Create train/test split

In [208]:
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

""" 
==============
A. Create train/test
==============
"""
# Seed the shuffle so the split, and every score computed from it below, is
# reproducible across kernel restarts. Without a random_state each run draws
# a different partition.
SPLIT_SEED = 42
train, test = train_test_split(df, random_state=SPLIT_SEED)

# Predictors: the two distance columns, the location encoding, and one
# '<code>_tm1' / '<code>_tm2' column per entry of `rs`.
x_vars = ['dist_tm1','dist_tm2','tm1home'] + [i + '_tm1' for i in rs] + [i + '_tm2' for i in rs]
y_var = 'tm1win'

# Training matrices used by every model cell; test_x / test_y are the hold-out.
x, y = train[x_vars], train[y_var]
test_x, test_y = test[x_vars], test[y_var]

['dist_tm1',
 'dist_tm2',
 'tm1home',
 'DOK_tm1',
 'MAS_tm1',
 'MOR_tm1',
 'PGH_tm1',
 'POM_tm1',
 'SAG_tm1',
 'DOK_tm2',
 'MAS_tm2',
 'MOR_tm2',
 'PGH_tm2',
 'POM_tm2',
 'SAG_tm2']

In [145]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler

### 3. Test Logistic Regressions

In [147]:
def testLogisticCV(x, y, pipe1, pipe2, **kwargs):
    if not base_model:
        base_model = LogisticRegressionCV(cv = 5, scoring = 'neg_log_loss', max_iter = 10000)
    
    model = make_pipeline(pipe1, pipe2, base_model)
    model.fit(x, y)
    print(f"Number of coefficients in model is {len(model[-1].coef_[0])}")
    
    print(f"-Selected C is {model[-1].C_}")
    best_score = max([float(sum(col))/len(col) for col in zip(*model[-1].scores_[1])])
    print(f"-Best score is {-best_score}\n")
    return model

# Baseline plus four preprocessing variants of the cross-validated logistic model.
# The first result is stored as `test0` instead of `test` so that it does not
# overwrite the held-out `test` DataFrame created by the train/test split.
# (A bare dict literal that used to sit here was evaluated and discarded.)
test0 = testLogisticCV(x, y, None, None)
test1 = testLogisticCV(x, y, None, MaxAbsScaler())
test2 = testLogisticCV(x, y, None, StandardScaler())
test3 = testLogisticCV(x, y, PolynomialFeatures(2), MaxAbsScaler())
test4 = testLogisticCV(x, y, PolynomialFeatures(2), StandardScaler())

Number of coefficients in model is 15
-Selected C is [0.04641589]
-Best score is -0.5363199771800611

Number of coefficients in model is 15
-Selected C is [2.7825594]
-Best score is -0.5361885958665942

Number of coefficients in model is 15
-Selected C is [0.04641589]
-Best score is -0.5361862635920144

Number of coefficients in model is 136
-Selected C is [0.35938137]
-Best score is -0.5348951931304966

Number of coefficients in model is 136
-Selected C is [0.04641589]
-Best score is -0.5350621140637838



In [199]:
from sklearn.model_selection import GridSearchCV

# Elastic-net logistic regression fitted with the 'saga' solver.
logArgs = dict(max_iter=10000, penalty='elasticnet', solver='saga')
model = LogisticRegression(**logArgs)

# 6 values of C x 5 values of l1_ratio = 30 candidates.
params = dict(
    C=[1, 0.5, 0.33333, 0.1, 0.05, 0.01],
    l1_ratio=[0.1, 0.25, 0.5, 0.75, 0.9],
)

grid_search = GridSearchCV(
    model,
    param_grid=params,
    scoring='neg_log_loss',
    n_jobs=4,
    cv=5,
    verbose=3,
    refit=True,
)
# NOTE(review): the search is the last pipeline step, so the polynomial
# expansion and scaler are fit on all of `x` before the CV folds are cut.
# Confirm that this is intended.
grid_pipe = make_pipeline(PolynomialFeatures(2), MaxAbsScaler(), grid_search)
grid_pipe.fit(x, y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 4/5] END ................C=1, l1_ratio=0.1;, score=-0.531 total time=  17.1s
[CV 3/5] END ................C=1, l1_ratio=0.1;, score=-0.531 total time=  18.3s
[CV 1/5] END ................C=1, l1_ratio=0.1;, score=-0.537 total time=  19.2s
[CV 2/5] END ................C=1, l1_ratio=0.1;, score=-0.541 total time=  19.4s
[CV 1/5] END ...............C=1, l1_ratio=0.25;, score=-0.537 total time=  20.6s
[CV 5/5] END ................C=1, l1_ratio=0.1;, score=-0.535 total time=  22.8s
[CV 2/5] END ...............C=1, l1_ratio=0.25;, score=-0.541 total time=  23.1s
[CV 3/5] END ...............C=1, l1_ratio=0.25;, score=-0.531 total time=  24.4s
[CV 4/5] END ...............C=1, l1_ratio=0.25;, score=-0.531 total time=  18.5s
[CV 5/5] END ...............C=1, l1_ratio=0.25;, score=-0.535 total time=  22.7s
[CV 1/5] END ................C=1, l1_ratio=0.5;, score=-0.537 total time=  20.8s
[CV 2/5] END ................C=1, l1_ratio=0.5;

Pipeline(steps=[('polynomialfeatures', PolynomialFeatures()),
                ('maxabsscaler', MaxAbsScaler()),
                ('gridsearchcv',
                 GridSearchCV(cv=5,
                              estimator=LogisticRegression(max_iter=10000,
                                                           penalty='elasticnet',
                                                           solver='saga'),
                              n_jobs=4,
                              param_grid={'C': [1, 0.5, 0.33333, 0.1, 0.05,
                                                0.01],
                                          'l1_ratio': [0.1, 0.25, 0.5, 0.75,
                                                       0.9]},
                              scoring='neg_log_loss', verbose=3))])

In [206]:
# Best logReg parameters:
# {'C': 0.33333, 'l1_ratio': 0.9}
print(grid_pipe.named_steps['gridsearchcv'].best_params_)
# Last expression of the cell: the best score found by the search.
grid_pipe.named_steps['gridsearchcv'].best_score_

{'C': 0.33333, 'l1_ratio': 0.9}


-0.5347910680294838

In [222]:
# Log loss of the fitted elastic-net pipeline on the held-out rows.
grid_test_proba = grid_pipe.predict_proba(test_x)
log_loss(test_y, grid_test_proba)

0.5352805002782844

### 4. Test Random Forest Model

In [158]:
from sklearn.ensemble import RandomForestClassifier

def testRFC(x, y, pipe1, pipe2, base = None, **kwargs):
    if not base:
        base = RandomForestClassifier(**kwargs)

    model1 = make_pipeline(pipe2, pipe2, base)
    scores = cross_val_score(model1, x, y, cv = 5, scoring = 'neg_log_loss')
    res = scores.sum() / len(scores)
    print(res)
    return model1

# 100 trees, sqrt(max_features): baseline plus four preprocessing variants.
rfc_args1 = {'n_estimators' : 100, 'min_samples_split' : 2, 'max_features' : 'sqrt'}
# Stored as `test0` rather than `test` so that the held-out `test` DataFrame
# from the train/test split is not overwritten by a pipeline object.
test0 = testRFC(x, y, None, None, **rfc_args1)
test1 = testRFC(x, y, None, StandardScaler(), **rfc_args1)
test2 = testRFC(x, y, None, MaxAbsScaler(), **rfc_args1)
test3 = testRFC(x, y, PolynomialFeatures(2), StandardScaler(), **rfc_args1)
test4 = testRFC(x, y, PolynomialFeatures(2), MaxAbsScaler(), **rfc_args1)

-0.5587977912520894
-0.5574449312595464
-0.559568884122036
-0.5625729288148309
-0.5619193013514602


In [160]:
# 100 trees with max_features=None, under three scaling options.
rfc_args2 = dict(n_estimators=100, min_samples_split=2, max_features=None)
test5 = testRFC(x, y, None, None, **rfc_args2)
test6 = testRFC(x, y, None, StandardScaler(), **rfc_args2)
test7 = testRFC(x, y, None, MaxAbsScaler(), **rfc_args2)

-0.5698499966901054
-0.5702145113950527
-0.5695731675471662


In [161]:
# 250 trees with max_features='sqrt', under three scaling options.
rfc_args3 = dict(n_estimators=250, min_samples_split=2, max_features='sqrt')
test8 = testRFC(x, y, None, None, **rfc_args3)
test9 = testRFC(x, y, None, StandardScaler(), **rfc_args3)
test10 = testRFC(x, y, None, MaxAbsScaler(), **rfc_args3)

-0.5496295211045484
-0.5495997582630154
-0.5494138547961785


In [163]:
# 1000 trees with max_features='sqrt', fitted with n_jobs=2.
# NOTE(review): this cell reuses the names rfc_args3 / test8..test10 from the
# 250-tree cell, so those earlier objects are replaced once it runs.
rfc_args3 = dict(n_estimators=1000, min_samples_split=2, max_features='sqrt', n_jobs=2)
test8 = testRFC(x, y, None, None, **rfc_args3)
test9 = testRFC(x, y, None, StandardScaler(), **rfc_args3)
test10 = testRFC(x, y, None, MaxAbsScaler(), **rfc_args3)

-0.5459776933484449
-0.5464052434785265
-0.5462245924273246


### 5. Test Boosting Methods

In [214]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


# Reference parameter set. It used to be a bare dict expression here, which is
# evaluated and discarded; it is kept as a comment instead.
# NOTE(review): presumably the result of an earlier search — confirm.
# {'subsample': 0.6,
#  'min_child_weight': 10,
#  'max_depth': 5,
#  'learning_rate': 0.01,
#  'gamma': 2,
#  'colsample_bytree': 0.6}

xgb = XGBClassifier(n_estimators=500, objective='binary:logistic', nthread=1)
# 3 learning rates x 3 gammas x 3 subsample rates = 27 candidates; the
# remaining parameters are held at a single value.
params = {
        'learning_rate' : [0.05, 0.01, 0.005],
        'min_child_weight': [10],
        'gamma': [1.5, 2, 5],
        'subsample': [0.5, 0.6, 0.7],
        'colsample_bytree': [0.6],
        'max_depth': [5]
        }

# The fit has to run in this cell: later cells read `xgb_pipe`, and with these
# lines commented out a fresh kernel fails there with NameError.
# It is expensive: 27 candidates x 3 folds = 81 fits, at roughly 3-4 minutes
# per fit in the recorded run.
xgb_search = GridSearchCV(xgb, param_grid = params, scoring = 'neg_log_loss', n_jobs = 4, cv = 3, verbose = 3, refit = True)
xgb_pipe = make_pipeline(PolynomialFeatures(2), MaxAbsScaler(), xgb_search)
xgb_pipe.fit(x, y)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV 2/3] END colsample_bytree=0.6, gamma=1.5, learning_rate=0.05, max_depth=5, min_child_weight=10, subsample=0.5;, score=-0.534 total time= 3.2min
[CV 3/3] END colsample_bytree=0.6, gamma=1.5, learning_rate=0.05, max_depth=5, min_child_weight=10, subsample=0.5;, score=-0.534 total time= 3.2min
[CV 1/3] END colsample_bytree=0.6, gamma=1.5, learning_rate=0.05, max_depth=5, min_child_weight=10, subsample=0.5;, score=-0.539 total time= 3.2min
[CV 1/3] END colsample_bytree=0.6, gamma=1.5, learning_rate=0.05, max_depth=5, min_child_weight=10, subsample=0.6;, score=-0.538 total time= 3.4min
[CV 2/3] END colsample_bytree=0.6, gamma=1.5, learning_rate=0.05, max_depth=5, min_child_weight=10, subsample=0.6;, score=-0.533 total time= 3.7min
[CV 3/3] END colsample_bytree=0.6, gamma=1.5, learning_rate=0.05, max_depth=5, min_child_weight=10, subsample=0.6;, score=-0.533 total time= 3.7min
[CV 1/3] END colsample_bytree=0.6, gamma=1.5, learn

Pipeline(steps=[('polynomialfeatures', PolynomialFeatures()),
                ('maxabsscaler', MaxAbsScaler()),
                ('gridsearchcv',
                 GridSearchCV(cv=3,
                              estimator=XGBClassifier(base_score=None,
                                                      booster=None,
                                                      callbacks=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      early_stopping_rounds=None,
                                                      enable_categorical=False,
                                                      eval_metric=None,
                                                      gamma=None, gpu_id=No...
                                                      missing=nan

In [228]:
# Best xgb parameters:
# {'colsample_bytree': 0.6,
#  'gamma': 2,
#  'learning_rate': 0.01,
#  'max_depth': 5,
#  'min_child_weight': 10,
#  'subsample': 0.5}

# Only the final expression is rendered as the cell output.
xgb_pipe.named_steps['gridsearchcv'].best_estimator_
xgb_pipe.named_steps['gridsearchcv'].best_score_
xgb_pipe.named_steps['gridsearchcv'].best_params_

{'colsample_bytree': 0.6,
 'gamma': 2,
 'learning_rate': 0.01,
 'max_depth': 5,
 'min_child_weight': 10,
 'subsample': 0.5}

In [225]:
# Log loss of the fitted XGBoost pipeline on the held-out rows.
xgb_test_proba = xgb_pipe.predict_proba(test_x)
log_loss(test_y, xgb_test_proba)

0.5324390394396052

In [231]:
# Persist the whole pipeline (preprocessing steps plus the grid search object).
with open('../data/my_data/models/xgb_model.p', mode='wb') as model_file:
    pickle.dump(xgb_pipe, model_file, protocol=pickle.HIGHEST_PROTOCOL)