In [1]:
from helper_functions import *

In [2]:
pwd

'/Users/jeremymobley/Desktop/cbb_modeling/Python'

In [3]:
cd /Users/jeremymobley/Desktop/cbb_modeling/data/2018/

/Users/jeremymobley/Desktop/cbb_modeling/data/2018


In [4]:
from IPython.display import display, HTML
import pandas as pd
import numpy as np

#### Load Data

In [5]:
teams = read_in_file_to_df("Teams")
tourney_compact_results = read_in_file_to_df("NCAATourneyCompactResults")
tourney_detailed_results = read_in_file_to_df("NCAATourneyDetailedResults")
reg_season_detailed_results = read_in_file_to_df("RegularSeasonDetailedResults")
reg_season_compact_results = read_in_file_to_df("RegularSeasonCompactResults")
sample_sub = read_in_file_to_df("SampleSubmissionStage1")
seasons = read_in_file_to_df("Seasons")

Teams (364, 4)
NCAATourneyCompactResults (2117, 8)
NCAATourneyDetailedResults (981, 34)
RegularSeasonDetailedResults (76636, 34)
RegularSeasonCompactResults (150684, 8)
SampleSubmissionStage1 (9112, 2)
Seasons (34, 6)


#### Combine regular season and tourney detailed results

In [6]:
all_detailed_results = pd.concat([reg_season_detailed_results, tourney_detailed_results])
print(all_detailed_results.shape)

(77617, 34)


### Calculate Advanced Metrics

#### Add in possession features

In [7]:
all_detailed_results = create_poss_fields(all_detailed_results)
all_detailed_results = create_ppp_fields(all_detailed_results)

#### Add advanced metrics
Takes a few seconds...

In [8]:
all_detailed_results = add_advanced_metrics(all_detailed_results)

#### Add in team names

In [8]:
all_detailed_results = add_in_team_names(all_detailed_results, teams)
print(all_detailed_results.shape)

(77617, 41)


#### Add in date from DayNum

In [9]:
all_detailed_results = add_in_date(all_detailed_results, seasons)
print(all_detailed_results.shape)

(77617, 42)


In [10]:
# Add in score_diff field
all_detailed_results['score_diff'] = all_detailed_results['WScore'] - all_detailed_results['LScore']

## Create train data set

Train data set structure:  
* Result field is binary 0/1 representing if Team1 beats Team2
* Every game is represented by 2 different records

In [11]:
train = create_master_train(all_detailed_results)
print(train.shape)

(155234, 44)


#### Add in Massey Ordinals

In [12]:
# Read in massey_ordinals file
massey_ordinals = read_in_file_to_df("MasseyOrdinals")
massey_ordinals.columns = ['Season', 'DayNum', 'SystemName', 'TeamID', 'OrdinalRank']

MasseyOrdinals (3200201, 5)


In [13]:
train = add_in_massey_ordinal_field(train, "SAG", massey_ordinals)
train = add_in_massey_ordinal_field(train, "MAS", massey_ordinals)
print(train.shape)

(155234, 48)


## Look at all train records

In [77]:
HTML(train[(train['Season']==2017) & (train['t1_TeamID']==1207)].sort_values('DayNum').to_html())

Unnamed: 0,DayNum,NumOT,Result,Season,TotPoss,game_date,score_diff,t1_Ast,t1_AstR,t1_Blk,t1_DR,t1_DRP,t1_DefRtg,t1_FGA,t1_FGA3,t1_FGM,t1_FGM3,t1_FTA,t1_FTAR,t1_FTM,t1_Loc,t1_NetRtg,t1_OR,t1_ORP,t1_OffRtg,t1_PF,t1_PPP,t1_Poss,t1_RP,t1_Score,t1_Stl,t1_TO,t1_TOR,t1_TSP,t1_TeamID,t1_TeamName,t1_eFGP,t2_Ast,t2_AstR,t2_Blk,t2_DR,t2_DRP,t2_DefRtg,t2_FGA,t2_FGA3,t2_FGM,t2_FGM3,t2_FTA,t2_FTAR,t2_FTM,t2_NetRtg,t2_OR,t2_ORP,t2_OffRtg,t2_PF,t2_PPP,t2_Poss,t2_RP,t2_Score,t2_Stl,t2_TO,t2_TOR,t2_TSP,t2_TeamID,t2_TeamName,t2_eFGP,t1_SAG_rnk,t2_SAG_rnk,t1_MAS_rnk,t2_MAS_rnk
4103,12,0,1.0,2017,152.5,2016-11-12,45.0,22,20.936429,2,25,0.555556,39.344262,54,17,36,9,32,0.592593,24,H,0.0,8,0.380952,68.852459,24,1.377049,76.2,0.5,105,13,15,10.346125,77.115159,1207,Georgetown,0.75,11,10.346125,2,13,0.619048,68.852459,57,25,21,6,28,0.491228,12,-29.508197,20,0.444444,39.344262,26,0.786885,76.3,0.318182,60,6,26,10.346125,43.277553,1367,SC Upstate,0.421053,,,,
7595,15,0,0.0,2017,148.4,2016-11-15,-1.0,7,7.736516,4,24,0.727273,51.212938,50,15,16,6,42,0.84,37,H,-0.673854,11,0.305556,50.539084,24,1.010782,73.95,0.478261,75,6,15,7.736516,54.760514,1207,Georgetown,0.38,11,11.742101,7,25,0.694444,50.539084,58,27,26,7,22,0.37931,17,0.0,9,0.272727,51.212938,32,1.024259,74.45,0.492754,76,6,15,7.736516,56.146572,1268,Maryland,0.508621,,,,
9596,17,0,0.0,2017,142.5,2016-11-17,-6.0,14,14.724443,3,18,0.642857,54.736842,56,20,23,3,32,0.571429,23,H,-4.210526,12,0.315789,50.526316,25,1.010526,70.2,0.424242,72,5,11,14.724443,51.369863,1207,Georgetown,0.4375,12,12.85898,2,26,0.684211,50.526316,51,12,28,6,28,0.54902,16,0.0,10,0.357143,54.736842,29,1.094737,72.3,0.545455,78,3,18,14.724443,61.591914,1117,Arkansas St,0.607843,28.0,257.0,41.0,260.0
15182,21,0,1.0,2017,141.95,2016-11-21,4.0,10,10.451505,10,26,0.619048,42.972878,61,21,21,6,22,0.360656,17,N,0.0,14,0.318182,45.790771,21,0.915815,72.45,0.465116,65,8,15,10.548523,45.98189,1207,Georgetown,0.393443,10,10.548523,10,30,0.681818,45.790771,59,17,23,4,20,0.338983,11,-2.817894,16,0.380952,42.972878,22,0.859458,69.5,0.511628,61,7,17,10.548523,44.985251,1332,Oregon,0.423729,28.0,25.0,41.0,12.0
17282,22,0,0.0,2017,131.1,2016-11-22,-16.0,7,9.615385,1,20,0.5,55.682685,47,15,18,5,20,0.425532,16,N,-12.204424,1,0.032258,43.478261,16,0.869565,65.5,0.56338,57,6,10,9.615385,51.075269,1207,Georgetown,0.43617,12,12.366035,5,30,0.967742,43.478261,65,22,28,5,16,0.246154,12,0.0,20,0.5,55.682685,18,1.113654,65.6,0.704225,73,5,13,9.615385,50.666297,1458,Wisconsin,0.469231,28.0,4.0,41.0,24.0
18081,23,0,0.0,2017,153.85,2016-11-23,-27.0,11,11.238251,1,21,0.636364,63.048424,47,9,24,3,27,0.574468,19,N,-17.549561,13,0.433333,45.498863,18,0.909977,74.825,0.52381,70,5,28,11.238251,59.442935,1207,Georgetown,0.542553,18,16.611296,2,17,0.566667,45.498863,71,19,36,7,19,0.267606,18,0.0,12,0.363636,63.048424,23,1.260968,79.025,0.460317,97,19,11,11.238251,61.113911,1329,Oklahoma St,0.556338,28.0,52.0,41.0,81.0
23972,27,0,1.0,2017,148.75,2016-11-27,13.0,21,20.084162,7,23,0.638889,48.403361,55,28,27,13,24,0.436364,18,H,0.0,10,0.344828,57.142857,20,1.142857,74.4,0.507692,85,11,18,7.491438,64.826113,1207,Georgetown,0.609091,7,7.491438,2,19,0.655172,57.142857,58,12,25,4,26,0.448276,18,-8.739496,13,0.361111,48.403361,19,0.968067,74.35,0.446154,72,7,17,7.491438,51.843318,1224,Howard,0.465517,53.0,306.0,86.0,342.0
28004,30,0,1.0,2017,133.725,2016-11-30,52.0,22,20.117045,9,34,0.723404,32.903346,68,31,34,14,19,0.279412,14,H,0.0,21,0.583333,71.789119,18,1.435782,67.025,0.662651,96,6,11,10.194835,62.860136,1207,Georgetown,0.602941,9,10.194835,5,15,0.416667,71.789119,59,24,16,7,12,0.20339,5,-38.885773,13,0.276596,32.903346,17,0.658067,66.7,0.433735,44,2,15,10.194835,34.225264,1164,Coppin St,0.330508,53.0,346.0,86.0,336.0
33278,34,0,1.0,2017,138.725,2016-12-04,3.0,13,15.194016,9,28,0.682927,53.342945,52,14,28,4,24,0.461538,17,H,0.0,4,0.148148,55.505496,16,1.11011,69.4,0.470588,77,6,10,16.313214,61.540921,1207,Georgetown,0.576923,16,16.313214,1,23,0.851852,55.505496,69,28,29,11,7,0.101449,5,-2.162552,13,0.317073,53.342945,17,1.066859,69.325,0.397059,74,4,10,16.313214,51.331853,1189,Elon,0.5,82.0,140.0,90.0,147.0
37136,40,0,1.0,2017,150.1,2016-12-10,15.0,13,14.457295,9,31,0.775,51.965356,47,12,25,7,43,0.914894,36,N,0.0,5,0.185185,61.958694,26,1.239174,73.425,0.537313,93,5,11,11.515913,70.540049,1207,Georgetown,0.606383,11,11.515913,1,22,0.814815,61.958694,61,21,25,6,33,0.540984,22,-9.993338,9,0.225,51.965356,32,1.039307,76.675,0.402985,78,6,9,11.515913,51.641949,1247,La Salle,0.459016,65.0,96.0,83.0,128.0


---

## DEV

#### Add rolling features - working..

In [14]:
train = create_rolling_avg_feature(train, "t1_PPP", num_games=5)
train = create_rolling_avg_feature(train, "t2_PPP", num_games=5)

In [70]:
num_games = 5
#train = train.sort_values(['t1_TeamID', 'Season','DayNum']).reset_index()
agg_funcs = {'t1_PPP':['mean', 'max'], 't2_PPP':['mean']}
train_metrics_agg = train.groupby(['Season', 't1_TeamID']).shift(1) \
    .rolling(window=num_games, min_periods=num_games).agg(agg_funcs)

In [75]:
train_metrics_agg.head(10)

Unnamed: 0,t1_PPP_mean,t1_PPP_max,t2_PPP_mean
0,,,
1,,,
2,,,
3,,,
4,,,
5,0.806336,1.004315,1.218981
6,0.804361,1.004315,1.250998
7,0.807975,1.004315,1.253397
8,0.879251,1.043478,1.268299
9,0.947804,1.043478,1.179179


In [74]:
train_metrics_agg.columns = ['t1_PPP_mean', 't1_PPP_max', 't2_PPP_mean']

In [76]:
train_metrics_agg.shape

(155234, 3)

---

# Modeling

* Build models to predict 0/1 winner - logistic regression
* Build models to predict point spread - secondary model to convert pointspreads to percentages

In [14]:
from sklearn import linear_model

In [45]:
from sklearn.utils import shuffle

In [15]:
logistic = linear_model.LogisticRegression()

In [21]:
logistic_model_fields = ['t1_SAG_rnk', 't2_SAG_rnk', "t1_MAS_rnk", "t2_MAS_rnk"]

In [47]:
# Create modeling dataframe
model_train_df = shuffle(train.copy())
print(model_train_df.shape)
model_train_df = model_train_df[model_train_df['DayNum'] > 50]
model_train_df = model_train_df[model_train_df['DayNum'] < 130]
model_train_df = model_train_df.dropna()
print(model_train_df.shape)

(155234, 48)
(75010, 48)


In [52]:
model_train_df.groupby('Season').size()

Season
2003    6608
2004    6502
2005    6536
2010    6890
2011    6920
2012    6946
2013    6832
2014    6888
2015    6922
2016    6930
2017    7036
dtype: int64

In [48]:
logistic.fit(model_train_df[logistic_model_fields], 
             model_train_df['Result'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [49]:
# Create test games
model_test_df = train.copy()
model_test_df = model_test_df[model_test_df['DayNum'] > 130]
model_test_df = model_test_df.dropna()
print(model_test_df.shape)

(2040, 48)


In [50]:
# Create preds
preds = logistic.predict_proba(model_test_df[logistic_model_fields])[:, 1]

In [51]:
calc_logloss(preds, model_test_df['Result'])

0.5929098747640904

### GBM

In [59]:
from sklearn.ensemble import GradientBoostingClassifier

In [60]:
gbm = GradientBoostingRegressor(n_estimators=10, 
                                max_depth=8, 
                                learning_rate=0.01)

In [61]:
gbm.fit(model_train_df[logistic_model_fields], model_train_df['Result'])

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=8, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=10, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [62]:
# create preds
preds = gbm.predict(model_test_df[logistic_model_fields])

In [63]:
calc_logloss(preds, model_test_df['Result'])

0.6753891531554352