In [1]:
import pickle

import numpy as np
import pandas as pd
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_formats = ['svg']
%matplotlib inline

In [2]:
plt.rcParams['figure.figsize'] = (9, 6)
sns.set(context='notebook', style='whitegrid', font_scale=1.2)

In [41]:
with open('d_rolling.pickle','rb') as read_file:
    df = pickle.load(read_file)
    
df

Unnamed: 0,Date,O/U_open,O/U_close,Season,id,total,away,home,ou1,ou2,...,ast_per_rolling_v,ast_per_opp_rolling_v,stl_per_rolling_v,stl_per_opp_rolling_v,blk_per_rolling_v,blk_per_opp_rolling_v,user_per_rolling_v,user_per_opp_rolling_v,drtg_rolling_v,drtg_opp_rolling_v
0,2007-10-30,184.0,189.5,0708,20071030SanAntonio,203.0,POR,SAS,19.0,13.5,...,,,,,,,,,,
1,2007-10-30,214.5,212.0,0708,20071030GoldenState,213.0,UTA,GSW,-1.5,1.0,...,,,,,,,,,,
2,2007-10-30,191.0,199.0,0708,20071030LALakers,188.0,HOU,LAL,-3.0,-11.0,...,,,,,,,,,,
3,2007-10-31,190.0,191.0,0708,20071031Toronto,203.0,PHI,TOR,13.0,12.0,...,,,,,,,,,,
4,2007-10-31,200.0,203.5,0708,20071031Indiana,229.0,WAS,IND,29.0,25.5,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14617,2019-06-02,216.0,213.5,1819,20190602Toronto,213.0,GSW,TOR,-3.0,-0.5,...,71.84,65.06,8.58,6.78,11.38,7.94,100.0,100.0,110.58,116.52
14618,2019-06-05,214.0,209.5,1819,20190605GoldenState,232.0,TOR,GSW,18.0,22.5,...,61.52,77.52,7.70,6.42,9.08,9.68,100.0,100.0,109.72,117.08
14619,2019-06-07,216.0,215.0,1819,20190607GoldenState,197.0,TOR,GSW,-19.0,-18.0,...,59.88,75.18,7.62,6.36,10.50,8.00,100.0,100.0,110.36,116.76
14620,2019-06-10,212.0,217.0,1819,20190610Toronto,211.0,GSW,TOR,-1.0,-6.0,...,76.74,61.92,7.44,8.16,8.02,10.60,100.0,100.0,115.08,109.00


## EDA and some more Cleaning

Let's filter out our NaN's (games 1-5 in every season for every team).

In [46]:
df = df[(df['game_num'] > 5) & (df['game_num_v'] > 5)]
df

Unnamed: 0,Date,O/U_open,O/U_close,Season,id,total,away,home,ou1,ou2,...,ast_per_rolling_v,ast_per_opp_rolling_v,stl_per_rolling_v,stl_per_opp_rolling_v,blk_per_rolling_v,blk_per_opp_rolling_v,user_per_rolling_v,user_per_opp_rolling_v,drtg_rolling_v,drtg_opp_rolling_v
77,2007-11-10,205.5,209.0,0708,20071110Orlando,202.0,PHO,ORL,-3.5,-7.0,...,60.82,48.82,9.62,7.98,4.36,9.28,100.0,100.0,98.56,108.30
86,2007-11-11,185.0,186.5,0708,20071111SanAntonio,201.0,MIL,SAS,16.0,14.5,...,55.22,60.74,6.38,7.16,9.06,9.50,100.0,100.0,102.14,100.80
87,2007-11-11,199.0,196.0,0708,20071111Seattle,210.0,DET,SEA,11.0,14.0,...,60.04,52.12,7.98,6.20,9.40,5.68,100.0,100.0,97.20,109.70
89,2007-11-12,189.0,184.5,0708,20071112NewJersey,166.0,NOH,NJN,-23.0,-18.5,...,58.20,59.82,9.36,6.96,5.22,7.52,100.0,100.0,98.46,103.76
90,2007-11-12,205.5,208.0,0708,20071112Denver,222.0,CLE,DEN,16.5,14.0,...,51.30,48.54,8.78,7.66,7.26,6.56,100.0,100.0,102.88,103.18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14617,2019-06-02,216.0,213.5,1819,20190602Toronto,213.0,GSW,TOR,-3.0,-0.5,...,71.84,65.06,8.58,6.78,11.38,7.94,100.0,100.0,110.58,116.52
14618,2019-06-05,214.0,209.5,1819,20190605GoldenState,232.0,TOR,GSW,18.0,22.5,...,61.52,77.52,7.70,6.42,9.08,9.68,100.0,100.0,109.72,117.08
14619,2019-06-07,216.0,215.0,1819,20190607GoldenState,197.0,TOR,GSW,-19.0,-18.0,...,59.88,75.18,7.62,6.36,10.50,8.00,100.0,100.0,110.36,116.76
14620,2019-06-10,212.0,217.0,1819,20190610Toronto,211.0,GSW,TOR,-1.0,-6.0,...,76.74,61.92,7.44,8.16,8.02,10.60,100.0,100.0,115.08,109.00


Let's start with the "Four Factors". These are efficiency stats that author, Dean Oliver, identified as the most influential components to the outcome of a game, and cover 4 key areas of any basketball game: shooting (eFg), turnovers (tov), rebounding (orb), and free throws (ft_fga).

In [47]:
counter = 0
for column in df.columns:
    print(counter, '-',column)
    counter += 1

0 - Date
1 - O/U_open
2 - O/U_close
3 - Season
4 - id
5 - total
6 - away
7 - home
8 - ou1
9 - ou2
10 - Over/Under_open
11 - Over/Under_close
12 - team
13 - opp
14 - home/away
15 - pace
16 - pace_opp
17 - eFg
18 - eFg_opp
19 - tov
20 - tov_opp
21 - orb
22 - orb_opp
23 - ft_fga
24 - ft_fga_opp
25 - ortg
26 - ortg_opp
27 - fg
28 - fg_opp
29 - fga
30 - fga_opp
31 - fg_per
32 - fg_per_opp
33 - threes
34 - threes_opp
35 - threes_att
36 - threes_att_opp
37 - threes_per
38 - threes_per_opp
39 - ft
40 - ft_opp
41 - fta
42 - fta_opp
43 - ft_per
44 - ft_per_opp
45 - drb
46 - drb_opp
47 - trb
48 - trb_opp
49 - ast
50 - ast_opp
51 - stl
52 - stl_opp
53 - blk
54 - blk_opp
55 - to
56 - to_opp
57 - fouls
58 - fouls_opp
59 - ts_per
60 - ts_per_opp
61 - threes_ar
62 - threes_ar_opp
63 - ft_ar
64 - ft_ar_opp
65 - drb_per
66 - drb_per_opp
67 - trb_per
68 - trb_per_opp
69 - ast_per
70 - ast_per_opp
71 - stl_per
72 - stl_per_opp
73 - blk_per
74 - blk_per_opp
75 - user_per
76 - user_per_opp
77 - drtg
78 - dr

In [145]:
df1 = df.iloc[:,[10,11,83,84,85,86,87,88,89,90,227,228,229,230,231,232,233,234]]
df1

Unnamed: 0,Over/Under_open,Over/Under_close,eFg_rolling,eFg_opp_rolling,tov_rolling,tov_opp_rolling,orb_rolling,orb_opp_rolling,ft_fga_rolling,ft_fga_opp_rolling,eFg_rolling_v,eFg_opp_rolling_v,tov_rolling_v,tov_opp_rolling_v,orb_rolling_v,orb_opp_rolling_v,ft_fga_rolling_v,ft_fga_opp_rolling_v
77,0.0,0.0,0.5200,0.4894,12.72,12.92,22.04,26.10,0.2880,0.2514,0.5172,0.4580,11.76,15.74,21.56,31.52,0.1662,0.1582
86,1.0,1.0,0.5116,0.4624,13.04,13.50,19.82,23.30,0.2064,0.1748,0.4654,0.4760,14.22,13.98,29.22,25.42,0.1892,0.2544
87,1.0,1.0,0.4858,0.4940,17.42,14.54,26.10,27.80,0.1878,0.2546,0.5104,0.4502,12.80,14.42,27.00,23.82,0.2466,0.2468
89,0.0,0.0,0.4468,0.4780,16.10,13.92,22.84,22.86,0.2802,0.2946,0.4768,0.4610,12.06,13.08,24.88,24.70,0.1830,0.1598
90,1.0,1.0,0.5046,0.5182,16.16,16.60,22.76,28.54,0.2424,0.2022,0.4838,0.4598,13.52,13.12,29.46,20.52,0.1776,0.3258
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14617,0.0,0.0,0.5324,0.4954,9.88,12.56,18.28,21.68,0.2694,0.2498,0.5484,0.5136,13.42,12.10,26.20,21.24,0.2394,0.2360
14618,1.0,1.0,0.5368,0.5190,13.58,11.02,23.96,20.32,0.2524,0.2200,0.5236,0.5158,9.78,12.44,21.40,20.34,0.2772,0.2554
14619,0.0,0.0,0.5158,0.5352,13.02,11.24,23.22,19.20,0.2562,0.2292,0.5388,0.5020,10.44,12.48,19.78,22.34,0.2708,0.2678
14620,0.0,0.0,0.5390,0.4996,10.98,13.60,19.56,22.62,0.2646,0.2726,0.5086,0.5376,13.86,10.94,23.54,18.78,0.2444,0.2356


Before we can do any data exploration, we need to split our dataset between training/validating & testing. This way we can avoid biasing our decision making on model development based off our testing data to more accurately guage how our model does on "fresh" data.

In [146]:
from sklearn.model_selection import train_test_split

In [147]:
#EXPLORING FOUR FACTORS FILTERED DF
df_train, df_test = train_test_split(df1,
                                     test_size=0.2, 
                                     random_state=42)

In [148]:
df_train

Unnamed: 0,Over/Under_open,Over/Under_close,eFg_rolling,eFg_opp_rolling,tov_rolling,tov_opp_rolling,orb_rolling,orb_opp_rolling,ft_fga_rolling,ft_fga_opp_rolling,eFg_rolling_v,eFg_opp_rolling_v,tov_rolling_v,tov_opp_rolling_v,orb_rolling_v,orb_opp_rolling_v,ft_fga_rolling_v,ft_fga_opp_rolling_v
4548,0.0,0.0,0.4710,0.5340,10.66,13.12,22.12,28.20,0.2496,0.3122,0.5190,0.5074,11.76,11.02,25.44,23.22,0.1780,0.2310
8364,1.0,1.0,0.4912,0.5690,15.06,12.36,30.82,22.42,0.2238,0.2106,0.4870,0.5440,11.34,15.82,24.08,29.60,0.1794,0.2580
10889,1.0,1.0,0.4898,0.4880,14.90,13.14,20.92,22.22,0.2028,0.1908,0.5052,0.5002,13.76,14.38,29.92,24.36,0.2534,0.2302
4534,0.0,0.0,0.5218,0.5246,14.24,13.44,26.62,29.72,0.2472,0.2516,0.4864,0.4836,10.38,12.72,24.92,26.72,0.2496,0.2458
11519,0.0,0.0,0.5318,0.5254,9.92,11.52,19.22,21.46,0.2590,0.2070,0.4866,0.5094,14.24,9.06,22.38,27.08,0.1628,0.2080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5654,0.0,0.0,0.5112,0.4866,11.32,15.44,29.80,29.34,0.1888,0.2102,0.4982,0.4772,15.80,11.90,22.90,27.00,0.2468,0.2242
14523,1.0,1.0,0.4970,0.5682,8.98,13.10,24.00,23.42,0.1688,0.1730,0.5290,0.5126,11.90,11.52,19.08,21.24,0.1772,0.1954
5853,0.0,0.0,0.5020,0.4924,12.80,8.74,35.42,24.84,0.2480,0.1602,0.5344,0.4844,15.44,10.12,28.46,28.28,0.2258,0.1394
953,0.0,0.0,0.5124,0.5342,12.46,10.58,23.36,26.82,0.1438,0.2480,0.5410,0.4860,12.92,9.54,26.32,19.78,0.3078,0.2458


In [149]:
df_train.describe()

Unnamed: 0,Over/Under_open,Over/Under_close,eFg_rolling,eFg_opp_rolling,tov_rolling,tov_opp_rolling,orb_rolling,orb_opp_rolling,ft_fga_rolling,ft_fga_opp_rolling,eFg_rolling_v,eFg_opp_rolling_v,tov_rolling_v,tov_opp_rolling_v,orb_rolling_v,orb_opp_rolling_v,ft_fga_rolling_v,ft_fga_opp_rolling_v
count,10813.0,10813.0,10813.0,10813.0,10813.0,10813.0,10813.0,10813.0,10813.0,10813.0,10813.0,10813.0,10813.0,10813.0,10813.0,10813.0,10813.0,10813.0
mean,0.496717,0.49607,0.505336,0.504532,12.66064,12.656251,24.926542,24.979003,0.21797,0.217842,0.505675,0.5039,12.631303,12.668826,25.031695,24.922497,0.217728,0.217096
std,0.500012,0.500008,0.035233,0.033925,1.764226,1.808328,4.361173,4.028045,0.044663,0.042803,0.035392,0.033962,1.762956,1.845521,4.402477,3.943722,0.044481,0.043004
min,0.0,0.0,0.3882,0.37,6.76,6.0,10.86,10.0,0.0832,0.0994,0.3712,0.3772,6.08,6.54,9.94,12.18,0.0828,0.0936
25%,0.0,0.0,0.4814,0.4814,11.44,11.4,21.92,22.24,0.187,0.1876,0.4814,0.481,11.42,11.42,22.0,22.22,0.1862,0.1868
50%,0.0,0.0,0.5048,0.5044,12.64,12.6,24.78,24.94,0.2146,0.2146,0.5054,0.5034,12.6,12.6,24.98,24.86,0.2146,0.2144
75%,1.0,1.0,0.5288,0.5276,13.84,13.86,27.78,27.64,0.246,0.2446,0.5292,0.5262,13.8,13.9,27.98,27.54,0.2458,0.2446
max,1.0,1.0,0.6274,0.6254,20.28,21.08,43.68,43.86,0.4188,0.4326,0.6462,0.6368,19.28,20.24,41.5,39.48,0.43,0.4168


As we can see, 2 of the Four Factors are in decimal form, and the other 2 are in percentage form. Let's standardize this with the StandardScalar package.

In [150]:
from sklearn.preprocessing import StandardScaler

In [151]:
scalar = StandardScaler()

In [152]:
X_train = scalar.fit_transform(df_train.iloc[:,2:])
y_train = df_train['Over/Under_close']

## Simple Logistic Regression

Now that we've scaled our data let's see how it fits in a Logistic Regression model.

In [153]:
from sklearn.linear_model import LogisticRegression

In [154]:
lm_1 = LogisticRegression()

In [155]:
lm_1.fit(X_train, y_train)

LogisticRegression()

In [156]:
print('intercept: ', round(lm_1.intercept_[0], 4))
print('over coef: ', round(lm_1.coef_[0][0], 4))

intercept:  -0.0158
over coef:  0.0033


Our coefficient is negative, which suggests this model doesn't work well with our data, but let's see how our predictions match up with reality.

In [157]:
df_eval = df_test.copy()
df_eval['pred'] = lm_1.predict(df_test.iloc[:,2:])
df_eval.loc[:, 'pred'] = df_eval['pred'].astype('category')
df_eval['correct_pred'] = df_eval['pred'] == df_eval['Over/Under_close']

In [158]:
over_proba = lm_1.predict_proba(df_test.iloc[:,2:])
over_proba[:5]

array([[0.50553911, 0.49446089],
       [0.48888147, 0.51111853],
       [0.47408454, 0.52591546],
       [0.4346059 , 0.5653941 ],
       [0.48289707, 0.51710293]])

The first column represents the probability of y=0, or our target variable being 0, which in our context means the probability that the game is an Under, and the second column being the probability a game is classified as an Over. The first five entries of our test set are all classified as Overs, which is a bit concerning as it could be classifying every game as an Over. Let's investigate some metrics to see if that's true.

In [159]:
from sklearn.metrics import precision_score,recall_score,confusion_matrix,f1_score

In [160]:
confusion_matrix(df_eval['Over/Under_close'], df_eval['pred'])

array([[ 239, 1138],
       [ 234, 1093]])

Looking at this confusion matrix we can see that it classified all of our games as Overs, which means our data didn't have enough separability to determine a valuable threshold line, and put the line on the edge of our dataset. We'll have to probably use a different model to better predict our classification.

In [161]:
precision_score(df_eval['Over/Under_close'], df_eval['pred'])

0.4899148363962349

In [162]:
recall_score(df_eval['Over/Under_close'], df_eval['pred'])

0.8236623963828184

This pair of precision and recall scores confirms our confusion matrix takeaway, that our data is being classified completely as Overs, and leading to a little over half our data being misclassified as such. We will definitely need to use a different model.

In [163]:
f1_score(df_eval['Over/Under_close'], df_eval['pred'])

0.6143901068015739

## K-Nearest Neighbors

In [164]:
from sklearn.neighbors import KNeighborsRegressor

In [165]:
knn = KNeighborsRegressor(n_neighbors=5)

In [166]:
knn.fit(X_train, y_train)

KNeighborsRegressor()

In [167]:
df_eval = df_test.copy()
df_eval['pred'] = knn.predict(df_test.iloc[:,2:])
df_eval.loc[:, 'pred'] = df_eval['pred'].astype('category')
df_eval['correct_pred'] = df_eval['pred'] == df_eval['Over/Under_close']

In [168]:
over_proba = knn.predict_proba(df_test.iloc[:,2:])
over_proba[:5]

AttributeError: 'KNeighborsRegressor' object has no attribute 'predict_proba'

In [169]:
confusion_matrix(df_eval['Over/Under_close'], df_eval['pred'])

ValueError: Classification metrics can't handle a mix of binary and continuous targets

## Decision Tree Classifier

In [170]:
from sklearn.tree import DecisionTreeClassifier

In [243]:
tree_clf = DecisionTreeClassifier(max_depth = 11)

In [244]:
tree_clf.fit(X_train,y_train)

DecisionTreeClassifier(max_depth=11)

In [245]:
df_eval = df_test.copy()
df_eval['pred'] = tree_clf.predict(df_test.iloc[:,2:])
df_eval.loc[:, 'pred'] = df_eval['pred'].astype('category')
df_eval['correct_pred'] = df_eval['pred'] == df_eval['Over/Under_close']

In [246]:
over_proba = tree_clf.predict_proba(df_test.iloc[:,2:])
over_proba[:5]

array([[0.45614035, 0.54385965],
       [0.45614035, 0.54385965],
       [0.45614035, 0.54385965],
       [0.45614035, 0.54385965],
       [0.45614035, 0.54385965]])

In [247]:
confusion_matrix(df_eval['Over/Under_close'], df_eval['pred'])

array([[   0, 1377],
       [   0, 1327]])

In [242]:
f1_score(df_eval['Over/Under_close'], df_eval['pred'])

0.0

## Scratch Work

In [40]:
precisions, recalls, thresholds = precision_recall_curve(df_eval['Over/Under_close'],over_proba)

ValueError: y should be a 1d array, got an array of shape (5489, 2) instead.

In [None]:
#Pair plots to find some feature where we have some seperation. Starting with Four Factors...
sns.pairplot(df_train.iloc[:,[1,2,3]], corner=True, height=1.5, plot_kws={'size': 3}, hue='Over/Under_close');

In [None]:
sns.pairplot(df_train.iloc[:,[1,4,5]], corner=True, height=1.5, plot_kws={'size': 3}, hue='Over/Under_close');