In [1]:
import pickle

import numpy as np
import pandas as pd
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_formats = ['svg']
%matplotlib inline

In [2]:
# Notebook-wide plotting defaults: 9x6 figures, then seaborn's whitegrid theme
# in the 'notebook' context with slightly enlarged fonts.
plt.rcParams.update({'figure.figsize': (9, 6)})
sns.set(style='whitegrid', context='notebook', font_scale=1.2)

In [3]:
# Read the pickled DataFrame from disk and show it.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that come from a trusted source.
with open('d_rolling.pickle', mode='rb') as pickle_file:
    df = pickle.load(pickle_file)

df

Unnamed: 0,Date,O/U_open,O/U_close,Season,id,total,away,home,ou1,ou2,...,ast_per_rolling,ast_per_opp_rolling,stl_per_rolling,stl_per_opp_rolling,blk_per_rolling,blk_per_opp_rolling,user_per_rolling,user_per_opp_rolling,drtg_rolling,drtg_opp_rolling
0,2007-10-30,184.0,189.5,0708,20071030SanAntonio,203.0,POR,SAS,19.0,13.5,...,,,,,,,,,,
1,2007-10-30,184.0,189.5,0708,20071030SanAntonio,203.0,POR,SAS,19.0,13.5,...,,,,,,,,,,
2,2007-10-30,214.5,212.0,0708,20071030GoldenState,213.0,UTA,GSW,-1.5,1.0,...,,,,,,,,,,
3,2007-10-30,214.5,212.0,0708,20071030GoldenState,213.0,UTA,GSW,-1.5,1.0,...,,,,,,,,,,
4,2007-10-30,191.0,199.0,0708,20071030LALakers,188.0,HOU,LAL,-3.0,-11.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29239,2019-06-07,216.0,215.0,1819,20190607GoldenState,197.0,TOR,GSW,-19.0,-18.0,...,59.88,75.18,7.62,6.36,10.50,8.00,100.0,100.0,110.36,116.76
29240,2019-06-10,212.0,217.0,1819,20190610Toronto,211.0,GSW,TOR,-1.0,-6.0,...,59.84,76.36,8.86,7.18,10.56,9.24,100.0,100.0,108.36,116.22
29241,2019-06-10,212.0,217.0,1819,20190610Toronto,211.0,GSW,TOR,-1.0,-6.0,...,76.74,61.92,7.44,8.16,8.02,10.60,100.0,100.0,115.08,109.00
29242,2019-06-13,211.0,211.5,1819,20190613GoldenState,224.0,TOR,GSW,13.0,12.5,...,77.92,58.72,6.60,8.50,8.90,11.10,100.0,100.0,115.12,108.88


## EDA and some more Cleaning

Let's filter out our NaNs (games 1–5 of every season for every team).

In [4]:
# Keep only the rows whose game number is greater than 5 (the earlier games
# are the ones described above as carrying NaNs).
past_fifth_game = df['game_num'].gt(5)
df = df.loc[past_fifth_game]
df

Unnamed: 0,Date,O/U_open,O/U_close,Season,id,total,away,home,ou1,ou2,...,ast_per_rolling,ast_per_opp_rolling,stl_per_rolling,stl_per_opp_rolling,blk_per_rolling,blk_per_opp_rolling,user_per_rolling,user_per_opp_rolling,drtg_rolling,drtg_opp_rolling
129,2007-11-09,188.0,187.5,0708,20071109Philadelphia,208.0,TOR,PHI,20.0,20.5,...,59.00,65.50,8.76,4.34,6.32,9.14,100.0,100.0,104.40,106.62
135,2007-11-09,198.5,201.5,0708,20071109NewYork,214.0,ORL,NYK,15.5,12.5,...,57.58,52.62,4.88,5.94,6.50,8.06,100.0,100.0,105.86,111.16
140,2007-11-09,188.0,186.0,0708,20071109NewOrleans,182.0,SAS,NOH,-6.0,-4.0,...,57.90,59.48,8.08,5.96,6.76,6.48,100.0,100.0,99.96,110.74
141,2007-11-09,188.0,186.0,0708,20071109NewOrleans,182.0,SAS,NOH,-6.0,-4.0,...,58.94,56.28,8.92,6.30,8.36,7.42,100.0,100.0,98.98,105.48
143,2007-11-09,210.5,209.5,0708,20071109Washington,210.0,DEN,WAS,-0.5,0.5,...,60.84,62.20,8.48,11.20,9.70,7.28,100.0,100.0,104.26,101.44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29239,2019-06-07,216.0,215.0,1819,20190607GoldenState,197.0,TOR,GSW,-19.0,-18.0,...,59.88,75.18,7.62,6.36,10.50,8.00,100.0,100.0,110.36,116.76
29240,2019-06-10,212.0,217.0,1819,20190610Toronto,211.0,GSW,TOR,-1.0,-6.0,...,59.84,76.36,8.86,7.18,10.56,9.24,100.0,100.0,108.36,116.22
29241,2019-06-10,212.0,217.0,1819,20190610Toronto,211.0,GSW,TOR,-1.0,-6.0,...,76.74,61.92,7.44,8.16,8.02,10.60,100.0,100.0,115.08,109.00
29242,2019-06-13,211.0,211.5,1819,20190613GoldenState,224.0,TOR,GSW,13.0,12.5,...,77.92,58.72,6.60,8.50,8.90,11.10,100.0,100.0,115.12,108.88


Let's start with the "Four Factors". These are efficiency stats that author Dean Oliver identified as the most influential components of a game's outcome. They cover four key areas of any basketball game: shooting (eFg), turnovers (tov), rebounding (orb), and free throws (ft_fga).

In [5]:
# List every column next to its positional index.
for position, column in enumerate(df.columns):
    print(position, '-', column)

0 - Date
1 - O/U_open
2 - O/U_close
3 - Season
4 - id
5 - total
6 - away
7 - home
8 - ou1
9 - ou2
10 - Over/Under_open
11 - Over/Under_close
12 - team
13 - opp
14 - home/away
15 - pace
16 - pace_opp
17 - eFg
18 - eFg_opp
19 - tov
20 - tov_opp
21 - orb
22 - orb_opp
23 - ft_fga
24 - ft_fga_opp
25 - ortg
26 - ortg_opp
27 - fg
28 - fg_opp
29 - fga
30 - fga_opp
31 - fg_per
32 - fg_per_opp
33 - threes
34 - threes_opp
35 - threes_att
36 - threes_att_opp
37 - threes_per
38 - threes_per_opp
39 - ft
40 - ft_opp
41 - fta
42 - fta_opp
43 - ft_per
44 - ft_per_opp
45 - drb
46 - drb_opp
47 - trb
48 - trb_opp
49 - ast
50 - ast_opp
51 - stl
52 - stl_opp
53 - blk
54 - blk_opp
55 - to
56 - to_opp
57 - fouls
58 - fouls_opp
59 - ts_per
60 - ts_per_opp
61 - threes_ar
62 - threes_ar_opp
63 - ft_ar
64 - ft_ar_opp
65 - drb_per
66 - drb_per_opp
67 - trb_per
68 - trb_per_opp
69 - ast_per
70 - ast_per_opp
71 - stl_per
72 - stl_per_opp
73 - blk_per
74 - blk_per_opp
75 - user_per
76 - user_per_opp
77 - drtg
78 - dr

In [6]:
# Build the Four Factors frame: the two over/under target columns plus the
# rolling Four Factors for the team and its opponent. Columns are selected by
# name rather than by position so the cell keeps working if columns are added
# to, or reordered in, the upstream frame.
four_factor_columns = [
    'Over/Under_open', 'Over/Under_close',
    'eFg_rolling', 'eFg_opp_rolling',
    'tov_rolling', 'tov_opp_rolling',
    'orb_rolling', 'orb_opp_rolling',
    'ft_fga_rolling', 'ft_fga_opp_rolling',
]
df1 = df.loc[:, four_factor_columns]
df1

Unnamed: 0,Over/Under_open,Over/Under_close,eFg_rolling,eFg_opp_rolling,tov_rolling,tov_opp_rolling,orb_rolling,orb_opp_rolling,ft_fga_rolling,ft_fga_opp_rolling
129,1.0,1.0,0.4864,0.5326,11.22,17.08,20.94,24.04,0.1996,0.1922
135,1.0,1.0,0.5194,0.4724,12.08,11.92,22.56,29.06,0.3050,0.2180
140,0.0,0.0,0.5190,0.4644,12.94,12.78,26.88,23.76,0.2102,0.1528
141,0.0,0.0,0.5038,0.4786,12.44,14.82,21.28,23.64,0.2140,0.1810
143,0.0,1.0,0.4878,0.5144,14.98,14.62,23.20,26.26,0.2670,0.1836
...,...,...,...,...,...,...,...,...,...,...
29239,0.0,0.0,0.5388,0.5020,10.44,12.48,19.78,22.34,0.2708,0.2678
29240,0.0,0.0,0.5390,0.4996,10.98,13.60,19.56,22.62,0.2646,0.2726
29241,0.0,0.0,0.5086,0.5376,13.86,10.94,23.54,18.78,0.2444,0.2356
29242,1.0,1.0,0.5206,0.5242,14.30,11.20,20.50,22.12,0.2384,0.2708


Before we do any data exploration, we need to split our dataset into training/validation and testing sets. This keeps the test data from biasing our model-development decisions, so we can more accurately gauge how the model performs on "fresh" data.

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Split the Four Factors frame 80/20 into train and test; the fixed seed keeps
# the split reproducible across runs.
# NOTE(review): the raw frame appears to hold two rows per game id (one per
# team), so a row-level shuffle can put the same game in both splits —
# consider a grouped split; confirm.
split_options = {'test_size': 0.2, 'random_state': 42}
df_train, df_test = train_test_split(df1, **split_options)

In [9]:
# Display the training split (pandas truncates the rendering of long frames).
df_train

Unnamed: 0,Over/Under_open,Over/Under_close,eFg_rolling,eFg_opp_rolling,tov_rolling,tov_opp_rolling,orb_rolling,orb_opp_rolling,ft_fga_rolling,ft_fga_opp_rolling
28506,1.0,1.0,0.5336,0.5236,13.70,13.10,18.74,21.08,0.2148,0.1844
14784,1.0,1.0,0.4646,0.4630,11.88,16.36,23.46,14.26,0.1716,0.2664
24649,1.0,1.0,0.4796,0.5186,12.62,15.08,30.08,21.24,0.2128,0.1724
26216,1.0,1.0,0.4748,0.5260,14.40,13.58,20.18,23.86,0.1476,0.1998
3162,1.0,1.0,0.5284,0.5276,15.84,11.40,28.78,29.76,0.2096,0.2764
...,...,...,...,...,...,...,...,...,...,...
23075,1.0,1.0,0.4994,0.5020,12.80,10.06,25.40,23.76,0.1648,0.2180
5840,0.0,0.0,0.4058,0.4804,13.22,10.12,32.40,21.86,0.1756,0.2242
1010,1.0,1.0,0.4624,0.5638,13.18,14.40,32.68,24.86,0.2270,0.2304
16845,0.0,0.0,0.4708,0.4738,10.02,12.36,24.34,25.06,0.2616,0.2312


In [10]:
# Summary statistics for every column of the training split.
df_train.describe()

Unnamed: 0,Over/Under_open,Over/Under_close,eFg_rolling,eFg_opp_rolling,tov_rolling,tov_opp_rolling,orb_rolling,orb_opp_rolling,ft_fga_rolling,ft_fga_opp_rolling
count,21955.0,21955.0,21955.0,21955.0,21955.0,21955.0,21955.0,21955.0,21955.0,21955.0
mean,0.494512,0.493737,0.505249,0.504068,12.659002,12.668357,25.000869,24.968572,0.21799,0.2175
std,0.499981,0.499972,0.035335,0.033829,1.777912,1.82137,4.377192,3.987827,0.044492,0.043172
min,0.0,0.0,0.3712,0.37,6.08,6.0,9.94,10.0,0.0826,0.0896
25%,0.0,0.0,0.4811,0.481,11.44,11.42,22.0,22.24,0.1866,0.187
50%,0.0,0.0,0.5048,0.5036,12.64,12.6,24.9,24.94,0.215,0.2146
75%,1.0,1.0,0.5288,0.5267,13.84,13.88,27.92,27.62,0.246,0.2448
max,1.0,1.0,0.6462,0.6294,20.7,19.68,43.68,43.86,0.43,0.4326


As we can see, two of the Four Factors are in decimal form, and the other two are in percentage form. Let's put them on a common scale with scikit-learn's `StandardScaler`.

In [11]:
from sklearn.preprocessing import StandardScaler

In [13]:
# Instantiate scikit-learn's StandardScaler. It is only constructed here: no
# fit/transform call on it appears in the cells shown.
scalar = StandardScaler()

In [None]:
# Pair plots to find features where the classes show some separation.
# Starting with the Four Factors: effective field-goal percentage for the team
# and its opponent, coloured by the closing over/under outcome. Columns are
# picked by name so the plot does not depend on column order.
# NOTE(review): `size` in plot_kws is forwarded to seaborn's scatterplot, where
# it is the semantic size-grouping argument; if a smaller marker was intended,
# matplotlib's `s` is probably the right key — confirm.
sns.pairplot(df_train[['Over/Under_close', 'eFg_rolling', 'eFg_opp_rolling']],
             corner=True, height=1.5, plot_kws={'size': 3}, hue='Over/Under_close');

In [None]:
# Same pair plot for the turnover factor (team and opponent), coloured by the
# closing over/under outcome. Columns are picked by name so the plot does not
# depend on column order.
sns.pairplot(df_train[['Over/Under_close', 'tov_rolling', 'tov_opp_rolling']],
             corner=True, height=1.5, plot_kws={'size': 3}, hue='Over/Under_close');

## Simple Logistic Regression

Using average effective field goal percentage (eFg_per) of both teams versus the target.

In [None]:
import statsmodels.api as sm
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Boolean mask of rows whose opening over/under label is not 'P'
# (presumably a push — confirm). The filtered frame is only displayed here;
# df_train itself is left unchanged.
mask = df_train['Over/Under_open'].ne('P')
df_train.loc[mask]

In [None]:
# Single-column frame holding the closing over/under label of the training rows.
cat_X = df_train[['Over/Under_close']]

In [None]:
# Display the single-column label frame built in the previous cell.
cat_X

In [None]:
# One-hot encode the training label. drop='first' removes the first category,
# so a two-class label becomes a single indicator column.
ohe = OneHotEncoder(drop='first', sparse=False)

ohe_X = ohe.fit_transform(cat_X)

# Derive the feature-name prefix from the frame that was actually encoded, so
# the resulting column is labelled after 'Over/Under_close' rather than a
# hard-coded (and previously mismatched) 'Over/Under_open'.
columns = ohe.get_feature_names(list(cat_X.columns))

ohe_X_df = pd.DataFrame(ohe_X, columns=columns, index=cat_X.index)

ohe_X_df

In [None]:
# Single-column frame holding the closing over/under label of the test rows.
cat_test_X = df_test[['Over/Under_close']]

In [None]:
# Encode the test label with the encoder that was already fit on the training
# label (`ohe`), instead of fitting a second encoder on test data. This
# guarantees both splits share one category-to-column mapping, and an unseen
# test category raises instead of being silently re-mapped.
ohe_test_X = ohe.transform(cat_test_X)

# Prefix the output column with the name of the column that was encoded.
columns = ohe.get_feature_names(list(cat_test_X.columns))

ohe_test_X_df = pd.DataFrame(ohe_test_X, columns=columns, index=cat_test_X.index)

ohe_test_X_df

In [None]:
# This will help with plotting. Look up Pandas.Categorical for more methods ...
# Build a new frame via .assign rather than writing through `.loc[:, col] = ...`:
# on recent pandas the .loc form sets values in place and keeps the existing
# float dtype (so the category cast is lost), and it can also trigger
# SettingWithCopyWarning on a frame produced by a split.
df_train = df_train.assign(
    **{'Over/Under_open': df_train['Over/Under_open'].astype('category')}
)

In [None]:
# Display the opening over/under column (including its dtype) of the training split.
df_train['Over/Under_open']

In [None]:
# For this first example, we'll employ statsmodels.
# `avg_eFg_per` is not a column of the Four Factors training frame, so derive
# it here as the mean of the team's and the opponent's rolling effective
# field-goal percentage (the "average of both teams" described above).
X_train_eFg = (
    df_train[['eFg_rolling', 'eFg_opp_rolling']]
    .mean(axis=1)
    .to_frame('avg_eFg_per')
)
logit_model = sm.Logit(ohe_X_df,  # with statsmodels, `y` comes first
                       sm.add_constant(X_train_eFg))  # and then `x`, plus an intercept
lm_1 = logit_model.fit()

In [None]:
# Show the summary table of the fitted statsmodels result.
lm_1.summary()

In [None]:
# Let's go down the sklearn path, since this is better used in model pipelines
from sklearn.linear_model import LogisticRegression

In [None]:
# By default, LogisticRegression will fit an intercept, as we did with statsmodels
lm_1 = LogisticRegression(solver='newton-cg',  # For comparison, use the same solver as statsmodels default
                          C=100000)  # Very large C => effectively no regularization

# `avg_eFg_per` is not a column of the Four Factors training frame; derive it
# as the mean of the team's and the opponent's rolling effective field-goal
# percentage.
X_train_eFg = (
    df_train[['eFg_rolling', 'eFg_opp_rolling']]
    .mean(axis=1)
    .to_frame('avg_eFg_per')
)

# scikit-learn expects a 1-D target; flatten the single-column label frame so
# fit() does not have to coerce a column vector.
lm_1.fit(X_train_eFg, ohe_X_df.values.ravel())

In [None]:
# We can see that the coefficient in question matches the one from statsmodels.
# The label names the feature the model was fit on (the previous
# 'price_per_sqft' text did not correspond to any feature in this notebook).
print('intercept: ', round(lm_1.intercept_[0], 4))
print('avg_eFg_per coef: ', round(lm_1.coef_[0][0], 4))

In [None]:
# Score the fitted model on the held-out rows.
df_eval = df_test.copy()

# `avg_eFg_per` is not a column of the Four Factors test frame; derive it as
# the mean of the team's and the opponent's rolling effective field-goal
# percentage, under the same column name the model was fit on.
X_test_eFg = (
    df_test[['eFg_rolling', 'eFg_opp_rolling']]
    .mean(axis=1)
    .to_frame('avg_eFg_per')
)
predictions = pd.Series(lm_1.predict(X_test_eFg), index=df_test.index)

# Plain column assignment so the categorical dtype is actually kept
# (`.loc[:, col] = ...` sets values in place on recent pandas).
df_eval['pred'] = predictions.astype('category')

# Compare against the encoded label as a Series: comparing a Series with the
# one-column DataFrame would align on column labels instead of row by row.
actual = ohe_test_X_df.iloc[:, 0]
df_eval['correct_pred'] = predictions.eq(actual)