In [49]:
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
import sqlite3
from matplotlib import pyplot as plt
from datetime import datetime, timedelta
from collections import defaultdict
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

In [40]:
# Load the precomputed moving-average table from disk into the working frame.
MOVING_AVERAGE_CSV = 'moving_average_df.csv'
df = pd.read_csv(MOVING_AVERAGE_CSV)

In [41]:
# Spot-check a single game: filter once, then print the three columns of interest.
game_rows = df[df['GAME_ID'] == 42100404]
for column in ('PTS_HOME', 'PTS_AWAY', 'SPREAD'):
    print(game_rows[column])

0    106.85
Name: PTS_HOME, dtype: float64
0    113.75
Name: PTS_AWAY, dtype: float64
0   -10
Name: SPREAD, dtype: int64


In [42]:
# Remove identifier / metadata columns so they are not fed to the model.
#
# The drop is done by reassignment instead of inplace=True, and with
# errors='ignore', so the cell is idempotent: running it a second time on the
# already-trimmed frame no longer raises KeyError.
# NOTE: errors='ignore' also silences a misspelled column name, so keep this
# list in sync with the CSV header.
NON_FEATURE_COLUMNS = [
    'Unnamed: 0',
    'TEAM_ID_HOME',
    'GAME_ID',
    'GAME_DATE',
    'TEAM_CITY_HOME',
    'TEAM_ID_AWAY',
    'TEAM_CITY_AWAY',
]
df = df.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')
df.head()

Unnamed: 0,SEASON,WL_HOME,FGM_HOME,FGA_HOME,FG_PCT_HOME,FG3M_HOME,FG3A_HOME,FG3_PCT_HOME,FTM_HOME,FTA_HOME,...,EFG_PCT_AWAY,TS_PCT_AWAY,USG_PCT_AWAY,E_USG_PCT_AWAY,E_PACE_AWAY,PACE_AWAY,PACE_PER40_AWAY,POSS_AWAY,PIE_AWAY,SPREAD
0,2021-22,0.65,37.0,81.65,0.45365,13.95,37.4,0.36795,18.9,23.5,...,0.5791,0.6056,1.0,0.1974,99.712,97.8,81.5,97.9,0.53445,-10
1,2021-22,0.65,36.95,81.65,0.4531,13.9,37.3,0.3676,19.0,23.5,...,0.57405,0.6022,1.0,0.1976,99.768,97.95,81.625,98.05,0.54245,16
2,2021-22,0.75,42.45,85.95,0.49605,13.9,35.6,0.39245,16.0,20.9,...,0.5461,0.5895,1.0,0.1988,98.211,96.125,80.1045,96.0,0.5505,19
3,2021-22,0.8,42.55,85.5,0.49985,13.75,35.05,0.3949,16.0,20.9,...,0.5454,0.5886,1.0,0.19885,98.51,96.625,80.521,96.5,0.54605,-12
4,2021-22,0.65,38.75,84.9,0.45885,12.15,35.7,0.34025,17.75,21.85,...,0.55085,0.5924,1.0,0.19855,98.577,96.65,80.542,96.5,0.55085,-4


In [43]:
# Display the column labels currently present in df.
df.columns

Index(['SEASON', 'WL_HOME', 'FGM_HOME', 'FGA_HOME', 'FG_PCT_HOME', 'FG3M_HOME',
       'FG3A_HOME', 'FG3_PCT_HOME', 'FTM_HOME', 'FTA_HOME', 'FT_PCT_HOME',
       'OREB_HOME', 'DREB_HOME', 'REB_HOME', 'AST_HOME', 'STL_HOME',
       'BLK_HOME', 'TOV_HOME', 'PF_HOME', 'PTS_HOME', 'PLUS_MINUS_HOME',
       'E_OFF_RATING_HOME', 'OFF_RATING_HOME', 'E_DEF_RATING_HOME',
       'DEF_RATING_HOME', 'E_NET_RATING_HOME', 'NET_RATING_HOME',
       'AST_PCT_HOME', 'AST_TOV_HOME', 'AST_RATIO_HOME', 'OREB_PCT_HOME',
       'DREB_PCT_HOME', 'REB_PCT_HOME', 'E_TM_TOV_PCT_HOME', 'TM_TOV_PCT_HOME',
       'EFG_PCT_HOME', 'TS_PCT_HOME', 'USG_PCT_HOME', 'E_USG_PCT_HOME',
       'E_PACE_HOME', 'PACE_HOME', 'PACE_PER40_HOME', 'POSS_HOME', 'PIE_HOME',
       'WL_AWAY', 'FGM_AWAY', 'FGA_AWAY', 'FG_PCT_AWAY', 'FG3M_AWAY',
       'FG3A_AWAY', 'FG3_PCT_AWAY', 'FTM_AWAY', 'FTA_AWAY', 'FT_PCT_AWAY',
       'OREB_AWAY', 'DREB_AWAY', 'REB_AWAY', 'AST_AWAY', 'STL_AWAY',
       'BLK_AWAY', 'TOV_AWAY', 'PF_AWAY', 'PTS_AWA

In [44]:
# Reserve the last 5 seasons as test data; every other season is training data.
test_seasons = ['2021-22', '2020-21', '2019-20', '2018-19', '2017-18']

# Compute the membership mask once and negate it with ~ (instead of `== False`)
# so that train and test are guaranteed to be exact complements.
is_test_season = df['SEASON'].isin(test_seasons)
train_df = df[~is_test_season]
test_df = df[is_test_season]

# SPREAD is the regression target; SEASON is only used for the split above,
# so both are excluded from the feature matrices.
non_feature_columns = ['SPREAD', 'SEASON']
X_train = train_df.drop(columns=non_feature_columns)
y_train = train_df['SPREAD']
X_test = test_df.drop(columns=non_feature_columns)
y_test = test_df['SPREAD']

In [45]:
# Random Forest implementation

# Baseline RF: default hyperparameters, fitted on the training split and
# evaluated on the held-out test split.
# random_state is pinned so that the bootstrap samples / feature subsets (and
# therefore the score and feature importances) are the same on every run.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# For regressors, .score() returns R^2, the coefficient of determination ("COD").
rf_baseline_score = rf.score(X_test, y_test)
print('Random Forest baseline COD: ', rf_baseline_score)

Random Forest baseline COD:  0.09459478442637159


In [50]:
# 5-fold cross-validation of the baseline forest on the training data only.
# The per-fold scores get their own name so that rf_baseline_score (the scalar
# test-set score computed for the baseline model) is not silently replaced by
# an array when this cell is run.
rf_baseline_cv_scores = cross_val_score(rf, X_train, y_train, cv=5)
print('Random Forest baseline CV score: ', rf_baseline_cv_scores)

Random Forest baseline CV score:  [0.125036   0.14124014 0.15217001 0.09404082 0.09672682]


In [None]:
# Tuned RF (TODO: this cell is an empty placeholder)

In [47]:
# Print the fitted forest's importance score for every feature, labelled by
# the feature's positional index.
importances = rf.feature_importances_
for position in range(len(importances)):
    print('Feature %0i Score: %0.3f' % (position, importances[position]))

Feature 0 Score: 0.006
Feature 1 Score: 0.009
Feature 2 Score: 0.010
Feature 3 Score: 0.011
Feature 4 Score: 0.010
Feature 5 Score: 0.011
Feature 6 Score: 0.015
Feature 7 Score: 0.011
Feature 8 Score: 0.012
Feature 9 Score: 0.016
Feature 10 Score: 0.010
Feature 11 Score: 0.011
Feature 12 Score: 0.011
Feature 13 Score: 0.010
Feature 14 Score: 0.014
Feature 15 Score: 0.014
Feature 16 Score: 0.010
Feature 17 Score: 0.014
Feature 18 Score: 0.007
Feature 19 Score: 0.026
Feature 20 Score: 0.007
Feature 21 Score: 0.008
Feature 22 Score: 0.008
Feature 23 Score: 0.008
Feature 24 Score: 0.036
Feature 25 Score: 0.024
Feature 26 Score: 0.012
Feature 27 Score: 0.012
Feature 28 Score: 0.010
Feature 29 Score: 0.012
Feature 30 Score: 0.014
Feature 31 Score: 0.013
Feature 32 Score: 0.008
Feature 33 Score: 0.007
Feature 34 Score: 0.009
Feature 35 Score: 0.009
Feature 36 Score: 0.000
Feature 37 Score: 0.014
Feature 38 Score: 0.008
Feature 39 Score: 0.004
Feature 40 Score: 0.004
Feature 41 Score: 0.007
Fe

In [33]:
# Look up the label of the column at position 18 in X_train. rf was fitted on
# X_train, so positions here line up with the indices of rf.feature_importances_.
X_train.columns[18]

'PTS_HOME'