In [17]:
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
import sqlite3
from matplotlib import pyplot as plt
from datetime import datetime, timedelta
from collections import defaultdict
import seaborn as sns

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

In [83]:
df = pd.read_csv('moving_average_df.csv')

In [84]:
#pull spreads data from test set for model evaluation
test_seasons = ['2021-22', '2020-21', '2019-20', '2018-19', '2017-18']
test_spreads_df = df[df['SEASON'].isin(test_seasons)][['GAME_ID','GAME_DATE','TEAM_ID_HOME','TEAM_CITY_HOME','SPREAD']]
test_spreads_df.head()

Unnamed: 0,GAME_ID,GAME_DATE,TEAM_ID_HOME,TEAM_CITY_HOME,SPREAD
0,42100404,2022-06-10,1610612738,Boston,-10
1,42100403,2022-06-08,1610612738,Boston,16
2,42100402,2022-06-05,1610612744,Golden State,19
3,42100401,2022-06-02,1610612744,Golden State,-12
4,42100307,2022-05-29,1610612748,Miami,-4


In [85]:
df.drop(columns=['Unnamed: 0',
                 'TEAM_ID_HOME',
                 'GAME_ID',
                 'GAME_DATE',
                 'TEAM_CITY_HOME',
                 'TEAM_ID_AWAY',
                 'TEAM_CITY_AWAY'], inplace=True)
df.head()

Unnamed: 0,SEASON,WL_HOME,FGM_HOME,FGA_HOME,FG_PCT_HOME,FG3M_HOME,FG3A_HOME,FG3_PCT_HOME,FTM_HOME,FTA_HOME,...,EFG_PCT_AWAY,TS_PCT_AWAY,USG_PCT_AWAY,E_USG_PCT_AWAY,E_PACE_AWAY,PACE_AWAY,PACE_PER40_AWAY,POSS_AWAY,PIE_AWAY,SPREAD
0,2021-22,0.65,37.0,81.65,0.45365,13.95,37.4,0.36795,18.9,23.5,...,0.5791,0.6056,1.0,0.1974,99.712,97.8,81.5,97.9,0.53445,-10
1,2021-22,0.65,36.95,81.65,0.4531,13.9,37.3,0.3676,19.0,23.5,...,0.57405,0.6022,1.0,0.1976,99.768,97.95,81.625,98.05,0.54245,16
2,2021-22,0.75,42.45,85.95,0.49605,13.9,35.6,0.39245,16.0,20.9,...,0.5461,0.5895,1.0,0.1988,98.211,96.125,80.1045,96.0,0.5505,19
3,2021-22,0.8,42.55,85.5,0.49985,13.75,35.05,0.3949,16.0,20.9,...,0.5454,0.5886,1.0,0.19885,98.51,96.625,80.521,96.5,0.54605,-12
4,2021-22,0.65,38.75,84.9,0.45885,12.15,35.7,0.34025,17.75,21.85,...,0.55085,0.5924,1.0,0.19855,98.577,96.65,80.542,96.5,0.55085,-4


In [86]:
#reserve last 5 seasons as test data
train_df = df[df['SEASON'].isin(test_seasons)==False]
test_df = df[df['SEASON'].isin(test_seasons)]

X_train = train_df.drop(columns=['SPREAD','SEASON'])
y_train = train_df['SPREAD']
X_test = test_df.drop(columns=['SPREAD', 'SEASON'])
y_test = test_df['SPREAD']

In [93]:
#Random Forrest implementation

#Baseline RF
rf = RandomForestRegressor()
#rf.fit(X_train, y_train)
#rf_baseline_score = rf.score(X_test, y_test)
#print('Random Forest baseline COD: ', rf_baseline_score)
rf_baseline_score = cross_val_score(rf, X_train, y_train, cv=5)
print('Random Forest baseline scores: ', rf_baseline_score)
print('Random Forest baseline avg. score: ', rf_baseline_score.mean())

Random Forest baseline scores:  [0.12432072 0.1421674  0.15611194 0.09194101 0.09459742]
Random Forest baseline avg. score:  0.12182769883607354


In [14]:
#Tuned RF
#create hyperparameter grid
rf = RandomForestRegressor()
n_estimators = [int(x) for x inn np.linspace(100, 1000, num=10)]
max_features =['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 50, num=10)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 5]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100, cv=3, verbose=2, random_state=31, n_jobs=-1)

rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


KeyboardInterrupt: 

In [87]:
spreads_df = pd.read_csv('spreads_df.csv')
spreads_df.drop(columns=['Unnamed: 0'], inplace=True)
spreads_df.head()

Unnamed: 0,GM_DATE,HOME_TEAM,SPREAD_1,ODDS_1,SPREAD_2,ODDS_2,SPREAD_3,ODDS_3,SPREAD_4,ODDS_4
0,2021-10-19,Milwaukee,+1½,-110,+2,-113,+2,-113,+2½,-135
1,2021-10-19,L.A. Lakers,+3,-110,+3,-112,+3,-112,+4,-135
2,2021-10-20,Charlotte,-1,100,+½,-110,+1½,-110,-1½,-110
3,2021-10-20,Detroit,-5,-110,-5,-112,-5,-112,-5,-110
4,2021-10-20,Toronto,+2½,-110,+2½,-109,+3,-109,+4½,-160


In [88]:
test_spreads_df['TEAM_CITY_HOME'].unique()

array(['Boston', 'Golden State', 'Miami', 'Dallas', 'Phoenix',
       'Milwaukee', 'Philadelphia', 'Memphis', 'Minnesota', 'Utah',
       'New Orleans', 'Toronto', 'Brooklyn', 'Atlanta', 'Denver',
       'Chicago', 'Houston', 'LA', 'Portland', 'Cleveland', 'Charlotte',
       'Orlando', 'New York', 'San Antonio', 'Washington', 'Los Angeles',
       'Detroit', 'Sacramento', 'Oklahoma City', 'Indiana'], dtype=object)

In [89]:
spreads_df['HOME_TEAM'].unique()

array(['Milwaukee', 'L.A. Lakers', 'Charlotte', 'Detroit', 'Toronto',
       'New York', 'Memphis', 'New Orleans', 'Minnesota', 'San Antonio',
       'Utah', 'Portland', 'Phoenix', 'Atlanta', 'Miami', 'Golden State',
       'Philadelphia', 'Cleveland', 'Indiana', 'Chicago', 'L.A. Clippers',
       'Brooklyn', 'Houston', 'Oklahoma City', 'Sacramento', 'Denver',
       'Dallas', 'Boston', 'Orlando', 'Washington'], dtype=object)

In [90]:
test_spreads_df['TEAM_CITY_HOME'] = test_spreads_df['TEAM_CITY_HOME'].map({'LA':'L.A. Clippers', 
                                                                           'Los Angeles':'L.A. Lakers',
                                                                           'Boston': 'Boston',
                                                                           'Golden State': 'Golden State',
                                                                           'Miami': 'Miami',
                                                                           'Dallas':'Dallas',
                                                                           'Phoenix':'Phoenix',
                                                                           'Milwaukee':'Milwaukee',
                                                                           'Philadelphia':'Philadelphia',
                                                                           'Memphis':'Memphis',
                                                                           'Minnesota':'Minnesota',
                                                                           'Utah':'Utah',
                                                                           'New Orleans':'New Orleans',
                                                                           'Toronto':'Toronto',
                                                                           'Brooklyn':'Brooklyn',
                                                                           'Atlanta':'Atlanta',
                                                                           'Denver':'Denver',
                                                                           'Chicago':'Chicago',
                                                                           'Houston':'Houston',
                                                                           'Portland':'Portland',
                                                                           'Cleveland':'Cleveland',
                                                                           'Charlotte':'Charlotte',
                                                                           'Orlando':'Orlando',
                                                                           'New York':'New York',
                                                                           'San Antonio':'San Antonio',
                                                                           'Washington':'Washington',
                                                                           'Detroit':'Detroit',
                                                                           'Sacramento':'Sacramento',
                                                                           'Oklahoma City':'Oklahoma City',
                                                                           'Indiana':'Indiana'
                                                                          })
test_spreads_df.rename(columns={'TEAM_CITY_HOME':'HOME_TEAM', 'GAME_DATE':'GM_DATE'}, inplace=True)
test_spreads_df = test_spreads_df.merge(spreads_df, how='inner', on=['HOME_TEAM','GM_DATE'])
test_spreads_df.head()

Unnamed: 0,GAME_ID,GM_DATE,TEAM_ID_HOME,HOME_TEAM,SPREAD,SPREAD_1,ODDS_1,SPREAD_2,ODDS_2,SPREAD_3,ODDS_3,SPREAD_4,ODDS_4
0,42100404,2022-06-10,1610612738,Boston,-10,+4,-110,+4,-113,,-,+3½,-111
1,42100403,2022-06-08,1610612738,Boston,16,+3½,-110,+3½,-113,,-,+3½,-125
2,42100402,2022-06-05,1610612744,Golden State,19,+5,-110,+5,-113,,-,+4½,-118
3,42100401,2022-06-02,1610612744,Golden State,-12,+3½,-110,+4,-114,,-,+3½,-111
4,42100307,2022-05-29,1610612748,Miami,-4,-3,-110,-3,-112,,-,-2½,-125


In [92]:
test_spreads_df['HOME_TEAM'].unique()

array(['Boston', 'Golden State', 'Miami', 'Dallas', 'Phoenix',
       'Milwaukee', 'Philadelphia', 'Memphis', 'Minnesota', 'Utah',
       'New Orleans', 'Toronto', 'Brooklyn', 'Atlanta', 'Denver',
       'Chicago', 'Houston', 'L.A. Clippers', 'Portland', 'Cleveland',
       'Charlotte', 'Orlando', 'New York', 'San Antonio', 'Washington',
       'L.A. Lakers', 'Detroit', 'Sacramento', 'Oklahoma City', 'Indiana'],
      dtype=object)

In [94]:
rf.fit(X_train, y_train)

RandomForestRegressor()

In [95]:
spread_preds = rf.predict(X_test)
spread_preds

array([ 0.  , -1.26,  4.04, ..., -0.15,  4.6 , 18.24])

In [96]:
test_spreads_df['SPREAD_PREDS'] = spread_preds
test_spreads_df.head()

ValueError: Length of values (6246) does not match length of index (8683)

In [None]:
1610612747 lakers
1610612746 clippers