In [69]:
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
import sqlite3
from matplotlib import pyplot as plt
from datetime import date, datetime, timedelta
from collections import defaultdict
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [70]:
# Load the game-level table from disk. The CSV carries an 'Unnamed: 0'
# column (presumably a previously saved index -- TODO confirm) that is not
# needed, so it is discarded as part of the same expression.
df = pd.read_csv('data.csv').drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,SEASON,TEAM_ID_HOME,GAME_ID,GAME_DATE,FGM_HOME,FGA_HOME,FG_PCT_HOME,FG3M_HOME,FG3A_HOME,FG3_PCT_HOME,...,TS_PCT_AWAY,E_PACE_AWAY,PACE_AWAY,PACE_PER40_AWAY,POSS_AWAY,PIE_AWAY,ELO_AWAY,RECENT_WIN_PCT_AWAY,REST_DAYS_AWAY,GAME_SPREAD
0,2000-01,1610612748,20000711,2001-02-13,32.566627,75.379888,0.431046,5.985385,16.841769,0.359586,...,0.499858,91.842791,90.24111,75.200835,91.17035,0.459733,1208.524359,0.3,3.0,2.0
1,2000-01,1610612757,20000721,2001-02-13,35.962227,77.211776,0.466868,4.220501,12.052706,0.353236,...,0.532081,94.340068,93.00216,77.501961,93.457272,0.539347,1385.270634,1.0,3.0,21.0
2,2000-01,1610612765,20000726,2001-02-14,35.465604,82.914284,0.430438,4.617979,12.315828,0.376773,...,0.500503,94.213314,93.28062,77.734403,93.687134,0.437982,1162.723408,0.3,1.0,19.0
3,2000-01,1610612761,20000729,2001-02-15,37.073126,84.315275,0.441021,5.20529,13.681417,0.375542,...,0.52414,89.148013,88.330002,73.608042,90.025072,0.53986,1397.788078,0.8,2.0,-7.0
4,2000-01,1610612760,20000734,2001-02-15,36.650991,80.249629,0.457681,5.45675,13.887473,0.404631,...,0.517887,95.509993,94.14342,78.45276,94.815955,0.483302,1242.003286,0.7,3.0,-20.0


In [71]:
# List the distinct season labels present in the data; the train/test split
# in the following cell is chosen by season label.
df['SEASON'].unique()

array(['2000-01', '2001-02', '2002-03', '2003-04', '2004-05', '2005-06',
       '2006-07', '2007-08', '2008-09', '2009-10', '2010-11', '2011-12',
       '2012-13', '2013-14', '2014-15', '2015-16', '2016-17', '2017-18',
       '2018-19', '2019-20', '2020-21', '2021-22'], dtype=object)

In [72]:
# Hold out the listed seasons as the test set; every other season goes to
# the training set. Both frames are independent copies of the matching rows.
test_seasons = ['2017-18', '2018-19', '2019-20', '2020-21', '2021-22']

# Boolean mask computed once and reused for both selections.
in_test = df['SEASON'].isin(test_seasons)
test_df = df.loc[in_test].copy()
train_df = df.loc[~in_test].copy()

In [73]:
# Show the column labels of the training frame (useful for picking which
# non-feature columns to drop next).
train_df.columns

Index(['SEASON', 'TEAM_ID_HOME', 'GAME_ID', 'GAME_DATE', 'FGM_HOME',
       'FGA_HOME', 'FG_PCT_HOME', 'FG3M_HOME', 'FG3A_HOME', 'FG3_PCT_HOME',
       'FTM_HOME', 'FTA_HOME', 'FT_PCT_HOME', 'OREB_HOME', 'DREB_HOME',
       'REB_HOME', 'AST_HOME', 'STL_HOME', 'BLK_HOME', 'TOV_HOME', 'PF_HOME',
       'PTS_HOME', 'PLUS_MINUS_HOME', 'E_OFF_RATING_HOME', 'OFF_RATING_HOME',
       'E_DEF_RATING_HOME', 'DEF_RATING_HOME', 'E_NET_RATING_HOME',
       'NET_RATING_HOME', 'AST_PCT_HOME', 'AST_TOV_HOME', 'AST_RATIO_HOME',
       'OREB_PCT_HOME', 'DREB_PCT_HOME', 'REB_PCT_HOME', 'E_TM_TOV_PCT_HOME',
       'TM_TOV_PCT_HOME', 'EFG_PCT_HOME', 'TS_PCT_HOME', 'E_PACE_HOME',
       'PACE_HOME', 'PACE_PER40_HOME', 'POSS_HOME', 'PIE_HOME', 'ELO_HOME',
       'RECENT_WIN_PCT_HOME', 'REST_DAYS_HOME', 'TEAM_ID_AWAY', 'FGM_AWAY',
       'FGA_AWAY', 'FG_PCT_AWAY', 'FG3M_AWAY', 'FG3A_AWAY', 'FG3_PCT_AWAY',
       'FTM_AWAY', 'FTA_AWAY', 'FT_PCT_AWAY', 'OREB_AWAY', 'DREB_AWAY',
       'REB_AWAY', 'AST_AWAY', 'S

In [74]:
# Drop the columns that are not used downstream (season label, team/game
# identifiers and the game date), then renumber the rows from 0 so both
# frames have a clean positional index.
id_columns = ['SEASON', 'TEAM_ID_HOME', 'GAME_ID', 'GAME_DATE', 'TEAM_ID_AWAY']

train_df = train_df.drop(columns=id_columns).reset_index(drop=True)
test_df = test_df.drop(columns=id_columns).reset_index(drop=True)

In [75]:
# Preview the first rows of the test frame after the identifier columns were
# dropped and the index was reset.
test_df.head()

Unnamed: 0,FGM_HOME,FGA_HOME,FG_PCT_HOME,FG3M_HOME,FG3A_HOME,FG3_PCT_HOME,FTM_HOME,FTA_HOME,FT_PCT_HOME,OREB_HOME,...,TS_PCT_AWAY,E_PACE_AWAY,PACE_AWAY,PACE_PER40_AWAY,POSS_AWAY,PIE_AWAY,ELO_AWAY,RECENT_WIN_PCT_AWAY,REST_DAYS_AWAY,GAME_SPREAD
0,40.96511,83.948117,0.489223,13.676423,34.233172,0.398315,18.124519,23.845221,0.761012,8.950815,...,0.566162,98.244215,96.320377,80.267569,96.666855,0.492077,1540.735485,0.3,3.0,3.0
1,42.786234,86.941411,0.493566,12.430904,32.432811,0.384523,18.696132,23.329725,0.80443,9.806858,...,0.568099,102.025979,100.164395,83.470165,100.703324,0.487401,1596.369295,0.5,3.0,-1.0
2,40.381062,91.404003,0.44262,7.880397,24.430698,0.318337,13.335979,19.243777,0.706701,12.457688,...,0.552015,96.343427,94.650879,78.875791,95.251881,0.490943,1473.114932,0.4,3.0,12.0
3,39.926003,85.017516,0.470463,9.567652,24.508514,0.387135,16.889065,21.116228,0.796873,10.285383,...,0.550202,103.275096,101.70762,84.756484,101.841005,0.478798,1357.06919,0.5,3.0,9.0
4,38.87691,88.284417,0.440425,8.685238,26.505495,0.326234,17.322239,21.988664,0.786402,9.806072,...,0.560482,97.906908,96.011883,80.009967,96.494008,0.517589,1557.174778,0.6,3.0,7.0


In [76]:
# Separate the target variable (GAME_SPREAD) from the remaining columns,
# which become the feature matrix. Copies are taken so later edits to the
# X_* / y_* objects do not touch train_df / test_df.
target_col = 'GAME_SPREAD'

X_train, y_train = (
    train_df.drop(columns=[target_col]).copy(),
    train_df[target_col].copy(),
)
X_test, y_test = (
    test_df.drop(columns=[target_col]).copy(),
    test_df[target_col].copy(),
)

In [77]:
# Standardize the features. The scaler is fitted on the training features
# only; the same fitted scaler is then applied to both the training and the
# test features, writing the scaled values back into the existing frames.
sclr = StandardScaler().fit(X_train)
for features in (X_train, X_test):
    features[features.columns.tolist()] = sclr.transform(features)

In [81]:
# Summary statistics of the scaled test features. In the recorded output
# several column means sit well away from 0 (and stds away from 1): the
# scaler was fitted on the training seasons only, so the held-out seasons
# are not centred by construction.
X_test.describe()

Unnamed: 0,FGM_HOME,FGA_HOME,FG_PCT_HOME,FG3M_HOME,FG3A_HOME,FG3_PCT_HOME,FTM_HOME,FTA_HOME,FT_PCT_HOME,OREB_HOME,...,EFG_PCT_AWAY,TS_PCT_AWAY,E_PACE_AWAY,PACE_AWAY,PACE_PER40_AWAY,POSS_AWAY,PIE_AWAY,ELO_AWAY,RECENT_WIN_PCT_AWAY,REST_DAYS_AWAY
count,6248.0,6248.0,6248.0,6248.0,6248.0,6248.0,6248.0,6248.0,6248.0,6248.0,...,6248.0,6248.0,6248.0,6248.0,6248.0,6248.0,6248.0,6248.0,6248.0,6248.0
mean,1.919925,2.032254,0.539805,2.631086,2.822359,0.226679,-0.54183,-0.722783,0.51248,-0.868998,...,1.563162,1.332381,2.198256,2.158062,2.158072,2.115369,-0.002797,0.319169,-0.002426,0.178764
std,0.907403,0.797106,0.923682,0.918641,0.92463,0.694973,0.827781,0.804391,0.969006,0.800916,...,0.84297,0.894437,0.812609,0.807071,0.807067,0.829004,0.825503,0.902249,0.980427,0.904723
min,-0.916815,-0.62629,-2.248546,0.001244,0.087278,-1.894518,-3.139963,-3.262962,-2.994884,-3.041267,...,-0.729819,-1.245666,-0.640929,-0.72654,-0.726594,-0.734422,-2.481531,-2.041655,-2.431727,-1.27152
25%,1.252303,1.517628,-0.138375,2.015465,2.218816,-0.240336,-1.103121,-1.290974,-0.150974,-1.404286,...,0.963716,0.68338,1.614939,1.593495,1.593518,1.548767,-0.524996,-0.299915,-0.496463,0.159525
50%,1.925031,2.034369,0.54267,2.578,2.79529,0.201432,-0.598741,-0.771335,0.565519,-0.896128,...,1.554758,1.317419,2.152052,2.110547,2.110625,2.05381,0.049565,0.384736,-0.012647,0.159525
75%,2.554178,2.572835,1.190459,3.21679,3.413197,0.666038,-0.029038,-0.194522,1.187983,-0.369619,...,2.127691,1.949902,2.743684,2.675309,2.675399,2.661745,0.570123,0.991004,0.471169,0.159525
max,4.491984,4.677838,3.845914,5.708858,5.937075,2.902231,2.236073,1.799964,3.447578,2.944122,...,4.175395,3.958179,4.549089,4.471545,4.471346,4.538606,2.413738,2.65862,2.406433,1.59057
