In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale 
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error


from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.static import teams

# Gets the box score of every single game (NBA + WNBA + others: nba.com doesn't seperate the leagues) 
# from 2014-2021 into a dataframe.
all_games_finder = leaguegamefinder.LeagueGameFinder()
all_games = all_games_finder.get_data_frames()[0]

# Creates empty dataframe with the same column names passed in from the dataframe containing all the games.
column_names = all_games.columns
games = pd.DataFrame(columns = column_names)

# Appends every game containing an NBA team in the all_games df to the games df. 
nba_teams = teams.get_teams()
for team in nba_teams:
    temp_id = team['id']
    games = games.append(all_games[all_games['TEAM_ID'] == temp_id]).reset_index(drop = True)

pd.set_option('display.max_columns', None)
games.head(60)

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22021,1610612737,ATL,Atlanta Hawks,22100319,2021-12-01,ATL @ IND,W,240,114,44,86,0.512,16,33,0.485,10,12,0.833,7,34,41,24,4,8,13,15,3.0
1,22021,1610612737,ATL,Atlanta Hawks,22100293,2021-11-27,ATL vs. NYK,L,240,90,33,93,0.355,9,37,0.243,15,20,0.75,13,39,52,18,8,6,6,17,-9.0
2,22021,1610612737,ATL,Atlanta Hawks,22100285,2021-11-26,ATL @ MEM,W,239,132,52,89,0.584,13,27,0.481,15,21,0.714,9,40,49,33,8,5,12,15,32.0
3,22021,1610612737,ATL,Atlanta Hawks,22100277,2021-11-24,ATL @ SAS,W,239,124,45,88,0.511,12,26,0.462,22,24,0.917,8,36,44,26,10,5,9,11,18.0
4,22021,1610612737,ATL,Atlanta Hawks,22100255,2021-11-22,ATL vs. OKC,W,239,113,42,87,0.483,14,34,0.412,15,16,0.938,8,36,44,25,6,6,7,16,12.0
5,22021,1610612737,ATL,Atlanta Hawks,22100242,2021-11-20,ATL vs. CHA,W,241,115,43,82,0.524,12,34,0.353,17,21,0.81,8,38,46,24,6,6,12,22,10.0
6,22021,1610612737,ATL,Atlanta Hawks,22100215,2021-11-17,ATL vs. BOS,W,240,110,41,81,0.506,13,37,0.351,15,18,0.833,6,34,40,28,9,4,11,17,11.0
7,22021,1610612737,ATL,Atlanta Hawks,22100202,2021-11-15,ATL vs. ORL,W,239,129,47,85,0.553,14,30,0.467,21,32,0.656,9,37,46,32,10,7,11,17,18.0
8,22021,1610612737,ATL,Atlanta Hawks,22100193,2021-11-14,ATL vs. MIL,W,241,120,47,97,0.485,15,35,0.429,11,13,0.846,15,36,51,21,6,1,12,19,20.0
9,22021,1610612737,ATL,Atlanta Hawks,22100182,2021-11-12,ATL @ DEN,L,238,96,38,93,0.409,5,28,0.179,15,21,0.714,14,36,50,20,2,5,8,20,-9.0


In [2]:
# To-do: get rid of summer league/playoff games


# Dropping any game (two rows in DF) that has any NaN values or is missing either team's stats
games.isna()
games.dropna(inplace=True)

games = games[games.duplicated(subset = ['GAME_ID'], keep=False)]

# Merging games together (previously seperated in the DF by team: each team's stats from the game were kept in seperate rows
games = games.sort_values(by=['GAME_DATE'])
games = games.reset_index(drop=True)

# Team A and B each have a row for their stats in a given matchup; we need to add both stats to the end of their respective rows
# Team A dataframe
tempA = games[games.index % 2 == 0]
tempA2 = games[games.index % 2 == 1]

tempA2 = tempA2.add_prefix('OPP_')

tempA = tempA.reset_index(drop=True)
tempA2 = tempA2.reset_index(drop=True)

a_temp = tempA.join(tempA2)

# Team B dataframe
tempB = games[games.index % 2 == 0]
tempB2 = games[games.index % 2 == 1]

tempB = tempB.add_prefix('OPP_')

tempB = tempB.reset_index(drop=True)
tempB2 = tempB2.reset_index(drop=True)

b_temp = tempB2.join(tempB)

# Adding both teams to main dataframe
games = a_temp.append(b_temp)

# Resorting main dataframe
games = games.sort_values(by=['GAME_DATE'])
games = games.reset_index(drop=True)

# Sending data to CSV
games.to_csv('games.csv', index = False)



# Print Head
pd.set_option('display.max_columns', None)
games.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,OPP_SEASON_ID,OPP_TEAM_ID,OPP_TEAM_ABBREVIATION,OPP_TEAM_NAME,OPP_GAME_ID,OPP_GAME_DATE,OPP_MATCHUP,OPP_WL,OPP_MIN,OPP_PTS,OPP_FGM,OPP_FGA,OPP_FG_PCT,OPP_FG3M,OPP_FG3A,OPP_FG3_PCT,OPP_FTM,OPP_FTA,OPP_FT_PCT,OPP_OREB,OPP_DREB,OPP_REB,OPP_AST,OPP_STL,OPP_BLK,OPP_TOV,OPP_PF,OPP_PLUS_MINUS
0,22015,1610612766,CHA,Charlotte Hornets,1421500004,2015-07-04,CHA @ OKC,L,199,74,26,71,0.366,9,30,0.3,13,17,0.765,13,22,35,17,6,2,13,18,-2.0,22015,1610612763,MEM,Memphis Grizzlies,1421500005,2015-07-04,MEM @ BKN,W,200,81,25,63,0.397,4,12,0.333,27,33,0.818,10,25,35,14,9,1,13,23,8.0
1,22015,1610612748,MIA,Miami Heat,1421500001,2015-07-04,MIA @ IND,W,202,92,30,66,0.455,7,18,0.389,25,32,0.781,12,19,31,20,11,6,15,24,16.0,22015,1610612751,BKN,Brooklyn Nets,1421500005,2015-07-04,BKN vs. MEM,L,201,73,29,68,0.426,6,24,0.25,9,17,0.529,10,26,36,15,6,4,19,27,-8.0
2,22015,1610612754,IND,Indiana Pacers,1421500001,2015-07-04,IND vs. MIA,L,201,76,29,64,0.453,9,18,0.5,9,11,0.818,10,22,32,21,6,4,25,28,-16.0,22015,1610612760,OKC,Oklahoma City Thunder,1421500004,2015-07-04,OKC vs. CHA,W,200,76,28,66,0.424,6,19,0.316,14,23,0.609,12,30,42,20,7,2,17,20,2.0
3,22015,1610612763,MEM,Memphis Grizzlies,1421500005,2015-07-04,MEM @ BKN,W,200,81,25,63,0.397,4,12,0.333,27,33,0.818,10,25,35,14,9,1,13,23,8.0,22015,1610612766,CHA,Charlotte Hornets,1421500004,2015-07-04,CHA @ OKC,L,199,74,26,71,0.366,9,30,0.3,13,17,0.765,13,22,35,17,6,2,13,18,-2.0
4,22015,1610612760,OKC,Oklahoma City Thunder,1421500004,2015-07-04,OKC vs. CHA,W,200,76,28,66,0.424,6,19,0.316,14,23,0.609,12,30,42,20,7,2,17,20,2.0,22015,1610612754,IND,Indiana Pacers,1421500001,2015-07-04,IND vs. MIA,L,201,76,29,64,0.453,9,18,0.5,9,11,0.818,10,22,32,21,6,4,25,28,-16.0


In [3]:
# Sorting the DF by teams
temp_games = pd.DataFrame()
nba_teams = teams.get_teams()
for team in nba_teams:
    temp_id = team['id']
    temp_games = temp_games.append(games[games['TEAM_ID'] == temp_id]).reset_index(drop=True)

games = temp_games

# Print Head
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
games.head(100)

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,OPP_SEASON_ID,OPP_TEAM_ID,OPP_TEAM_ABBREVIATION,OPP_TEAM_NAME,OPP_GAME_ID,OPP_GAME_DATE,OPP_MATCHUP,OPP_WL,OPP_MIN,OPP_PTS,OPP_FGM,OPP_FGA,OPP_FG_PCT,OPP_FG3M,OPP_FG3A,OPP_FG3_PCT,OPP_FTM,OPP_FTA,OPP_FT_PCT,OPP_OREB,OPP_DREB,OPP_REB,OPP_AST,OPP_STL,OPP_BLK,OPP_TOV,OPP_PF,OPP_PLUS_MINUS
0,22015,1610612737,ATL,Atlanta Hawks,1521500003,2015-07-10,ATL @ DEN,L,201,71,29,70,0.414,6,22,0.273,7,8,0.875,9,28,37,13,8,4,22,29,-15.0,22015,1610612739,CLE,Cleveland Cavaliers,1521500006,2015-07-10,CLE vs. GSW,L,200,75,25,63,0.397,6,22,0.273,19,23,0.826,9,25,34,11,4,7,16,39,-7.8
1,22015,1610612737,ATL,Atlanta Hawks,1521500020,2015-07-12,ATL vs. GSW,W,200,71,28,63,0.444,6,22,0.273,9,22,0.409,2,33,35,12,12,12,20,33,1.0,22015,1610612744,GSW,Golden State Warriors,1521500020,2015-07-12,GSW @ ATL,L,200,70,22,69,0.319,5,18,0.278,21,32,0.656,14,27,41,11,9,3,15,16,-1.0
2,22015,1610612737,ATL,Atlanta Hawks,1521500039,2015-07-15,ATL vs. MIA,W,200,75,28,65,0.431,3,14,0.214,16,18,0.889,12,32,44,17,7,7,20,28,11.0,22015,1610612741,CHI,Chicago Bulls,1521500043,2015-07-15,CHI @ CLE,W,200,91,36,74,0.486,8,16,0.5,11,12,0.917,9,32,41,22,6,4,14,20,18.0
3,22015,1610612737,ATL,Atlanta Hawks,1521500050,2015-07-16,ATL @ DEN,W,199,82,31,75,0.413,10,22,0.455,10,14,0.714,8,29,37,19,15,2,17,22,9.0,22015,1610612764,WAS,Washington Wizards,1521500049,2015-07-16,WAS @ NOP,L,199,81,33,75,0.44,4,19,0.211,11,13,0.846,11,25,36,18,11,3,18,19,-16.0
4,22015,1610612737,ATL,Atlanta Hawks,1521500062,2015-07-18,ATL vs. DAL,W,200,91,31,64,0.484,8,21,0.381,21,27,0.778,9,31,40,17,6,5,17,15,8.0,22015,1610612741,CHI,Chicago Bulls,1521500063,2015-07-18,CHI @ PHX,L,199,84,29,72,0.403,9,27,0.333,17,22,0.773,17,33,50,13,3,7,23,34,-7.0
5,22015,1610612737,ATL,Atlanta Hawks,1521500065,2015-07-19,ATL @ SAS,L,199,68,23,71,0.324,5,26,0.192,17,23,0.739,14,27,41,12,9,7,18,24,-7.0,22015,1610612756,PHX,Phoenix Suns,1521500066,2015-07-19,PHX @ NOP,W,199,93,37,72,0.514,11,27,0.407,8,10,0.8,6,30,36,20,8,2,17,21,6.0
6,12015,1610612737,ATL,Atlanta Hawks,11500017,2015-10-07,ATL @ CLE,W,238,98,33,75,0.44,7,24,0.292,25,30,0.833,9,37,46,17,8,2,17,19,2.0,12015,1610612747,LAL,Los Angeles Lakers,11500010,2015-10-07,LAL vs. UTA,L,265,114,36,83,0.434,7,26,0.269,35,46,0.761,14,27,41,16,9,4,19,31,-3.0
7,12015,1610612737,ATL,Atlanta Hawks,11500032,2015-10-09,ATL @ NOP,W,240,103,33,74,0.446,11,26,0.423,26,36,0.722,5,43,48,21,8,5,16,23,10.0,12015,1610612756,PHX,Phoenix Suns,11500035,2015-10-09,PHX vs. UTA,W,241,101,41,92,0.446,7,21,0.333,12,17,0.706,12,32,44,20,11,6,9,24,16.0
8,12015,1610612737,ATL,Atlanta Hawks,11500060,2015-10-14,ATL vs. SAS,W,240,100,32,78,0.41,11,31,0.355,25,29,0.862,3,45,48,24,10,4,16,26,14.0,12015,1610612738,BOS,Boston Celtics,11500059,2015-10-14,BOS @ BKN,W,239,109,40,85,0.471,13,26,0.5,16,22,0.727,7,33,40,28,12,9,14,26,4.0
9,12015,1610612737,ATL,Atlanta Hawks,11500068,2015-10-16,ATL @ DAL,W,241,91,31,84,0.369,9,28,0.321,20,24,0.833,9,40,49,18,9,7,15,14,7.0,12015,1610612738,BOS,Boston Celtics,11500067,2015-10-16,BOS @ NYK,L,239,95,33,83,0.398,8,21,0.381,21,30,0.7,11,46,57,20,4,7,18,21,-6.0


In [4]:
# Storing some columns for future use
game_ids = games['GAME_ID'].values
team_ids = games['TEAM_ID'].values
minutes = games['MIN'].values
abrv = games['TEAM_ABBREVIATION'].values
opp_abrv = games['OPP_TEAM_ABBREVIATION'].values
spread = games['PLUS_MINUS'].values

# Dropping Non-essential categorical data
games = games.drop(columns=['SEASON_ID', 'OPP_SEASON_ID', 'OPP_TEAM_ID','GAME_ID', 'OPP_GAME_ID', 'TEAM_ABBREVIATION', 'OPP_TEAM_ABBREVIATION', 'TEAM_NAME', 'OPP_TEAM_NAME', 'MATCHUP', 'OPP_MATCHUP', 'WL', 'OPP_WL', 'GAME_DATE', 'OPP_GAME_DATE', 'MIN', 'OPP_MIN']) 

# Adding minutes back as a single column
games['MIN'] = minutes

# Print Head
pd.set_option('display.max_columns', None)
games.head()

Unnamed: 0,TEAM_ID,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,OPP_PTS,OPP_FGM,OPP_FGA,OPP_FG_PCT,OPP_FG3M,OPP_FG3A,OPP_FG3_PCT,OPP_FTM,OPP_FTA,OPP_FT_PCT,OPP_OREB,OPP_DREB,OPP_REB,OPP_AST,OPP_STL,OPP_BLK,OPP_TOV,OPP_PF,OPP_PLUS_MINUS,MIN
0,1610612737,71,29,70,0.414,6,22,0.273,7,8,0.875,9,28,37,13,8,4,22,29,-15.0,75,25,63,0.397,6,22,0.273,19,23,0.826,9,25,34,11,4,7,16,39,-7.8,201
1,1610612737,71,28,63,0.444,6,22,0.273,9,22,0.409,2,33,35,12,12,12,20,33,1.0,70,22,69,0.319,5,18,0.278,21,32,0.656,14,27,41,11,9,3,15,16,-1.0,200
2,1610612737,75,28,65,0.431,3,14,0.214,16,18,0.889,12,32,44,17,7,7,20,28,11.0,91,36,74,0.486,8,16,0.5,11,12,0.917,9,32,41,22,6,4,14,20,18.0,200
3,1610612737,82,31,75,0.413,10,22,0.455,10,14,0.714,8,29,37,19,15,2,17,22,9.0,81,33,75,0.44,4,19,0.211,11,13,0.846,11,25,36,18,11,3,18,19,-16.0,199
4,1610612737,91,31,64,0.484,8,21,0.381,21,27,0.778,9,31,40,17,6,5,17,15,8.0,84,29,72,0.403,9,27,0.333,17,22,0.773,17,33,50,13,3,7,23,34,-7.0,200


In [5]:
# Adding Advanced Stats to enhance model performance, formulas were gathered from various sources. 

# Efficient Field Goal Percentage
games['EFG%'] = (games['FGM'] + (.5 * games['FG3M'])) / games['FGA']
games['OPP_EFG%'] = (games['OPP_FGM'] + (.5 * games['OPP_FG3M'])) / games['OPP_FGA']

# Block Percentage
games['BLK%'] = (games['BLK'] / (games['OPP_FGA']-games['OPP_FG3A']))
games['OPP_BLK%'] = (games['OPP_BLK'] / (games['FGA']-games['FG3A']))

# Turnover Percentage
games['TOV%'] = games['TOV'] / (games['FGA'] + 0.44 * games['FTA'] + games['TOV'])
games['OPP_TOV%'] = games['OPP_TOV'] / (games['OPP_FGA'] + 0.44 * games['OPP_FTA'] + games['OPP_TOV'])

#Offensive Rebound Percentage
games['ORB%'] = games['OREB'] / (games['OREB'] + games['OPP_DREB'])
games['OPP_ORB%'] = games['OPP_OREB'] / (games['OPP_OREB'] + games['DREB'])

#Defensive Rebound Percentage
games['DREB%'] = games['DREB'] / (games['OPP_OREB'] + games['DREB'])
games['OPP_DREB%'] = games['OPP_DREB'] / (games['OREB'] + games['OPP_DREB'])

# Possessions
#games["POSS"] = 0.5*((games["FGA"] + 0.4*games["FTA"] - 1.07*(games["OREB"]/(games["OREB"] + games["OREB_other"])) * (games["FGA"] - games["FGM"]) + games["TOV"]) + games["FGA_other"] + 0.4*games["FTA_other"] - 1.07 * (games["OREB_other"] / (games["OREB_other"] + games["DREB"])) * (games["FGA_other"] - games["FGM_other]) + games["TOV_other"]))

# ***Placeholder POSS***
games['POSS'] = 0.96*((games['FGA']) + games['TOV'] + 0.44 * games['FTA'] - games['OREB'])
games['OPP_POSS'] = 0.96*((games['OPP_FGA']) + games['OPP_TOV'] + 0.44 * games['OPP_FTA'] - games['OPP_OREB'])

# Steals Percentage
games['STL%'] = (games['STL'] / games['OPP_POSS'])
games['OPP_STL%'] = (games['OPP_STL'] / games['POSS'])

# Free Throw Rate
games['FTR'] = games['FTM'] / games['FGA']
games['OPP_FTR'] = games['OPP_FTM'] / games['OPP_FGA']

# True Shooting (Requires True Shooting Attempts)
tsa = games['FGA'] + 0.44 * games['FTA']
OPP_tsa = games['OPP_FGA'] + 0.44 * games['OPP_FTA']
games['TS'] = games['PTS'] / (2 * tsa)
games['OPP_TS'] = games['OPP_PTS'] / (2 * OPP_tsa)

# Assist Rate
games['ASTR'] = games['AST'] / (games['FGA'] + (.44 * games['FTA']) + games['AST'] + games['TOV'])
games['OPP_ASTR'] = games['OPP_AST'] / (games['OPP_FGA'] + (.44 * games['OPP_FTA']) + games['OPP_AST'] + games['OPP_TOV'])

# Total Rebound Percentage
games['TRB%'] = (games['REB'] * (games['REB'] / 5)) / (games['MIN'] * (games['REB'] + games['OPP_REB']))
games['OPP_TRB%'] = (games['OPP_REB'] * (games['OPP_REB'] / 5)) / (games['MIN'] * (games['OPP_REB'] + games['REB']))

# PACE
games['PACE'] = 48 * (games['POSS'] + games['OPP_POSS']) / (2 * (games['MIN'] / 5))
games['OPP_PACE'] = 48 * (games['OPP_POSS'] + games['POSS']) / (2 * (games['MIN'] / 5))

# Offensive Rating
games['ORTG'] = (games['PTS'] / games['POSS'])
games['OPP_ORTG'] = (games['OPP_PTS'] / games['OPP_POSS'])

# Defensive Rating
games['DRTG'] = (games['OPP_PTS'] / games['POSS'])
games['OPP_DRTG'] = (games['PTS'] / games['OPP_POSS'])

pd.set_option('display.max_columns', None)
games.head()

Unnamed: 0,TEAM_ID,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,OPP_PTS,OPP_FGM,OPP_FGA,OPP_FG_PCT,OPP_FG3M,OPP_FG3A,OPP_FG3_PCT,OPP_FTM,OPP_FTA,OPP_FT_PCT,OPP_OREB,OPP_DREB,OPP_REB,OPP_AST,OPP_STL,OPP_BLK,OPP_TOV,OPP_PF,OPP_PLUS_MINUS,MIN,EFG%,OPP_EFG%,BLK%,OPP_BLK%,TOV%,OPP_TOV%,ORB%,OPP_ORB%,DREB%,OPP_DREB%,POSS,OPP_POSS,STL%,OPP_STL%,FTR,OPP_FTR,TS,OPP_TS,ASTR,OPP_ASTR,TRB%,OPP_TRB%,PACE,OPP_PACE,ORTG,OPP_ORTG,DRTG,OPP_DRTG
0,1610612737,71,29,70,0.414,6,22,0.273,7,8,0.875,9,28,37,13,8,4,22,29,-15.0,75,25,63,0.397,6,22,0.273,19,23,0.826,9,25,34,11,4,7,16,39,-7.8,201,0.457143,0.444444,0.097561,0.145833,0.230318,0.179533,0.264706,0.243243,0.756757,0.735294,83.0592,76.9152,0.104011,0.048158,0.1,0.301587,0.482862,0.512856,0.119794,0.109868,0.019186,0.016201,95.507104,95.507104,0.854812,0.9751,0.90297,0.923095
1,1610612737,71,28,63,0.444,6,22,0.273,9,22,0.409,2,33,35,12,12,12,20,33,1.0,70,22,69,0.319,5,18,0.278,21,32,0.656,14,27,41,11,9,3,15,16,-1.0,200,0.492063,0.355072,0.235294,0.073171,0.215796,0.152936,0.068966,0.297872,0.702128,0.931034,87.0528,80.7168,0.148668,0.103386,0.142857,0.304348,0.488442,0.421281,0.114635,0.100843,0.016118,0.022118,100.66176,100.66176,0.815597,0.86723,0.80411,0.879619
2,1610612737,75,28,65,0.431,3,14,0.214,16,18,0.889,12,32,44,17,7,7,20,28,11.0,91,36,74,0.486,8,16,0.5,11,12,0.917,9,32,41,22,6,4,14,20,18.0,200,0.453846,0.540541,0.12069,0.078431,0.215239,0.150086,0.272727,0.219512,0.780488,0.727273,77.6832,80.9088,0.086517,0.077237,0.246154,0.148649,0.514262,0.573915,0.154658,0.19084,0.022776,0.019776,95.1552,95.1552,0.96546,1.124723,1.171424,0.92697
3,1610612737,82,31,75,0.413,10,22,0.455,10,14,0.714,8,29,37,19,15,2,17,22,9.0,81,33,75,0.44,4,19,0.211,11,13,0.846,11,25,36,18,11,3,18,19,-16.0,199,0.48,0.466667,0.035714,0.056604,0.173187,0.182334,0.242424,0.275,0.725,0.757576,86.5536,84.2112,0.178124,0.127089,0.133333,0.146667,0.505175,0.501734,0.162171,0.154215,0.018848,0.017843,102.973749,102.973749,0.94739,0.961867,0.935836,0.973742
4,1610612737,91,31,64,0.484,8,21,0.381,21,27,0.778,9,31,40,17,6,5,17,15,8.0,84,29,72,0.403,9,27,0.333,17,22,0.773,17,33,50,13,3,7,23,34,-7.0,200,0.546875,0.465278,0.111111,0.162791,0.183032,0.219717,0.214286,0.354167,0.645833,0.785714,80.5248,84.1728,0.071282,0.037256,0.328125,0.236111,0.599631,0.514202,0.154714,0.110469,0.017778,0.027778,98.81856,98.81856,1.130087,0.997947,1.043157,1.081109


In [6]:
# Getting rolling average of recent x games
games_averages = games.copy() #Sets a copy to be used when we do rolling averages
games_averages = games_averages.drop(columns = ['PLUS_MINUS', 'OPP_PLUS_MINUS'])
games_averages = games_averages.groupby('TEAM_ID').rolling(5).mean().reset_index(drop=True)

# Confirmation of rolling averages working
# games_averages.insert(0, 'ABRV', abrv)
# games_averages.insert(1, 'OPP_ABRV', opp_abrv)

# Dropping any games that became null
games_averages.insert(0, 'GAME_ID', game_ids)
games_averages.insert(1, 'SPREAD', spread)

games_averages = games_averages[games_averages.duplicated(subset = ['GAME_ID'], keep=False)]
games_averages.isna()
games_averages.dropna(inplace=True)

game_ids = games_averages['GAME_ID'].values
spread = games_averages['SPREAD'].values

games_averages = games_averages.drop(columns= ['GAME_ID', 'SPREAD', 'TEAM_ID'])
games_averages.reset_index(drop=True)
games = games_averages

# Print Head
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
games_averages.head(25)

Unnamed: 0,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,OPP_PTS,OPP_FGM,OPP_FGA,OPP_FG_PCT,OPP_FG3M,OPP_FG3A,OPP_FG3_PCT,OPP_FTM,OPP_FTA,OPP_FT_PCT,OPP_OREB,OPP_DREB,OPP_REB,OPP_AST,OPP_STL,OPP_BLK,OPP_TOV,OPP_PF,MIN,EFG%,OPP_EFG%,BLK%,OPP_BLK%,TOV%,OPP_TOV%,ORB%,OPP_ORB%,DREB%,OPP_DREB%,POSS,OPP_POSS,STL%,OPP_STL%,FTR,OPP_FTR,TS,OPP_TS,ASTR,OPP_ASTR,TRB%,OPP_TRB%,PACE,OPP_PACE,ORTG,OPP_ORTG,DRTG,OPP_DRTG
4,78.0,29.4,67.4,0.44,6.6,20.2,0.32,12.6,17.8,0.73,8.0,30.6,38.6,15.6,9.6,6.0,19.2,25.4,80.2,29.0,70.6,0.41,6.4,20.4,0.32,15.8,20.4,0.8,12.0,28.4,40.4,15.0,6.6,4.8,17.2,25.6,200.0,0.49,0.45,0.12,0.1,0.2,0.18,0.21,0.28,0.72,0.79,82.97,81.38,0.12,0.08,0.19,0.23,0.52,0.5,0.14,0.13,0.02,0.02,98.62,98.62,0.94,0.99,0.97,0.96
5,77.4,28.2,67.6,0.42,6.4,21.0,0.3,14.6,20.8,0.71,9.0,30.4,39.4,15.4,9.8,6.6,18.4,24.4,83.8,31.4,72.4,0.43,7.4,21.4,0.35,13.6,17.8,0.8,11.4,29.4,40.8,16.8,7.4,3.8,17.4,22.0,199.6,0.47,0.48,0.13,0.08,0.19,0.18,0.22,0.27,0.73,0.78,82.71,82.78,0.12,0.09,0.22,0.19,0.51,0.52,0.14,0.15,0.02,0.02,99.5,99.5,0.94,1.01,1.02,0.93
6,82.8,29.2,70.0,0.42,6.6,21.4,0.31,17.8,22.4,0.79,10.4,31.2,41.6,16.4,9.0,4.6,17.8,21.6,92.6,34.2,75.2,0.46,7.8,23.0,0.34,16.4,20.6,0.82,11.4,29.4,40.8,17.8,7.4,4.0,18.2,25.0,207.2,0.47,0.51,0.09,0.08,0.18,0.18,0.26,0.26,0.74,0.74,83.77,87.42,0.1,0.09,0.26,0.21,0.52,0.55,0.14,0.15,0.02,0.02,99.15,99.15,0.99,1.06,1.1,0.95
7,88.4,30.2,71.8,0.42,8.2,23.8,0.35,19.8,26.0,0.76,9.0,33.4,42.4,17.2,9.2,4.2,17.0,20.6,94.6,35.2,78.8,0.45,7.6,24.0,0.31,16.6,21.6,0.78,12.0,29.4,41.4,17.4,8.4,4.4,17.2,25.8,215.2,0.48,0.5,0.08,0.09,0.17,0.16,0.23,0.26,0.74,0.77,87.59,89.76,0.1,0.09,0.28,0.21,0.53,0.54,0.15,0.14,0.02,0.02,99.07,99.07,1.01,1.05,1.08,0.98
8,92.0,30.4,72.4,0.42,8.4,25.6,0.33,22.8,29.0,0.79,8.0,36.6,44.6,18.2,8.2,4.6,16.8,21.4,100.2,36.6,80.8,0.45,9.4,25.4,0.37,17.6,23.4,0.75,11.2,31.0,42.2,19.4,8.6,5.6,16.4,27.2,223.4,0.48,0.51,0.09,0.12,0.17,0.15,0.2,0.23,0.77,0.8,90.2,92.44,0.09,0.09,0.31,0.22,0.54,0.55,0.15,0.15,0.02,0.02,98.19,98.19,1.02,1.08,1.11,0.99
9,92.0,30.4,76.4,0.4,8.6,27.0,0.32,22.6,28.4,0.8,8.0,38.4,46.4,18.4,8.8,5.0,16.4,21.2,102.4,37.4,83.0,0.45,9.2,24.2,0.38,18.4,25.0,0.74,10.0,33.6,43.6,20.8,8.8,5.6,15.4,24.6,231.6,0.45,0.51,0.09,0.11,0.16,0.14,0.19,0.2,0.8,0.81,93.4,95.42,0.09,0.09,0.3,0.22,0.52,0.55,0.15,0.16,0.02,0.02,97.91,97.91,0.98,1.07,1.1,0.96
10,96.8,32.4,77.2,0.42,8.8,26.2,0.33,23.2,28.2,0.83,6.8,40.2,47.0,20.6,8.2,5.4,17.2,21.2,102.2,36.6,85.4,0.43,8.6,23.8,0.36,20.4,27.2,0.75,10.2,33.6,43.8,20.6,9.6,6.2,14.4,23.6,239.8,0.48,0.48,0.09,0.12,0.16,0.13,0.17,0.2,0.8,0.83,96.01,97.51,0.08,0.1,0.3,0.24,0.54,0.52,0.16,0.16,0.02,0.02,96.84,96.84,1.01,1.05,1.07,0.99
11,93.4,32.0,78.6,0.41,9.0,26.6,0.34,20.4,25.6,0.79,6.8,41.2,48.0,21.4,8.2,7.0,16.8,21.8,99.2,37.0,86.6,0.43,9.0,24.0,0.37,16.2,22.6,0.72,9.8,35.2,45.0,22.4,10.2,7.2,13.8,22.0,240.2,0.47,0.48,0.11,0.14,0.16,0.12,0.16,0.19,0.81,0.84,95.87,96.52,0.08,0.11,0.26,0.19,0.52,0.51,0.17,0.17,0.02,0.02,96.11,96.11,0.97,1.03,1.03,0.97
12,90.2,32.0,79.8,0.4,8.2,25.2,0.33,18.0,21.8,0.82,7.8,38.2,46.0,20.6,8.2,6.8,17.8,20.2,95.4,35.0,84.0,0.42,8.6,25.0,0.35,16.8,23.2,0.73,9.4,35.6,45.0,22.8,9.4,7.4,15.2,21.0,240.2,0.45,0.47,0.11,0.14,0.17,0.14,0.18,0.2,0.8,0.82,95.42,96.01,0.09,0.1,0.23,0.2,0.51,0.51,0.16,0.17,0.02,0.02,95.63,95.63,0.94,0.99,1.0,0.94
13,89.0,33.0,80.6,0.41,7.6,24.4,0.31,15.4,19.0,0.8,8.6,35.8,44.4,20.2,8.0,6.8,17.6,20.0,94.8,34.4,86.2,0.4,8.4,25.6,0.33,17.6,24.0,0.74,12.6,36.2,48.8,21.8,8.0,6.2,15.4,18.8,240.0,0.46,0.45,0.11,0.11,0.17,0.14,0.19,0.25,0.75,0.81,94.04,95.58,0.08,0.09,0.19,0.2,0.5,0.49,0.16,0.16,0.02,0.02,94.81,94.81,0.95,0.99,1.01,0.93


In [7]:
# Z score normalization
realcols = list(games.columns.values)
for col in realcols:
   mean = games[col].mean()
   std = games[col].std()
   games[col] = (games[col] - mean)/std

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
games.head(25)

Unnamed: 0,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,OPP_PTS,OPP_FGM,OPP_FGA,OPP_FG_PCT,OPP_FG3M,OPP_FG3A,OPP_FG3_PCT,OPP_FTM,OPP_FTA,OPP_FT_PCT,OPP_OREB,OPP_DREB,OPP_REB,OPP_AST,OPP_STL,OPP_BLK,OPP_TOV,OPP_PF,MIN,EFG%,OPP_EFG%,BLK%,OPP_BLK%,TOV%,OPP_TOV%,ORB%,OPP_ORB%,DREB%,OPP_DREB%,POSS,OPP_POSS,STL%,OPP_STL%,FTR,OPP_FTR,TS,OPP_TS,ASTR,OPP_ASTR,TRB%,OPP_TRB%,PACE,OPP_PACE,ORTG,OPP_ORTG,DRTG,OPP_DRTG
4,-3.19,-2.78,-3.73,-0.63,-1.51,-1.55,-0.79,-1.51,-1.25,-0.6,-1.07,-1.06,-1.48,-2.15,1.19,0.87,2.43,1.94,-3.24,-3.16,-3.32,-1.76,-1.87,-1.89,-0.81,-0.57,-0.67,0.79,1.14,-2.02,-1.13,-2.72,-0.86,-0.06,1.67,2.27,-4.71,-0.88,-1.91,1.24,0.62,3.95,2.79,-0.4,1.5,-1.5,0.4,-2.84,-3.55,2.19,-0.18,-0.37,0.61,-1.08,-1.63,-1.54,-2.29,0.17,1.07,0.77,0.77,-2.38,-1.97,-1.89,-1.93
5,-3.26,-3.12,-3.69,-1.24,-1.59,-1.43,-1.16,-0.88,-0.5,-1.11,-0.55,-1.13,-1.26,-2.21,1.32,1.33,2.07,1.55,-2.8,-2.41,-2.93,-0.88,-1.43,-1.69,-0.18,-1.34,-1.41,0.69,0.79,-1.65,-1.0,-2.11,-0.27,-0.95,1.77,0.68,-4.76,-1.41,-1.03,1.7,-0.25,3.46,2.81,-0.11,1.14,-1.14,0.11,-2.9,-3.2,2.23,0.5,0.32,-0.43,-1.45,-1.0,-1.66,-1.51,0.42,1.14,1.05,1.05,-2.45,-1.55,-1.26,-2.21
6,-2.65,-2.84,-3.21,-1.27,-1.51,-1.36,-1.07,0.12,-0.09,0.47,0.17,-0.87,-0.65,-1.92,0.8,-0.2,1.8,0.45,-1.71,-1.55,-2.33,-0.02,-1.26,-1.38,-0.22,-0.35,-0.62,1.12,0.79,-1.65,-1.0,-1.77,-0.27,-0.77,2.17,2.0,-3.85,-1.44,-0.3,0.11,-0.21,2.9,2.84,0.87,1.0,-1.0,-0.87,-2.66,-2.07,1.37,0.42,1.26,0.21,-1.06,-0.14,-1.41,-1.33,0.81,0.56,0.94,0.94,-1.75,-0.83,-0.11,-2.05
7,-2.03,-2.56,-2.85,-1.17,-0.92,-0.98,-0.11,0.75,0.81,-0.15,-0.55,-0.16,-0.43,-1.68,0.93,-0.51,1.44,0.05,-1.46,-1.24,-1.56,-0.32,-1.34,-1.18,-1.0,-0.28,-0.33,0.25,1.14,-1.65,-0.81,-1.91,0.48,-0.41,1.67,2.36,-2.89,-1.08,-0.65,-0.29,0.2,2.25,2.11,0.12,0.99,-0.99,-0.12,-1.82,-1.49,1.37,0.91,1.78,0.11,-0.71,-0.58,-1.3,-1.75,0.64,0.34,0.91,0.91,-1.47,-0.94,-0.45,-1.59
8,-1.62,-2.5,-2.73,-1.19,-0.84,-0.7,-0.57,1.7,1.57,0.4,-1.07,0.88,0.18,-1.4,0.29,-0.2,1.35,0.37,-0.77,-0.8,-1.13,-0.08,-0.55,-0.91,0.35,0.07,0.18,-0.25,0.67,-1.06,-0.56,-1.23,0.62,0.66,1.28,2.97,-1.92,-1.07,-0.16,-0.04,1.36,2.02,1.53,-0.74,0.15,-0.15,0.74,-1.25,-0.84,0.46,0.82,2.7,0.34,-0.44,-0.09,-1.09,-1.15,0.9,0.07,0.64,0.64,-1.31,-0.45,-0.03,-1.47
9,-1.62,-2.5,-1.93,-1.97,-0.77,-0.48,-0.85,1.63,1.42,0.6,-1.07,1.46,0.67,-1.34,0.67,0.11,1.17,0.29,-0.5,-0.56,-0.66,-0.12,-0.64,-1.14,0.58,0.35,0.63,-0.55,-0.04,-0.1,-0.11,-0.76,0.77,0.66,0.78,1.83,-0.94,-1.76,-0.28,-0.02,1.03,1.55,0.85,-1.02,-0.67,0.67,1.02,-0.54,-0.11,0.69,0.88,2.26,0.43,-1.14,-0.23,-1.23,-0.69,1.0,-0.16,0.55,0.55,-1.84,-0.57,-0.19,-1.87
10,-1.08,-1.93,-1.77,-1.18,-0.69,-0.61,-0.48,1.82,1.37,1.23,-1.69,2.05,0.84,-0.7,0.29,0.41,1.53,0.29,-0.53,-0.8,-0.15,-1.03,-0.9,-1.22,0.17,1.05,1.26,-0.32,0.07,-0.1,-0.05,-0.82,1.37,1.2,0.29,1.38,0.04,-1.09,-1.18,-0.05,1.46,1.8,0.11,-1.6,-0.78,0.78,1.6,0.03,0.4,0.17,1.27,2.39,0.99,-0.4,-0.98,-0.53,-0.97,0.81,-0.45,0.21,0.21,-1.45,-0.99,-0.63,-1.46
11,-1.47,-2.05,-1.5,-1.6,-0.62,-0.54,-0.4,0.94,0.71,0.54,-1.69,2.37,1.12,-0.47,0.29,1.63,1.35,0.53,-0.9,-0.68,0.11,-1.08,-0.73,-1.18,0.47,-0.42,-0.05,-0.95,-0.16,0.48,0.33,-0.21,1.81,2.09,-0.01,0.68,0.09,-1.42,-1.17,0.95,2.17,1.62,-0.07,-1.85,-1.09,1.09,1.85,0.0,0.16,0.21,1.73,1.41,-0.45,-1.01,-1.33,-0.25,-0.23,0.97,-0.19,-0.02,-0.02,-1.95,-1.3,-1.05,-1.77
12,-1.82,-2.05,-1.26,-1.83,-0.92,-0.76,-0.66,0.19,-0.24,0.92,-1.17,1.4,0.56,-0.7,0.29,1.48,1.8,-0.1,-1.36,-1.3,-0.45,-1.49,-0.9,-0.99,-0.19,-0.21,0.12,-0.77,-0.4,0.63,0.33,-0.08,1.22,2.27,0.68,0.24,0.09,-1.78,-1.53,0.99,2.09,2.05,0.71,-1.35,-0.83,0.83,1.35,-0.1,0.03,0.24,1.19,0.54,-0.12,-1.46,-1.59,-0.55,0.0,0.41,0.02,-0.17,-0.17,-2.36,-1.86,-1.52,-2.13
13,-1.96,-1.76,-1.1,-1.55,-1.14,-0.89,-0.93,-0.63,-0.95,0.69,-0.76,0.62,0.12,-0.82,0.16,1.48,1.71,-0.18,-1.44,-1.49,0.02,-2.13,-0.99,-0.87,-0.6,0.07,0.35,-0.59,1.5,0.85,1.54,-0.42,0.18,1.2,0.78,-0.74,0.07,-1.67,-2.13,0.93,0.9,2.0,0.64,-0.92,0.82,-0.82,0.92,-0.4,-0.07,0.14,0.28,-0.32,-0.01,-1.58,-2.13,-0.64,-0.55,-0.34,1.44,-0.43,-0.43,-2.33,-1.88,-1.39,-2.23


In [8]:
# Test/Train splitting
from sklearn.metrics import r2_score

spread = spread.astype(int)
y = spread
x = games.values
print(x.shape)
print(y.shape)

xtrain, xtest, ytrain, ytest =  train_test_split(x,y, test_size = 0.2, random_state=1234)

(17786, 65)
(17786,)


In [9]:
# Ridge Model
ridge = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1])
ridge.fit(xtrain, ytrain)

predict = ridge.predict(xtest)

#stats for the model
pd.set_option('display.max_rows', None)
#print(pd.Series(ridge.coef_, index = games.columns[0:66])) 
print(predict)
print(ytest)
mse = mean_squared_error(ytest, predict) 
print("Test mean squared error (MSE): {:.2f}".format(mse))
print("Score:", ridge.score(xtest,ytest))

[-0.73994768  1.0503692   5.33641576 ... 14.6358274   1.02077861
 -1.35824406]
[-15  -5  14 ...   9  25  10]
Test mean squared error (MSE): 160.35
Score: 0.21830038790883977


In [10]:
# LASSO
lasso = LassoCV(cv=5, random_state=0)
lasso.fit(xtrain, ytrain)

predict3 = lasso.predict(xtest)

#stats for the model
pd.set_option('display.max_rows', None)
#print(pd.Series(lasso.coef_, index = games.columns[0:66])) 
mse = mean_squared_error(ytest, predict3) 
print("Test mean squared error (MSE): {:.2f}".format(mse))
print("Score:", lasso.score(xtest,ytest))

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


Test mean squared error (MSE): 160.61
Score: 0.2170338081557306


In [11]:
# Support Vector Machine
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

svc = SVC()

# fit classifier to training set
svc.fit(xtrain, ytrain)

# make predictions on test set
y_pred = svc.predict(xtest)

# Calculate MSE
print('Model MSE: {0:0.4f}'.format(mean_squared_error(ytest, y_pred)))

# compute and print accuracy score
print('Model accuracy score with default hyperparameters: {0:0.4f}'.format(svc.score(xtest, ytest)))

Model MSE: 188.0843
Model accuracy score with default hyperparameters: 0.0346


In [12]:
# Support Vector Machine with high margin
svc = SVC(C = 100, kernel = 'rbf')

# fit classifier to training set
svc.fit(xtrain, ytrain)


# make predictions on test set
y_pred_high = svc.predict(xtest)

# Calculate MSE
print('Model MSE: {0:0.4f}'.format(mean_squared_error(ytest, y_pred_high)))

# compute and print accuracy score
print('Model accuracy score with rbf kernel and C=100.0 : {0:0.4f}'.format(svc.score(xtest, ytest)))

Model MSE: 306.7364
Model accuracy score with rbf kernel and C=100.0 : 0.0278
