In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale 
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error


from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.static import teams

# Gets the box score of every single game (NBA + WNBA + others: nba.com doesn't seperate the leagues) 
# from 2014-2021 into a dataframe.
all_games_finder = leaguegamefinder.LeagueGameFinder()
all_games = all_games_finder.get_data_frames()[0]

# Creates empty dataframe with the same column names passed in from the dataframe containing all the games.
column_names = all_games.columns
games = pd.DataFrame(columns = column_names)

# Appends every game containing an NBA team in the all_games df to the games df. 
nba_teams = teams.get_teams()
for team in nba_teams:
    temp_id = team['id']
    games = games.append(all_games[all_games['TEAM_ID'] == temp_id]).reset_index(drop = True)

pd.set_option('display.max_columns', None)
games.head(60)

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22021,1610612737,ATL,Atlanta Hawks,22100359,2021-12-06,ATL @ MIN,W,240,121,40,90,0.444,25,49,0.51,16,19,0.842,11,38,49,31,5,8,10,20,11.0
1,22021,1610612737,ATL,Atlanta Hawks,22100350,2021-12-05,ATL vs. CHA,L,240,127,48,93,0.516,17,37,0.459,14,18,0.778,10,35,45,29,4,3,10,19,-3.0
2,22021,1610612737,ATL,Atlanta Hawks,22100335,2021-12-03,ATL vs. PHI,L,240,96,31,76,0.408,10,28,0.357,24,30,0.8,11,36,47,20,5,3,14,20,-2.0
3,22021,1610612737,ATL,Atlanta Hawks,22100319,2021-12-01,ATL @ IND,W,240,114,44,86,0.512,16,33,0.485,10,12,0.833,7,34,41,24,4,8,13,15,3.0
4,22021,1610612737,ATL,Atlanta Hawks,22100293,2021-11-27,ATL vs. NYK,L,240,90,33,93,0.355,9,37,0.243,15,20,0.75,13,39,52,18,8,6,6,17,-9.0
5,22021,1610612737,ATL,Atlanta Hawks,22100285,2021-11-26,ATL @ MEM,W,239,132,52,89,0.584,13,27,0.481,15,21,0.714,9,40,49,33,8,5,12,15,32.0
6,22021,1610612737,ATL,Atlanta Hawks,22100277,2021-11-24,ATL @ SAS,W,239,124,45,88,0.511,12,26,0.462,22,24,0.917,8,36,44,26,10,5,9,11,18.0
7,22021,1610612737,ATL,Atlanta Hawks,22100255,2021-11-22,ATL vs. OKC,W,239,113,42,87,0.483,14,34,0.412,15,16,0.938,8,36,44,25,6,6,7,16,12.0
8,22021,1610612737,ATL,Atlanta Hawks,22100242,2021-11-20,ATL vs. CHA,W,241,115,43,82,0.524,12,34,0.353,17,21,0.81,8,38,46,24,6,6,12,22,10.0
9,22021,1610612737,ATL,Atlanta Hawks,22100215,2021-11-17,ATL vs. BOS,W,240,110,41,81,0.506,13,37,0.351,15,18,0.833,6,34,40,28,9,4,11,17,11.0


In [2]:
# To-do: get rid of summer league/playoff games


# Dropping any game (two rows in DF) that has any NaN values or is missing either team's stats
games.isna()
games.dropna(inplace=True)

games = games[games.duplicated(subset = ['GAME_ID'], keep=False)]

# Merging games together (previously seperated in the DF by team: each team's stats from the game were kept in seperate rows
games = games.sort_values(by=['GAME_DATE'])
games = games.reset_index(drop=True)

# Team A and B each have a row for their stats in a given matchup; we need to add both stats to the end of their respective rows
# Team A dataframe
tempA = games[games.index % 2 == 0]
tempA2 = games[games.index % 2 == 1]

tempA2 = tempA2.add_prefix('OPP_')

tempA = tempA.reset_index(drop=True)
tempA2 = tempA2.reset_index(drop=True)

a_temp = tempA.join(tempA2)

# Team B dataframe
tempB = games[games.index % 2 == 0]
tempB2 = games[games.index % 2 == 1]

tempB = tempB.add_prefix('OPP_')

tempB = tempB.reset_index(drop=True)
tempB2 = tempB2.reset_index(drop=True)

b_temp = tempB2.join(tempB)

# Adding both teams to main dataframe
games = a_temp.append(b_temp)

# Resorting main dataframe
games = games.sort_values(by=['GAME_DATE'])
games = games.reset_index(drop=True)

# Sending data to CSV
games.to_csv('games.csv', index = False)



# Print Head
pd.set_option('display.max_columns', None)
games.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,OPP_SEASON_ID,OPP_TEAM_ID,OPP_TEAM_ABBREVIATION,OPP_TEAM_NAME,OPP_GAME_ID,OPP_GAME_DATE,OPP_MATCHUP,OPP_WL,OPP_MIN,OPP_PTS,OPP_FGM,OPP_FGA,OPP_FG_PCT,OPP_FG3M,OPP_FG3A,OPP_FG3_PCT,OPP_FTM,OPP_FTA,OPP_FT_PCT,OPP_OREB,OPP_DREB,OPP_REB,OPP_AST,OPP_STL,OPP_BLK,OPP_TOV,OPP_PF,OPP_PLUS_MINUS
0,22015,1610612760,OKC,Oklahoma City Thunder,1421500020,2015-07-09,OKC @ MEM,L,200,81,33,75,0.44,5,20,0.25,10,19,0.526,15,15,30,15,11,5,15,33,-6.0,22015,1610612763,MEM,Memphis Grizzlies,1421500020,2015-07-09,MEM vs. OKC,W,198,87,28,61,0.459,1,7,0.143,30,33,0.909,12,25,37,13,9,2,19,21,6.0
1,22015,1610612755,PHI,Philadelphia 76ers,1621500006,2015-07-09,PHI @ UTA,L,209,78,27,61,0.443,5,21,0.238,19,27,0.704,8,33,41,8,5,6,25,31,-6.0,22015,1610612762,UTA,Utah Jazz,1621500006,2015-07-09,UTA vs. PHI,W,211,84,27,74,0.365,2,12,0.167,28,35,0.8,10,23,33,14,13,2,12,24,6.0
2,22015,1610612759,SAS,San Antonio Spurs,1621500005,2015-07-09,SAS vs. BOS,L,201,71,26,65,0.4,5,19,0.263,14,19,0.737,11,27,38,11,7,4,16,21,-14.0,22015,1610612738,BOS,Boston Celtics,1621500005,2015-07-09,BOS @ SAS,W,202,85,29,65,0.446,12,20,0.6,15,17,0.882,5,29,34,21,7,2,10,12,14.0
3,22015,1610612763,MEM,Memphis Grizzlies,1421500020,2015-07-09,MEM vs. OKC,W,198,87,28,61,0.459,1,7,0.143,30,33,0.909,12,25,37,13,9,2,19,21,6.0,22015,1610612760,OKC,Oklahoma City Thunder,1421500020,2015-07-09,OKC @ MEM,L,200,81,33,75,0.44,5,20,0.25,10,19,0.526,15,15,30,15,11,5,15,33,-6.0
4,22015,1610612738,BOS,Boston Celtics,1621500005,2015-07-09,BOS @ SAS,W,202,85,29,65,0.446,12,20,0.6,15,17,0.882,5,29,34,21,7,2,10,12,14.0,22015,1610612759,SAS,San Antonio Spurs,1621500005,2015-07-09,SAS vs. BOS,L,201,71,26,65,0.4,5,19,0.263,14,19,0.737,11,27,38,11,7,4,16,21,-14.0


In [3]:
# Sorting the DF by teams
temp_games = pd.DataFrame()
nba_teams = teams.get_teams()
for team in nba_teams:
    temp_id = team['id']
    temp_games = temp_games.append(games[games['TEAM_ID'] == temp_id]).reset_index(drop=True)

games = temp_games

# Print Head
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
games.head(100)

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,OPP_SEASON_ID,OPP_TEAM_ID,OPP_TEAM_ABBREVIATION,OPP_TEAM_NAME,OPP_GAME_ID,OPP_GAME_DATE,OPP_MATCHUP,OPP_WL,OPP_MIN,OPP_PTS,OPP_FGM,OPP_FGA,OPP_FG_PCT,OPP_FG3M,OPP_FG3A,OPP_FG3_PCT,OPP_FTM,OPP_FTA,OPP_FT_PCT,OPP_OREB,OPP_DREB,OPP_REB,OPP_AST,OPP_STL,OPP_BLK,OPP_TOV,OPP_PF,OPP_PLUS_MINUS
0,22015,1610612737,ATL,Atlanta Hawks,1521500003,2015-07-10,ATL @ DEN,L,201,71,29,70,0.414,6,22,0.273,7,8,0.875,9,28,37,13,8,4,22,29,-15.0,22015,1610612758,SAC,Sacramento Kings,1521500002,2015-07-10,SAC @ TOR,L,200,68,23,62,0.371,5,17,0.294,17,22,0.773,9,30,39,14,9,8,22,23,-22.0
1,22015,1610612737,ATL,Atlanta Hawks,1521500020,2015-07-12,ATL vs. GSW,W,200,71,28,63,0.444,6,22,0.273,9,22,0.409,2,33,35,12,12,12,20,33,1.0,22015,1610612741,CHI,Chicago Bulls,1521500017,2015-07-12,CHI @ TOR,L,201,66,22,70,0.314,3,25,0.12,19,28,0.679,10,28,38,12,5,2,14,23,-15.0
2,22015,1610612737,ATL,Atlanta Hawks,1521500039,2015-07-15,ATL vs. MIA,W,200,75,28,65,0.431,3,14,0.214,16,18,0.889,12,32,44,17,7,7,20,28,11.0,22015,1610612764,WAS,Washington Wizards,1521500041,2015-07-15,WAS @ UTA,W,201,86,30,70,0.429,5,20,0.25,21,27,0.778,8,33,41,18,8,2,16,29,8.0
3,22015,1610612737,ATL,Atlanta Hawks,1521500050,2015-07-16,ATL @ DEN,W,199,82,31,75,0.413,10,22,0.455,10,14,0.714,8,29,37,19,15,2,17,22,9.0,22015,1610612756,PHX,Phoenix Suns,1521500045,2015-07-16,PHX vs. MIL,W,199,106,40,75,0.533,7,13,0.538,19,22,0.864,11,28,39,24,8,9,13,21,26.0
4,22015,1610612737,ATL,Atlanta Hawks,1521500062,2015-07-18,ATL vs. DAL,W,200,91,31,64,0.484,8,21,0.381,21,27,0.778,9,31,40,17,6,5,17,15,8.0,22015,1610612738,BOS,Boston Celtics,1521500061,2015-07-18,BOS vs. SAS,L,200,93,33,77,0.429,8,28,0.286,19,28,0.679,9,25,34,17,6,7,8,20,-2.0
5,22015,1610612737,ATL,Atlanta Hawks,1521500065,2015-07-19,ATL @ SAS,L,199,68,23,71,0.324,5,26,0.192,17,23,0.739,14,27,41,12,9,7,18,24,-7.0,22015,1610612740,NOP,New Orleans Pelicans,1521500066,2015-07-19,NOP vs. PHX,L,200,87,30,70,0.429,8,23,0.348,19,24,0.792,8,22,30,16,6,4,13,17,-6.0
6,12015,1610612737,ATL,Atlanta Hawks,11500017,2015-10-07,ATL @ CLE,W,238,98,33,75,0.44,7,24,0.292,25,30,0.833,9,37,46,17,8,2,17,19,2.0,12015,1610612760,OKC,Oklahoma City Thunder,11500018,2015-10-07,OKC @ MIN,W,239,122,50,90,0.556,11,22,0.5,11,15,0.733,14,31,45,34,6,7,17,21,23.0
7,12015,1610612737,ATL,Atlanta Hawks,11500032,2015-10-09,ATL @ NOP,W,240,103,33,74,0.446,11,26,0.423,26,36,0.722,5,43,48,21,8,5,16,23,10.0,12015,1610612756,PHX,Phoenix Suns,11500035,2015-10-09,PHX vs. UTA,W,241,101,41,92,0.446,7,21,0.333,12,17,0.706,12,32,44,20,11,6,9,24,16.0
8,12015,1610612737,ATL,Atlanta Hawks,11500060,2015-10-14,ATL vs. SAS,W,240,100,32,78,0.41,11,31,0.355,25,29,0.862,3,45,48,24,10,4,16,26,14.0,12015,1610612766,CHA,Charlotte Hornets,11500061,2015-10-14,CHA @ LAC,W,240,113,35,78,0.449,14,25,0.56,29,39,0.744,6,49,55,25,10,4,17,26,42.0
9,12015,1610612737,ATL,Atlanta Hawks,11500068,2015-10-16,ATL @ DAL,W,241,91,31,84,0.369,9,28,0.321,20,24,0.833,9,40,49,18,9,7,15,14,7.0,12015,1610612752,NYK,New York Knicks,11500067,2015-10-16,NYK vs. BOS,W,241,101,37,100,0.37,12,25,0.48,15,19,0.789,18,34,52,24,12,7,7,25,6.0


In [4]:
# Storing some columns for future use
game_ids = games['GAME_ID'].values
team_ids = games['TEAM_ID'].values
minutes = games['MIN'].values
abrv = games['TEAM_ABBREVIATION'].values
opp_abrv = games['OPP_TEAM_ABBREVIATION'].values
spread = games['PLUS_MINUS'].values

# Dropping Non-essential categorical data
games = games.drop(columns=['SEASON_ID', 'OPP_SEASON_ID', 'OPP_TEAM_ID','GAME_ID', 'OPP_GAME_ID', 'TEAM_ABBREVIATION', 'OPP_TEAM_ABBREVIATION', 'TEAM_NAME', 'OPP_TEAM_NAME', 'MATCHUP', 'OPP_MATCHUP', 'WL', 'OPP_WL', 'GAME_DATE', 'OPP_GAME_DATE', 'MIN', 'OPP_MIN']) 

# Adding minutes back as a single column
games['MIN'] = minutes

# Print Head
pd.set_option('display.max_columns', None)
games.head()

Unnamed: 0,TEAM_ID,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,OPP_PTS,OPP_FGM,OPP_FGA,OPP_FG_PCT,OPP_FG3M,OPP_FG3A,OPP_FG3_PCT,OPP_FTM,OPP_FTA,OPP_FT_PCT,OPP_OREB,OPP_DREB,OPP_REB,OPP_AST,OPP_STL,OPP_BLK,OPP_TOV,OPP_PF,OPP_PLUS_MINUS,MIN
0,1610612737,71,29,70,0.414,6,22,0.273,7,8,0.875,9,28,37,13,8,4,22,29,-15.0,68,23,62,0.371,5,17,0.294,17,22,0.773,9,30,39,14,9,8,22,23,-22.0,201
1,1610612737,71,28,63,0.444,6,22,0.273,9,22,0.409,2,33,35,12,12,12,20,33,1.0,66,22,70,0.314,3,25,0.12,19,28,0.679,10,28,38,12,5,2,14,23,-15.0,200
2,1610612737,75,28,65,0.431,3,14,0.214,16,18,0.889,12,32,44,17,7,7,20,28,11.0,86,30,70,0.429,5,20,0.25,21,27,0.778,8,33,41,18,8,2,16,29,8.0,200
3,1610612737,82,31,75,0.413,10,22,0.455,10,14,0.714,8,29,37,19,15,2,17,22,9.0,106,40,75,0.533,7,13,0.538,19,22,0.864,11,28,39,24,8,9,13,21,26.0,199
4,1610612737,91,31,64,0.484,8,21,0.381,21,27,0.778,9,31,40,17,6,5,17,15,8.0,93,33,77,0.429,8,28,0.286,19,28,0.679,9,25,34,17,6,7,8,20,-2.0,200


In [5]:
# Adding Advanced Stats to enhance model performance, formulas were gathered from various sources. 

# Efficient Field Goal Percentage
games['EFG%'] = (games['FGM'] + (.5 * games['FG3M'])) / games['FGA']
games['OPP_EFG%'] = (games['OPP_FGM'] + (.5 * games['OPP_FG3M'])) / games['OPP_FGA']

# Block Percentage
games['BLK%'] = (games['BLK'] / (games['OPP_FGA']-games['OPP_FG3A']))
games['OPP_BLK%'] = (games['OPP_BLK'] / (games['FGA']-games['FG3A']))

# Turnover Percentage
games['TOV%'] = games['TOV'] / (games['FGA'] + 0.44 * games['FTA'] + games['TOV'])
games['OPP_TOV%'] = games['OPP_TOV'] / (games['OPP_FGA'] + 0.44 * games['OPP_FTA'] + games['OPP_TOV'])

#Offensive Rebound Percentage
games['ORB%'] = games['OREB'] / (games['OREB'] + games['OPP_DREB'])
games['OPP_ORB%'] = games['OPP_OREB'] / (games['OPP_OREB'] + games['DREB'])

#Defensive Rebound Percentage
games['DREB%'] = games['DREB'] / (games['OPP_OREB'] + games['DREB'])
games['OPP_DREB%'] = games['OPP_DREB'] / (games['OREB'] + games['OPP_DREB'])

# Possessions
#games["POSS"] = 0.5*((games["FGA"] + 0.4*games["FTA"] - 1.07*(games["OREB"]/(games["OREB"] + games["OREB_other"])) * (games["FGA"] - games["FGM"]) + games["TOV"]) + games["FGA_other"] + 0.4*games["FTA_other"] - 1.07 * (games["OREB_other"] / (games["OREB_other"] + games["DREB"])) * (games["FGA_other"] - games["FGM_other]) + games["TOV_other"]))

# ***Placeholder POSS***
games['POSS'] = 0.96*((games['FGA']) + games['TOV'] + 0.44 * games['FTA'] - games['OREB'])
games['OPP_POSS'] = 0.96*((games['OPP_FGA']) + games['OPP_TOV'] + 0.44 * games['OPP_FTA'] - games['OPP_OREB'])

# Steals Percentage
games['STL%'] = (games['STL'] / games['OPP_POSS'])
games['OPP_STL%'] = (games['OPP_STL'] / games['POSS'])

# Free Throw Rate
games['FTR'] = games['FTM'] / games['FGA']
games['OPP_FTR'] = games['OPP_FTM'] / games['OPP_FGA']

# True Shooting (Requires True Shooting Attempts)
tsa = games['FGA'] + 0.44 * games['FTA']
OPP_tsa = games['OPP_FGA'] + 0.44 * games['OPP_FTA']
games['TS'] = games['PTS'] / (2 * tsa)
games['OPP_TS'] = games['OPP_PTS'] / (2 * OPP_tsa)

# Assist Rate
games['ASTR'] = games['AST'] / (games['FGA'] + (.44 * games['FTA']) + games['AST'] + games['TOV'])
games['OPP_ASTR'] = games['OPP_AST'] / (games['OPP_FGA'] + (.44 * games['OPP_FTA']) + games['OPP_AST'] + games['OPP_TOV'])

# Total Rebound Percentage
games['TRB%'] = (games['REB'] * (games['REB'] / 5)) / (games['MIN'] * (games['REB'] + games['OPP_REB']))
games['OPP_TRB%'] = (games['OPP_REB'] * (games['OPP_REB'] / 5)) / (games['MIN'] * (games['OPP_REB'] + games['REB']))

# PACE
games['PACE'] = 48 * (games['POSS'] + games['OPP_POSS']) / (2 * (games['MIN'] / 5))
games['OPP_PACE'] = 48 * (games['OPP_POSS'] + games['POSS']) / (2 * (games['MIN'] / 5))

# Offensive Rating
games['ORTG'] = (games['PTS'] / games['POSS'])
games['OPP_ORTG'] = (games['OPP_PTS'] / games['OPP_POSS'])

# Defensive Rating
games['DRTG'] = (games['OPP_PTS'] / games['POSS'])
games['OPP_DRTG'] = (games['PTS'] / games['OPP_POSS'])

pd.set_option('display.max_columns', None)
games.head()

Unnamed: 0,TEAM_ID,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,OPP_PTS,OPP_FGM,OPP_FGA,OPP_FG_PCT,OPP_FG3M,OPP_FG3A,OPP_FG3_PCT,OPP_FTM,OPP_FTA,OPP_FT_PCT,OPP_OREB,OPP_DREB,OPP_REB,OPP_AST,OPP_STL,OPP_BLK,OPP_TOV,OPP_PF,OPP_PLUS_MINUS,MIN,EFG%,OPP_EFG%,BLK%,OPP_BLK%,TOV%,OPP_TOV%,ORB%,OPP_ORB%,DREB%,OPP_DREB%,POSS,OPP_POSS,STL%,OPP_STL%,FTR,OPP_FTR,TS,OPP_TS,ASTR,OPP_ASTR,TRB%,OPP_TRB%,PACE,OPP_PACE,ORTG,OPP_ORTG,DRTG,OPP_DRTG
0,1610612737,71,29,70,0.414,6,22,0.273,7,8,0.875,9,28,37,13,8,4,22,29,-15.0,68,23,62,0.371,5,17,0.294,17,22,0.773,9,30,39,14,9,8,22,23,-22.0,201,0.457143,0.41129,0.088889,0.166667,0.230318,0.234842,0.230769,0.243243,0.756757,0.769231,83.0592,81.2928,0.09841,0.108356,0.1,0.274194,0.482862,0.47433,0.119794,0.130015,0.017924,0.019914,98.120597,98.120597,0.854812,0.836482,0.818693,0.873386
1,1610612737,71,28,63,0.444,6,22,0.273,9,22,0.409,2,33,35,12,12,12,20,33,1.0,66,22,70,0.314,3,25,0.12,19,28,0.679,10,28,38,12,5,2,14,23,-15.0,200,0.492063,0.335714,0.266667,0.04878,0.215796,0.145349,0.066667,0.232558,0.767442,0.933333,87.0528,82.8672,0.14481,0.057436,0.142857,0.271429,0.488442,0.400875,0.114635,0.110783,0.016781,0.019781,101.952,101.952,0.815597,0.796455,0.758161,0.856793
2,1610612737,75,28,65,0.431,3,14,0.214,16,18,0.889,12,32,44,17,7,7,20,28,11.0,86,30,70,0.429,5,20,0.25,21,27,0.778,8,33,41,18,8,2,16,29,8.0,200,0.453846,0.464286,0.14,0.039216,0.215239,0.163465,0.266667,0.2,0.8,0.733333,77.6832,86.2848,0.081127,0.102982,0.246154,0.3,0.514262,0.525159,0.154658,0.155333,0.022776,0.019776,98.3808,98.3808,0.96546,0.996699,1.10706,0.869215
3,1610612737,82,31,75,0.413,10,22,0.455,10,14,0.714,8,29,37,19,15,2,17,22,9.0,106,40,75,0.533,7,13,0.538,19,22,0.864,11,28,39,24,8,9,13,21,26.0,199,0.48,0.58,0.032258,0.169811,0.173187,0.133088,0.222222,0.275,0.725,0.777778,86.5536,83.2128,0.180261,0.092428,0.133333,0.253333,0.505175,0.625886,0.162171,0.197239,0.018104,0.020114,102.371698,102.371698,0.94739,1.273842,1.224675,0.985425
4,1610612737,91,31,64,0.484,8,21,0.381,21,27,0.778,9,31,40,17,6,5,17,15,8.0,93,33,77,0.429,8,28,0.286,19,28,0.679,9,25,34,17,6,7,8,20,-2.0,200,0.546875,0.480519,0.102041,0.162791,0.183032,0.082203,0.264706,0.225,0.775,0.735294,80.5248,84.7872,0.070765,0.074511,0.328125,0.246753,0.599631,0.5206,0.154714,0.148705,0.021622,0.015622,99.1872,99.1872,1.130087,1.096864,1.154924,1.073275


In [6]:
# Getting rolling average of recent x games
games_averages = games.copy() #Sets a copy to be used when we do rolling averages
games_averages = games_averages.drop(columns = ['PLUS_MINUS', 'OPP_PLUS_MINUS'])
games_averages = games_averages.groupby('TEAM_ID').rolling(5).mean().reset_index(drop=True)

# Confirmation of rolling averages working
# games_averages.insert(0, 'ABRV', abrv)
# games_averages.insert(1, 'OPP_ABRV', opp_abrv)

# Dropping any games that became null
games_averages.insert(0, 'GAME_ID', game_ids)
games_averages.insert(1, 'SPREAD', spread)

games_averages = games_averages[games_averages.duplicated(subset = ['GAME_ID'], keep=False)]
games_averages.isna()
games_averages.dropna(inplace=True)

game_ids = games_averages['GAME_ID'].values
spread = games_averages['SPREAD'].values

games_averages = games_averages.drop(columns= ['GAME_ID', 'SPREAD', 'TEAM_ID'])
games_averages.reset_index(drop=True)
games = games_averages

# Print Head
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
games_averages.head(25)

Unnamed: 0,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,OPP_PTS,OPP_FGM,OPP_FGA,OPP_FG_PCT,OPP_FG3M,OPP_FG3A,OPP_FG3_PCT,OPP_FTM,OPP_FTA,OPP_FT_PCT,OPP_OREB,OPP_DREB,OPP_REB,OPP_AST,OPP_STL,OPP_BLK,OPP_TOV,OPP_PF,MIN,EFG%,OPP_EFG%,BLK%,OPP_BLK%,TOV%,OPP_TOV%,ORB%,OPP_ORB%,DREB%,OPP_DREB%,POSS,OPP_POSS,STL%,OPP_STL%,FTR,OPP_FTR,TS,OPP_TS,ASTR,OPP_ASTR,TRB%,OPP_TRB%,PACE,OPP_PACE,ORTG,OPP_ORTG,DRTG,OPP_DRTG
4,78.0,29.4,67.4,0.44,6.6,20.2,0.32,12.6,17.8,0.73,8.0,30.6,38.6,15.6,9.6,6.0,19.2,25.4,83.8,29.6,70.8,0.42,5.6,20.6,0.3,19.0,25.4,0.75,9.4,28.8,38.2,17.0,7.2,5.6,14.6,23.2,200.0,0.49,0.45,0.13,0.12,0.2,0.15,0.21,0.24,0.76,0.79,82.97,83.69,0.12,0.09,0.19,0.27,0.52,0.51,0.14,0.15,0.02,0.02,100.0,100.0,0.94,1.0,1.01,0.93
5,77.4,28.2,67.6,0.42,6.4,21.0,0.3,14.6,20.8,0.71,9.0,30.4,39.4,15.4,9.8,6.6,18.4,24.4,87.6,31.0,72.4,0.43,6.2,21.8,0.31,19.4,25.8,0.76,9.2,27.2,36.4,17.4,6.6,4.8,12.8,22.0,199.6,0.47,0.47,0.14,0.1,0.19,0.13,0.24,0.23,0.77,0.76,82.71,83.86,0.12,0.08,0.22,0.27,0.51,0.52,0.14,0.15,0.02,0.02,100.14,100.14,0.94,1.04,1.06,0.92
6,82.8,29.2,70.0,0.42,6.6,21.4,0.31,17.8,22.4,0.79,10.4,31.2,41.6,16.4,9.0,4.6,17.8,21.6,98.8,36.6,76.4,0.48,7.8,21.2,0.38,17.8,23.2,0.77,10.0,27.8,37.8,21.8,6.8,5.8,13.4,21.6,207.2,0.47,0.53,0.09,0.12,0.18,0.13,0.27,0.24,0.76,0.73,83.77,86.41,0.11,0.08,0.26,0.24,0.52,0.57,0.14,0.18,0.02,0.02,98.7,98.7,0.99,1.14,1.17,0.96
7,88.4,30.2,71.8,0.42,8.2,23.8,0.35,19.8,26.0,0.76,9.0,33.4,42.4,17.2,9.2,4.2,17.0,20.6,101.8,38.8,80.8,0.48,8.2,21.4,0.4,16.0,21.2,0.75,10.8,27.6,38.4,22.2,7.4,6.6,12.0,20.6,215.2,0.48,0.53,0.08,0.14,0.17,0.12,0.25,0.24,0.76,0.75,87.59,87.67,0.11,0.08,0.28,0.2,0.53,0.57,0.15,0.18,0.02,0.02,97.97,97.97,1.01,1.16,1.16,1.0
8,92.0,30.4,72.4,0.42,8.4,25.6,0.33,22.8,29.0,0.79,8.0,36.6,44.6,18.2,8.2,4.6,16.8,21.4,103.2,37.8,81.4,0.46,9.6,23.8,0.41,18.0,24.6,0.73,9.8,31.8,41.6,22.4,7.8,5.6,12.8,21.6,223.4,0.48,0.52,0.09,0.12,0.17,0.12,0.21,0.21,0.79,0.79,90.2,91.42,0.09,0.09,0.31,0.23,0.54,0.56,0.15,0.17,0.02,0.02,97.65,97.65,1.02,1.13,1.14,1.0
9,92.0,30.4,76.4,0.4,8.6,27.0,0.32,22.6,28.4,0.8,8.0,38.4,46.4,18.4,8.8,5.0,16.4,21.2,104.8,38.6,86.0,0.45,10.4,23.2,0.44,17.2,22.8,0.75,11.6,33.6,45.2,23.8,9.0,5.6,12.6,22.6,231.6,0.45,0.51,0.08,0.11,0.16,0.12,0.2,0.23,0.77,0.8,93.4,93.15,0.09,0.1,0.3,0.21,0.52,0.55,0.15,0.18,0.02,0.02,96.73,96.73,0.98,1.12,1.12,0.98
10,96.8,32.4,77.2,0.42,8.8,26.2,0.33,23.2,28.2,0.83,6.8,40.2,47.0,20.6,8.2,5.4,17.2,21.2,107.0,40.0,88.4,0.45,9.8,22.4,0.43,17.2,23.8,0.73,12.2,33.8,46.0,24.2,9.4,5.6,12.4,24.4,239.8,0.48,0.51,0.08,0.11,0.16,0.11,0.18,0.23,0.77,0.82,96.01,95.11,0.09,0.1,0.3,0.2,0.54,0.54,0.16,0.18,0.02,0.02,95.64,95.64,1.01,1.12,1.12,1.02
11,93.4,32.0,78.6,0.41,9.0,26.6,0.34,20.4,25.6,0.79,6.8,41.2,48.0,21.4,8.2,7.0,16.8,21.8,100.0,36.2,87.4,0.42,8.0,22.6,0.34,19.6,27.2,0.72,12.6,34.6,47.2,20.6,9.8,5.2,13.0,24.2,240.2,0.47,0.46,0.11,0.1,0.16,0.12,0.17,0.23,0.77,0.83,95.87,95.78,0.09,0.1,0.26,0.23,0.52,0.5,0.17,0.15,0.02,0.02,95.74,95.74,0.97,1.05,1.04,0.98
12,90.2,32.0,79.8,0.4,8.2,25.2,0.33,18.0,21.8,0.82,7.8,38.2,46.0,20.6,8.2,6.8,17.8,20.2,97.8,34.0,83.4,0.41,7.6,21.2,0.35,22.2,30.2,0.74,11.6,35.8,47.4,19.6,8.4,5.4,14.4,23.8,240.2,0.45,0.46,0.11,0.1,0.17,0.13,0.19,0.23,0.77,0.81,95.42,95.51,0.09,0.09,0.23,0.27,0.51,0.51,0.16,0.15,0.02,0.02,95.38,95.38,0.94,1.02,1.02,0.95
13,89.0,33.0,80.6,0.41,7.6,24.4,0.31,15.4,19.0,0.8,8.6,35.8,44.4,20.2,8.0,6.8,17.6,20.0,96.4,34.4,87.0,0.4,7.2,22.0,0.32,20.4,27.6,0.74,15.0,33.2,48.2,19.2,7.4,5.2,14.0,21.6,240.0,0.46,0.44,0.11,0.09,0.17,0.12,0.21,0.29,0.71,0.79,94.04,94.22,0.08,0.08,0.19,0.24,0.5,0.49,0.16,0.14,0.02,0.02,94.13,94.13,0.95,1.02,1.02,0.95


In [7]:
# Z score normalization
#realcols = list(games.columns.values)
#for col in realcols:
#   mean = games[col].mean()
#   std = games[col].std()
#   games[col] = (games[col] - mean)/std

#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_columns', None)
#games.head(25)

In [8]:
# Test/Train splitting
from sklearn.metrics import r2_score

spread = spread.astype(int)
y = spread
x = games.values
print(x.shape)
print(y.shape)

xtrain, xtest, ytrain, ytest =  train_test_split(x,y, test_size = 0.2, random_state=1234)

(17826, 65)
(17826,)


In [9]:
# Ridge Model
ridge = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1])
ridge.fit(xtrain, ytrain)

predict = ridge.predict(xtest)

#stats for the model
pd.set_option('display.max_rows', None)
#print(pd.Series(ridge.coef_, index = games.columns[0:66])) 
print(predict)
print(ytest)
mse = mean_squared_error(ytest, predict) 
print("Test mean squared error (MSE): {:.2f}".format(mse))
print("Score:", ridge.score(xtest,ytest))

[-11.43738602  -6.69135773 -10.83866517 ...  -6.64777685  -2.25611331
  14.3335559 ]
[  4   6 -23 ...  -7   5   7]
Test mean squared error (MSE): 158.94
Score: 0.21116780501219945


In [10]:
# LASSO
lasso = LassoCV(cv=5, random_state=0)
lasso.fit(xtrain, ytrain)

predict3 = lasso.predict(xtest)

#stats for the model
pd.set_option('display.max_rows', None)
#print(pd.Series(lasso.coef_, index = games.columns[0:66])) 
mse = mean_squared_error(ytest, predict3) 
print("Test mean squared error (MSE): {:.2f}".format(mse))
print("Score:", lasso.score(xtest,ytest))

Test mean squared error (MSE): 158.73
Score: 0.2121835444525485


In [11]:
# ELASTIC
from sklearn.linear_model import ElasticNet

alphas = [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
for a in alphas:
    elastic = ElasticNet(alpha = a).fit(xtrain, ytrain)
    y_pred_elastic = elastic.predict(xtest)
    mse = mean_squared_error(ytest, y_pred_elastic) 
    print("Alpha:{0:.4f}, Score:{1:.4f}, MSE:{2:.2f}"
       .format(a, elastic.score(xtest,ytest), mse))

  model = cd_fast.enet_coordinate_descent(


Alpha:0.0010, Score:0.2118, MSE:158.80


  model = cd_fast.enet_coordinate_descent(


Alpha:0.0100, Score:0.2120, MSE:158.77
Alpha:0.1000, Score:0.2121, MSE:158.74
Alpha:0.2000, Score:0.2120, MSE:158.77
Alpha:0.3000, Score:0.2116, MSE:158.84
Alpha:0.4000, Score:0.2111, MSE:158.95
Alpha:0.5000, Score:0.2104, MSE:159.09
Alpha:0.6000, Score:0.2095, MSE:159.26
Alpha:0.7000, Score:0.2085, MSE:159.46
Alpha:0.8000, Score:0.2074, MSE:159.69
Alpha:0.9000, Score:0.2062, MSE:159.94
Alpha:1.0000, Score:0.2049, MSE:160.20


In [None]:
# Support Vector Machine
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

svc = SVC()

# fit classifier to training set
svc.fit(xtrain, ytrain)

# make predictions on test set
y_pred = svc.predict(xtest)

# Calculate MSE
print('Model MSE: {0:0.4f}'.format(mean_squared_error(ytest, y_pred)))

# compute and print accuracy score
print('Model accuracy score with default hyperparameters: {0:0.4f}'.format(svc.score(xtest, ytest)))

In [None]:
# Support Vector Machine with high margin
svc = SVC(C = 100, kernel = 'rbf')

# fit classifier to training set
svc.fit(xtrain, ytrain)


# make predictions on test set
y_pred_high = svc.predict(xtest)

# Calculate MSE
print('Model MSE: {0:0.4f}'.format(mean_squared_error(ytest, y_pred_high)))

# compute and print accuracy score
print('Model accuracy score with rbf kernel and C=100.0 : {0:0.4f}'.format(svc.score(xtest, ytest)))