In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale 
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error

from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.static import teams

# Fetch one box-score row per team per game from nba.com. The endpoint does not
# separate leagues, so the result mixes NBA, WNBA and other teams.
# (The original note says this covers 2014-2021; that range is not verified here.)
all_games_finder = leaguegamefinder.LeagueGameFinder()
all_games = all_games_finder.get_data_frames()[0]

# Keep only the rows that belong to an NBA team, grouped team by team in the
# order returned by teams.get_teams().
# The slices are combined with a single pd.concat: DataFrame.append was removed
# in pandas 2.0, and appending inside a loop re-copies the whole frame each pass.
nba_teams = teams.get_teams()
per_team_games = [all_games[all_games['TEAM_ID'] == team['id']] for team in nba_teams]

# pd.concat raises on an empty list, so fall back to an empty frame that still
# carries the original column layout.
games = pd.concat(per_team_games) if per_team_games else all_games.iloc[0:0]

pd.set_option('display.max_columns', None)
games.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
80,22021,1610612737,ATL,Atlanta Hawks,22100293,2021-11-27,ATL vs. NYK,L,240,90,33,93,0.355,9,37,0.243,15,20,0.75,13,39,52,18,8,6,6,17,-9.0
121,22021,1610612737,ATL,Atlanta Hawks,22100285,2021-11-26,ATL @ MEM,W,239,132,52,89,0.584,13,27,0.481,15,21,0.714,9,40,49,33,8,5,12,15,32.0
127,22021,1610612737,ATL,Atlanta Hawks,22100277,2021-11-24,ATL @ SAS,W,239,124,45,88,0.511,12,26,0.462,22,24,0.917,8,36,44,26,10,5,9,11,18.0
171,22021,1610612737,ATL,Atlanta Hawks,22100255,2021-11-22,ATL vs. OKC,W,239,113,42,87,0.483,14,34,0.412,15,16,0.938,8,36,44,25,6,6,7,16,12.0
228,22021,1610612737,ATL,Atlanta Hawks,22100242,2021-11-20,ATL vs. CHA,W,241,115,43,82,0.524,12,34,0.353,17,21,0.81,8,38,46,24,6,6,12,22,10.0


In [2]:
# Each game arrives as two rows (one per team). This cell rebuilds `games` so that
# every row holds one team's stats followed by its opponent's stats (OPP_ prefix).

# Drop rows with missing values, then any repeated (game, team) row.
games = games.dropna().drop_duplicates(subset=['GAME_ID', 'TEAM_ID'])

# Keep only games that still have exactly two rows. The pairing below relies on
# strict row alternation, so a game with one row (opponent missing/dropped) or
# with three or more rows would shift every later game out of alignment.
rows_per_game = games.groupby('GAME_ID')['GAME_ID'].transform('size')
games = games[rows_per_game == 2]

# Stable sort puts the two rows of each game next to each other, deterministically.
games = games.sort_values(by=['GAME_ID'], kind='stable').reset_index(drop=True)

# Even positions hold one team of each game, odd positions hold the other.
first_team = games.iloc[0::2].reset_index(drop=True)
second_team = games.iloc[1::2].reset_index(drop=True)

# Emit each game from both perspectives: (first vs second) and (second vs first).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
games = pd.concat([
    first_team.join(second_team.add_prefix('OPP_')),
    second_team.join(first_team.add_prefix('OPP_')),
])

# Re-sort so both perspectives of a game are adjacent again
games = games.sort_values(by=['GAME_ID'], kind='stable').reset_index(drop=True)

# Sending data to CSV
games.to_csv('games.csv', index = False)

# Print Head
pd.set_option('display.max_columns', None)
games.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,OPP_SEASON_ID,OPP_TEAM_ID,OPP_TEAM_ABBREVIATION,OPP_TEAM_NAME,OPP_GAME_ID,OPP_GAME_DATE,OPP_MATCHUP,OPP_WL,OPP_MIN,OPP_PTS,OPP_FGM,OPP_FGA,OPP_FG_PCT,OPP_FG3M,OPP_FG3A,OPP_FG3_PCT,OPP_FTM,OPP_FTA,OPP_FT_PCT,OPP_OREB,OPP_DREB,OPP_REB,OPP_AST,OPP_STL,OPP_BLK,OPP_TOV,OPP_PF,OPP_PLUS_MINUS
0,12015,1610612746,LAC,LA Clippers,11500001,2015-10-02,LAC vs. DEN,W,238,103,37,91,0.407,9,32,0.281,20,26,0.769,9,30,39,22,12,6,13,15,7.0,12015,1610612743,DEN,Denver Nuggets,11500001,2015-10-02,DEN @ LAC,L,240,96,37,87,0.425,7,18,0.389,15,20,0.75,14,43,57,16,9,2,23,26,-7.0
1,12015,1610612743,DEN,Denver Nuggets,11500001,2015-10-02,DEN @ LAC,L,240,96,37,87,0.425,7,18,0.389,15,20,0.75,14,43,57,16,9,2,23,26,-7.0,12015,1610612746,LAC,LA Clippers,11500001,2015-10-02,LAC vs. DEN,W,238,103,37,91,0.407,9,32,0.281,20,26,0.769,9,30,39,22,12,6,13,15,7.0
2,12015,1610612753,ORL,Orlando Magic,11500002,2015-10-03,ORL vs. CHA,L,240,100,39,89,0.438,6,26,0.231,16,19,0.842,12,33,45,22,11,3,12,30,-6.0,12015,1610612766,CHA,Charlotte Hornets,11500002,2015-10-03,CHA @ ORL,W,241,106,34,75,0.453,12,31,0.387,26,35,0.743,7,34,41,30,7,2,13,15,6.0
3,12015,1610612766,CHA,Charlotte Hornets,11500002,2015-10-03,CHA @ ORL,W,241,106,34,75,0.453,12,31,0.387,26,35,0.743,7,34,41,30,7,2,13,15,6.0,12015,1610612753,ORL,Orlando Magic,11500002,2015-10-03,ORL vs. CHA,L,240,100,39,89,0.438,6,26,0.231,16,19,0.842,12,33,45,22,11,3,12,30,-6.0
4,12015,1610612754,IND,Indiana Pacers,11500003,2015-10-03,IND vs. NOP,L,241,105,38,92,0.413,10,35,0.286,19,31,0.613,18,44,62,21,10,16,19,27,-5.0,12015,1610612740,NOP,New Orleans Pelicans,11500003,2015-10-03,NOP @ IND,W,242,110,37,103,0.359,11,32,0.344,25,37,0.676,19,36,55,16,9,3,11,22,5.0


In [3]:
# Stash a few identifier columns as arrays before they are removed from the frame.
# Note: despite its name, home_abrv holds TEAM_ABBREVIATION for every row
# (home and away perspectives alike).
game_ids, team_ids, minutes, home_abrv = (
    games[col].values
    for col in ('GAME_ID', 'TEAM_ID', 'MIN', 'TEAM_ABBREVIATION')
)

# Non-essential categorical/identifier columns; each one also exists with an
# OPP_ prefix for the opponent side.
identifier_cols = [
    'SEASON_ID', 'TEAM_ID', 'GAME_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME',
    'MATCHUP', 'WL', 'GAME_DATE', 'MIN',
]
cols_to_drop = identifier_cols + ['OPP_' + col for col in identifier_cols]

# Drop both sides, then put the team's minutes back as a single trailing column
games = games.drop(columns=cols_to_drop).assign(MIN=minutes)

# Print Head
pd.set_option('display.max_columns', None)
games.head()

Unnamed: 0,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,OPP_PTS,OPP_FGM,OPP_FGA,OPP_FG_PCT,OPP_FG3M,OPP_FG3A,OPP_FG3_PCT,OPP_FTM,OPP_FTA,OPP_FT_PCT,OPP_OREB,OPP_DREB,OPP_REB,OPP_AST,OPP_STL,OPP_BLK,OPP_TOV,OPP_PF,OPP_PLUS_MINUS,MIN
0,103,37,91,0.407,9,32,0.281,20,26,0.769,9,30,39,22,12,6,13,15,7.0,96,37,87,0.425,7,18,0.389,15,20,0.75,14,43,57,16,9,2,23,26,-7.0,238
1,96,37,87,0.425,7,18,0.389,15,20,0.75,14,43,57,16,9,2,23,26,-7.0,103,37,91,0.407,9,32,0.281,20,26,0.769,9,30,39,22,12,6,13,15,7.0,240
2,100,39,89,0.438,6,26,0.231,16,19,0.842,12,33,45,22,11,3,12,30,-6.0,106,34,75,0.453,12,31,0.387,26,35,0.743,7,34,41,30,7,2,13,15,6.0,240
3,106,34,75,0.453,12,31,0.387,26,35,0.743,7,34,41,30,7,2,13,15,6.0,100,39,89,0.438,6,26,0.231,16,19,0.842,12,33,45,22,11,3,12,30,-6.0,241
4,105,38,92,0.413,10,35,0.286,19,31,0.613,18,44,62,21,10,16,19,27,-5.0,110,37,103,0.359,11,32,0.344,25,37,0.676,19,36,55,16,9,3,11,22,5.0,241


In [None]:
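# # Adding advanced stats to enhance model performance; the formulas were gathered from various sources.
# # NOTE(review): this cell is disabled. Its formulas use HOME_/AWAY_ column prefixes, whereas the
# # frame built above uses unprefixed (team) and OPP_-prefixed (opponent) columns; rename before re-enabling.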

# # Effective Field Goal Percentage
# games['HOME_EFG%'] = (games['HOME_FGM'] + (.5 * games['HOME_FG3M'])) / games['HOME_FGA']
# games['AWAY_EFG%'] = (games['AWAY_FGM'] + (.5 * games['AWAY_FG3M'])) / games['AWAY_FGA']

# # Block Percentage
# games['HOME_BLK%'] = (games['HOME_BLK'] / (games['AWAY_FGA']-games['AWAY_FG3A']))
# games['AWAY_BLK%'] = (games['AWAY_BLK'] / (games['HOME_FGA']-games['HOME_FG3A']))

# # Turnover Percentage
# games['HOME_TOV%'] = games['HOME_TOV'] / (games['HOME_FGA'] + 0.44 * games['HOME_FTA'] + games['HOME_TOV'])
# games['AWAY_TOV%'] = games['AWAY_TOV'] / (games['AWAY_FGA'] + 0.44 * games['AWAY_FTA'] + games['AWAY_TOV'])

# #Offensive Rebound Percentage
# games['HOME_ORB%'] = games['HOME_OREB'] / (games['HOME_OREB'] + games['AWAY_DREB'])
# games['AWAY_ORB%'] = games['AWAY_OREB'] / (games['AWAY_OREB'] + games['HOME_DREB'])

# #Defensive Rebound Percentage
# games['HOME_DREB%'] = games['HOME_DREB'] / (games['AWAY_OREB'] + games['HOME_DREB'])
# games['AWAY_DREB%'] = games['AWAY_DREB'] / (games['HOME_OREB'] + games['AWAY_DREB'])

# # Possessions
# #games["POSS"] = 0.5*((games["FGA"] + 0.4*games["FTA"] - 1.07*(games["OREB"]/(games["OREB"] + games["DREB_other"])) * (games["FGA"] - games["FGM"]) + games["TOV"]) + (games["FGA_other"] + 0.4*games["FTA_other"] - 1.07 * (games["OREB_other"] / (games["OREB_other"] + games["DREB"])) * (games["FGA_other"] - games["FGM_other"]) + games["TOV_other"]))

# # ***Placeholder POSS***
# games['HOME_POSS'] = 0.96*((games['HOME_FGA']) + games['HOME_TOV'] + 0.44 * games['HOME_FTA'] - games['HOME_OREB'])
# games['AWAY_POSS'] = 0.96*((games['AWAY_FGA']) + games['AWAY_TOV'] + 0.44 * games['AWAY_FTA'] - games['AWAY_OREB'])

# # Steals Percentage
# games['HOME_STL%'] = (games['HOME_STL'] / games['AWAY_POSS'])
# games['AWAY_STL%'] = (games['AWAY_STL'] / games['HOME_POSS'])

# # Free Throw Rate
# games['HOME_FTR'] = games['HOME_FTM'] / games['HOME_FGA']
# games['AWAY_FTR'] = games['AWAY_FTM'] / games['AWAY_FGA']

# # True Shooting (Requires True Shooting Attempts)
# home_tsa = games['HOME_FGA'] + 0.44 * games['HOME_FTA']
# away_tsa = games['AWAY_FGA'] + 0.44 * games['AWAY_FTA']
# games['HOME_TS'] = games['HOME_PTS'] / (2 * home_tsa)
# games['AWAY_TS'] = games['AWAY_PTS'] / (2 * away_tsa)

# # Assist Rate
# games['HOME_ASTR'] = games['HOME_AST'] / (games['HOME_FGA'] + (.44 * games['HOME_FTA']) + games['HOME_AST'] + games['HOME_TOV'])
# games['AWAY_ASTR'] = games['AWAY_AST'] / (games['AWAY_FGA'] + (.44 * games['AWAY_FTA']) + games['AWAY_AST'] + games['AWAY_TOV'])

# # Total Rebound Percentage
# games['HOME_TRB%'] = (games['HOME_REB'] * (games['HOME_REB'] / 5)) / (games['MIN'] * (games['HOME_REB'] + games['AWAY_REB']))
# games['AWAY_TRB%'] = (games['AWAY_REB'] * (games['AWAY_REB'] / 5)) / (games['MIN'] * (games['AWAY_REB'] + games['HOME_REB']))

# # PACE
# games['HOME_PACE'] = 48 * (games['HOME_POSS'] + games['AWAY_POSS']) / (2 * (games['MIN'] / 5))
# games['AWAY_PACE'] = 48 * (games['AWAY_POSS'] + games['HOME_POSS']) / (2 * (games['MIN'] / 5))

# # Offensive Rating
# games['HOME_ORTG'] = (games['HOME_PTS'] / games['HOME_POSS'])
# games['AWAY_ORTG'] = (games['AWAY_PTS'] / games['AWAY_POSS'])

# # Defensive Rating
# games['HOME_DRTG'] = (games['AWAY_PTS'] / games['HOME_POSS'])
# games['AWAY_DRTG'] = (games['HOME_PTS'] / games['AWAY_POSS'])

In [None]:
# Converting entire DF to numeric (values that cannot be parsed become NaN)
games = games.apply(pd.to_numeric, errors='coerce')

# Columns to normalise. `realcols` was not defined anywhere in the notebook, so
# this cell raised NameError on a fresh kernel.
# NOTE(review): assumes every column should be standardised. Confirm, and narrow
# this list if some columns (e.g. a prediction target) must stay in raw units.
realcols = list(games.columns)

# Normalize Data: z-score each column (subtract its mean, divide by its sample std).
col_means = games[realcols].mean()
# A constant column has std 0; dividing by 1 instead maps it to 0 rather than NaN.
col_stds = games[realcols].std().replace(0, 1)
games[realcols] = (games[realcols] - col_means) / col_stds

# Sending data to CSV
games.to_csv('games_updated.csv', index = False)

# Print head
pd.set_option('display.max_columns', None)
games.head()

In [None]:
# Given a matchup of two teams, each team's most recent x box scores, and the point spread of their (x + 1)th game, build an ML model.

# [recent 15 games box scores averaged], [point spread of 16th game]
# [recent 15 games box scores averaged], [point spread of 17th game]


