In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split

from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.static import teams

# Gets the box score of every single game (NBA + WNBA + others: nba.com doesn't seperate the leagues) 
# from 2014-2021 into a dataframe.
all_games_finder = leaguegamefinder.LeagueGameFinder()
all_games = all_games_finder.get_data_frames()[0]

# Creates empty dataframe with the same column names passed in from the dataframe containing all the games.
games = pd.DataFrame(all_games)

# Appends every game containing an NBA team in the all_games df to the games df. 
# Note: this double loop might legit get us 50% off... need to come up with a better implementation lmao. 
# It takes like 10 minutes... 
#nba_teams = teams.get_teams()
#for team in nba_teams:
#    temp_id = team['id']
#    for index, row in all_games.iterrows():
#        if row['TEAM_ID'] == temp_id:
#            games = games.append(row, ignore_index = True)

games.columns
games.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22021,1610612737,ATL,Atlanta Hawks,22100215,2021-11-17,ATL vs. BOS,W,240,110,...,0.833,6,34,40,28,9,4,11,17,11.0
1,22021,1610612738,BOS,Boston Celtics,22100215,2021-11-17,BOS @ ATL,L,242,99,...,0.824,12,30,42,24,9,2,14,17,-11.0
2,52021,1612709911,IWA,Iowa Wolves,2052100054,2021-11-17,IWA vs. GRG,L,241,110,...,0.769,9,29,38,20,6,6,20,24,-11.0
3,52021,1612709917,GRG,Grand Rapids Gold,2052100054,2021-11-17,GRG @ IWA,W,240,121,...,1.0,14,34,48,24,11,4,18,26,11.0
4,52021,1612709903,SLC,Salt Lake City Stars,2052100056,2021-11-17,SLC vs. SCW,,120,61,...,0.25,5,16,21,14,7,6,15,9,-11.0


In [3]:
#This assumes Min is minutes played for each team
#sort the columns to pair each game with the two teams\
#!!!DONT RUN THIS BLOCK MORE THAN ONCE!!!
games = games.sort_values(by=['GAME_ID'])

gamesteam1 = games[games.index % 2 == 0]
gamesteam2 = games[games.index % 2 == 1]

gamesteam1 = gamesteam1.reset_index(drop=True)
gamesteam2 = gamesteam2.reset_index(drop=True)

games = gamesteam1.join(gamesteam2, rsuffix='_other')
print(games.columns)

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS',
       'SEASON_ID_other', 'TEAM_ID_other', 'TEAM_ABBREVIATION_other',
       'TEAM_NAME_other', 'GAME_ID_other', 'GAME_DATE_other', 'MATCHUP_other',
       'WL_other', 'MIN_other', 'PTS_other', 'FGM_other', 'FGA_other',
       'FG_PCT_other', 'FG3M_other', 'FG3A_other', 'FG3_PCT_other',
       'FTM_other', 'FTA_other', 'FT_PCT_other', 'OREB_other', 'DREB_other',
       'REB_other', 'AST_other', 'STL_other', 'BLK_other', 'TOV_other',
       'PF_other', 'PLUS_MINUS_other'],
      dtype='object')


In [4]:
#Things to drop later:
#team ID, not relevant and good teams might be identified by this => more bias
#Ironically, block percentage only really correlates to total blocks (maybe not, bc there's a few blocks that also correlate either negatively or positively to blkp)
#Maybe freethrough percent? It correlates to a few other things so I'm not sure If we should really. 
games.drop(columns=['SEASON_ID_other', 'GAME_DATE_other']) #irrelevant and basically an exact duplicate of that in the first column.

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT_other,OREB_other,DREB_other,REB_other,AST_other,STL_other,BLK_other,TOV_other,PF_other,PLUS_MINUS_other
0,12015,1610612743,DEN,Denver Nuggets,0011500001,2015-10-02,DEN @ LAC,L,240,96,...,0.769,9,30,39,22,12,6,13,15,7.0
1,12015,1610612753,ORL,Orlando Magic,0011500002,2015-10-03,ORL vs. CHA,L,240,100,...,0.743,7,34,41,30,7,2,13,15,6.0
2,12015,1610612740,NOP,New Orleans Pelicans,0011500003,2015-10-03,NOP @ IND,W,242,110,...,0.613,18,44,62,21,10,16,19,27,-5.0
3,12015,1610612746,LAC,LA Clippers,0011500005,2015-10-04,LAC @ TOR,L,240,73,...,0.619,14,34,48,11,8,6,13,31,-19.0
4,12015,1610612761,TOR,Toronto Raptors,0011500005,2015-10-04,TOR vs. LAC,W,241,93,...,0.708,12,48,60,20,9,4,14,21,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,72021,1612709911,IWA,Iowa Wolves,2072100001,2021-10-29,IWA vs. GLI,W,241,98,...,1.000,14,34,48,24,11,4,18,26,11.0
14996,72021,1612709930,GLI,G League Ignite,2072100002,2021-10-31,GLI @ SXF,L,241,90,...,0.708,7,33,40,17,9,6,19,22,6.6
14997,72021,1612709930,GLI,G League Ignite,2072100013,2021-11-03,GLI @ SCW,L,241,88,...,0.750,2,5,7,0,2,1,4,2,-4.0
14998,32019,1610616833,GNS,Team Giannis,2531900001,2020-01-23,GNS vs. LBN,L,190,139,...,0.875,10,25,35,20,15,2,12,27,14.0


In [5]:
games["BLKP"] = 100 * (games["BLK"] * (games["MIN"]/5))/ (games["MIN"] * (games["FGA_other"] - games["FG3A_other"])) 

#Turnover Percentage (TOV)             | TOV / (FGA + 0.44 * FTA + TOV)     TOV = turnovers
games["TOVP"] = games["TOV"] / (games["FGA"] + 0.44*games["FTA"] + games["TOV"])

#Offensive Rebound Percentage (ORB)    | ORB / (ORB + Opp DRB)              Opp = oppenent
games["ORBP"] = games["OREB"] / (games["OREB"] + games["DREB_other"])

#Defensive Rebound Percentage (DRB)    | DRB / (Opp ORB + DRB)              Opp = oppenent
games["DREB"] = games["DREB"] / (games["OREB_other"] + games["DREB"])

#(POSS)
# need to get the bloody parenthesis correctly done!
#games["POSS"] = 0.5*((games["FGA"] + 0.4*games["FTA"] - 1.07*(games["OREB"]/(games["OREB"] + games["OREB_other"])) * (games["FGA"] - games["FGM"]) + games["TOV"]) + games["FGA_other"] + 0.4*games["FTA_other"] - 1.07 * (games["OREB_other"] / (games["OREB_other"] + games["DREB"])) * (games["FGA_other"] - games["FGM_other]) + games["TOV_other"]))

# simpler POSS
games["POSS"] = 0.96*((games["FGA"]) + games["TOV"] + 0.44*games["FTA"] - games["OREB"])

#STLP
games["STLP"] = 100 * (games["STL"] * (games["MIN"]/5))/ (games["MIN"] * games["POSS"]) #need POSS feature

#FTR
games["FTR"] = games["FTM"] / games["FGA"]

#TS (you'll need to calculate TSA as well)
games["TSA"] = games["FGA"] + 0.44*games["FTA"]
games["TS"] = games["PTS"]/(2*games["TSA"])

#ASTR
games["ASTR"] = 100*games["AST"]/ (((games["MIN"]/(games["MIN"]/5)) * games["FGM"]) - games["FGM"])

#TRB

#PACE

#PER

print(games.columns)
games.head(10)

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS',
       'SEASON_ID_other', 'TEAM_ID_other', 'TEAM_ABBREVIATION_other',
       'TEAM_NAME_other', 'GAME_ID_other', 'GAME_DATE_other', 'MATCHUP_other',
       'WL_other', 'MIN_other', 'PTS_other', 'FGM_other', 'FGA_other',
       'FG_PCT_other', 'FG3M_other', 'FG3A_other', 'FG3_PCT_other',
       'FTM_other', 'FTA_other', 'FT_PCT_other', 'OREB_other', 'DREB_other',
       'REB_other', 'AST_other', 'STL_other', 'BLK_other', 'TOV_other',
       'PF_other', 'PLUS_MINUS_other', 'BLKP', 'TOVP', 'ORBP', 'POSS', 'STLP',
       'FTR', 'TSA', 'TS', 'ASTR'],
      dtype='object')


Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,PLUS_MINUS_other,BLKP,TOVP,ORBP,POSS,STLP,FTR,TSA,TS,ASTR
0,12015,1610612743,DEN,Denver Nuggets,11500001,2015-10-02,DEN @ LAC,L,240,96,...,7.0,0.677966,0.193603,0.318182,100.608,1.789122,0.172414,95.8,0.501044,10.810811
1,12015,1610612753,ORL,Orlando Magic,11500002,2015-10-03,ORL vs. CHA,L,240,100,...,6.0,1.363636,0.109729,0.26087,93.4656,2.353807,0.179775,97.36,0.513558,14.102564
2,12015,1610612740,NOP,New Orleans Pelicans,11500003,2015-10-03,NOP @ IND,W,242,110,...,-5.0,1.052632,0.084434,0.301587,106.8288,1.684939,0.242718,119.28,0.4611,10.810811
3,12015,1610612746,LAC,LA Clippers,11500005,2015-10-04,LAC @ TOR,L,240,73,...,-19.0,0.895522,0.231214,0.128205,94.848,1.265182,0.197183,79.8,0.457393,14.814815
4,12015,1610612761,TOR,Toronto Raptors,11500005,2015-10-04,TOR vs. LAC,W,241,93,...,19.0,2.068966,0.176187,0.157895,94.8864,2.52934,0.39726,88.84,0.523413,8.870968
5,12015,1610612766,CHA,Charlotte Hornets,11500006,2015-10-04,CHA @ MIA,W,240,90,...,-13.0,0.307692,0.154381,0.152174,92.7744,1.509037,0.351351,87.64,0.513464,12.5
6,12015,1610612751,BKN,Brooklyn Nets,11500008,2015-10-05,BKN vs. FEN,L,240,96,...,4.0,2.153846,0.105445,0.265306,87.6672,2.737626,0.234568,93.32,0.514359,11.428571
7,12015,1610612761,TOR,Toronto Raptors,11500009,2015-10-05,TOR @ GSW,L,240,87,...,-4.0,0.0,0.128299,0.3125,90.3552,2.213486,0.223529,95.12,0.457317,11.71875
8,12015,1610612744,GSW,Golden State Warriors,11500009,2015-10-05,GSW vs. TOR,W,240,95,...,5.0,2.545455,0.174252,0.117647,89.8176,1.781388,0.257143,80.56,0.589623,20.714286
9,12015,1610612755,PHI,Philadelphia 76ers,11500011,2015-10-06,PHI @ WAS,L,241,95,...,3.0,2.758621,0.203684,0.157895,102.6432,1.169098,0.146341,89.92,0.528247,14.864865


In [6]:
# code below updates the games.csv with the new features added
# and saves it to games_updated.cs

games_csv = games
games_csv.to_csv("games_updated.csv")

In [None]:
#Data Visualizaiton
plt.figure(figsize=(13,13))
sns.heatmap(games.corr(),cmap='coolwarm',vmin=-1)

In [None]:
# ~ TO-DO: 
#  > simple statistics for each feature; mean, median, nth quartile, ...
#  > simple histogram; just to show basic statistics
#  > cleanup, normalize data (???)
#  > udpate document report