In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split

from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.static import teams

# Gets the box score of every single game (NBA + WNBA + others: nba.com doesn't seperate the leagues) 
# from 2014-2021 into a dataframe.
all_games_finder = leaguegamefinder.LeagueGameFinder()
all_games = all_games_finder.get_data_frames()[0]

# Creates empty dataframe with the same column names passed in from the dataframe containing all the games.
games = pd.DataFrame(all_games)

# Appends every game containing an NBA team in the all_games df to the games df. 
# Note: this double loop might legit get us 50% off... need to come up with a better implementation lmao. 
# It takes like 10 minutes... 
#nba_teams = teams.get_teams()
#for team in nba_teams:
#    temp_id = team['id']
#    for index, row in all_games.iterrows():
#        if row['TEAM_ID'] == temp_id:
#            games = games.append(row, ignore_index = True)

games.columns
games.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22021,1610612766,CHA,Charlotte Hornets,22100213,2021-11-17,CHA vs. WAS,W,241,97,...,0.5,16,43,59,30,6,8,11,16,10.0
1,22021,1610612754,IND,Indiana Pacers,22100214,2021-11-17,IND @ DET,L,240,89,...,0.737,7,36,43,14,5,5,18,22,-8.0
2,22021,1610612765,DET,Detroit Pistons,22100214,2021-11-17,DET vs. IND,W,240,97,...,0.75,8,34,42,16,13,5,12,20,8.0
3,52021,1612709917,GRG,Grand Rapids Gold,2052100054,2021-11-17,GRG @ IWA,W,240,121,...,1.0,14,34,48,24,11,4,18,26,11.0
4,52021,1612709911,IWA,Iowa Wolves,2052100054,2021-11-17,IWA vs. GRG,L,241,110,...,0.769,9,29,38,20,6,6,20,24,-11.0


In [2]:
#This assumes Min is minutes played for each team
#sort the columns to pair each game with the two teams\
#!!!DONT RUN THIS BLOCK MORE THAN ONCE!!!
games = games.sort_values(by=['GAME_ID'])

gamesteam1 = games[games.index % 2 == 0]
gamesteam2 = games[games.index % 2 == 1]

gamesteam1 = gamesteam1.reset_index(drop=True)
gamesteam2 = gamesteam2.reset_index(drop=True)

games = gamesteam1.join(gamesteam2, rsuffix='_other')
print(games.columns)

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS',
       'SEASON_ID_other', 'TEAM_ID_other', 'TEAM_ABBREVIATION_other',
       'TEAM_NAME_other', 'GAME_ID_other', 'GAME_DATE_other', 'MATCHUP_other',
       'WL_other', 'MIN_other', 'PTS_other', 'FGM_other', 'FGA_other',
       'FG_PCT_other', 'FG3M_other', 'FG3A_other', 'FG3_PCT_other',
       'FTM_other', 'FTA_other', 'FT_PCT_other', 'OREB_other', 'DREB_other',
       'REB_other', 'AST_other', 'STL_other', 'BLK_other', 'TOV_other',
       'PF_other', 'PLUS_MINUS_other'],
      dtype='object')


In [4]:
#Things to drop later:
#team ID, not relevant and good teams might be identified by this => more bias
#Ironically, block percentage only really correlates to total blocks (maybe not, bc there's a few blocks that also correlate either negatively or positively to blkp)
#Maybe freethrough percent? It correlates to a few other things so I'm not sure If we should really. 
games.drop(columns=['SEASON_ID_other', 'GAME_DATE_other']) #irrelevant and basically an exact duplicate of that in the first column.

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,POSS_other,STL%,FTR,TSA,TS,ASTR,TRB%,PACE,ORTG,DRTG
0,12015,1610612746,LAC,LA Clippers,0011500001,2015-10-02,LAC vs. DEN,W,238,103,...,100.6080,2.348741,0.219780,102.44,0.502733,14.864865,1.331408,102.247261,100.800138,93.949643
1,12015,1610612740,NOP,New Orleans Pelicans,0011500003,2015-10-03,NOP @ IND,W,242,110,...,92.5440,1.684939,0.242718,119.28,0.461100,10.810811,2.604167,98.862545,102.968488,99.224179
2,12015,1610612754,IND,Indiana Pacers,0011500003,2015-10-03,IND vs. NOP,L,241,105,...,93.4656,1.953613,0.206522,105.64,0.496971,13.815789,2.981347,97.513693,102.564704,97.680670
3,12015,1610612762,UTA,Utah Jazz,0011500004,2015-10-04,UTA @ LAL,W,240,90,...,94.8480,1.950687,0.465753,94.12,0.478113,18.518519,3.225806,93.561600,97.534332,79.111180
4,12015,1610612747,LAL,Los Angeles Lakers,0011500004,2015-10-04,LAL vs. UTA,L,238,71,...,92.7744,1.696526,0.144444,99.24,0.357719,10.576923,1.936134,94.328471,75.283320,95.429560
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,72021,1612709911,IWA,Iowa Wolves,2072100001,2021-10-29,IWA vs. GLI,W,241,98,...,103.6416,2.129327,0.072289,87.84,0.557832,14.583333,2.065902,98.374108,104.337013,110.724993
14996,72021,1612709904,SXF,Sioux Falls Skyforce,2072100002,2021-10-31,SXF vs. GLI,W,240,104,...,82.8288,3.237671,0.076923,94.52,0.550148,12.820513,1.458333,87.744000,112.239260,92.813234
14997,72021,1612709902,SCW,Santa Cruz Warriors,2072100013,2021-11-03,SCW vs. GLI,W,239,110,...,94.9632,1.585834,0.121622,77.96,0.705490,14.743590,1.640167,92.005757,124.601276,98.548282
14998,32019,1610616834,LBN,Team LeBron,2531900001,2020-01-23,LBN @ GNS,W,179,139,...,84.4032,1.173489,0.063063,114.52,0.606881,15.566038,3.114669,125.136804,135.929090,88.011641


In [5]:
games["BLK%"] = 100 * (games["BLK"] * (games["MIN"]/5))/ (games["MIN"] * (games["FGA_other"] - games["FG3A_other"])) 

#Turnover Percentage (TOV)             | TOV / (FGA + 0.44 * FTA + TOV)     TOV = turnovers
games["TOV%"] = games["TOV"] / (games["FGA"] + 0.44*games["FTA"] + games["TOV"])

#Offensive Rebound Percentage (ORB)    | ORB / (ORB + Opp DRB)              Opp = oppenent
games["ORB%"] = games["OREB"] / (games["OREB"] + games["DREB_other"])

#Defensive Rebound Percentage (DRB)    | DRB / (Opp ORB + DRB)              Opp = oppenent
games["DREB%"] = games["DREB"] / (games["OREB_other"] + games["DREB"])

#(POSS)
# need to get the bloody parenthesis correctly done!
#games["POSS"] = 0.5*((games["FGA"] + 0.4*games["FTA"] - 1.07*(games["OREB"]/(games["OREB"] + games["OREB_other"])) * (games["FGA"] - games["FGM"]) + games["TOV"]) + games["FGA_other"] + 0.4*games["FTA_other"] - 1.07 * (games["OREB_other"] / (games["OREB_other"] + games["DREB"])) * (games["FGA_other"] - games["FGM_other]) + games["TOV_other"]))

# simpler POSS
games["POSS"] = 0.96*((games["FGA"]) + games["TOV"] + 0.44*games["FTA"] - games["OREB"])
# used for PACE; probably wrong
games["POSS_other"] = 0.96*((games["FGA_other"]) + games["TOV_other"] + 0.44*games["FTA_other"] - games["OREB_other"])

#STLP
games["STL%"] = 100 * (games["STL"] * (games["MIN"]/5))/ (games["MIN"] * games["POSS"]) #need POSS feature

#FTR
games["FTR"] = games["FTM"] / games["FGA"]

#TS (you'll need to calculate TSA as well)
games["TSA"] = games["FGA"] + 0.44*games["FTA"]
games["TS"] = games["PTS"]/(2*games["TSA"])

#ASTR
games["ASTR"] = 100*games["AST"]/ (((games["MIN"]/(games["MIN"]/5)) * games["FGM"]) - games["FGM"])

#TRB
games["TRB%"] = 100*(games["REB"] * (games["REB"]/5))/(games["MIN"] * (games["REB"] + games["REB_other"]))

#PACE
games["PACE"] = 48*((games["POSS"] + games["POSS_other"])/(2*(games["MIN"]/5)))

#ORTG
games["ORTG"] = 100*(games["PTS"]/games["POSS"])

#DRTG
games["DRTG"] = 100*(games["PTS_other"]/games["POSS"])

#PER



print(games.columns)
games.head(10)

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS',
       'SEASON_ID_other', 'TEAM_ID_other', 'TEAM_ABBREVIATION_other',
       'TEAM_NAME_other', 'GAME_ID_other', 'GAME_DATE_other', 'MATCHUP_other',
       'WL_other', 'MIN_other', 'PTS_other', 'FGM_other', 'FGA_other',
       'FG_PCT_other', 'FG3M_other', 'FG3A_other', 'FG3_PCT_other',
       'FTM_other', 'FTA_other', 'FT_PCT_other', 'OREB_other', 'DREB_other',
       'REB_other', 'AST_other', 'STL_other', 'BLK_other', 'TOV_other',
       'PF_other', 'PLUS_MINUS_other', 'BLK%', 'TOV%', 'ORB%', 'DREB%', 'POSS',
       'POSS_other', 'STL%', 'FTR', 'TSA', 'TS', 'ASTR', 'TRB%', 'PACE',
       'ORTG', 'DRTG'],
      dtype='object')


Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,POSS_other,STL%,FTR,TSA,TS,ASTR,TRB%,PACE,ORTG,DRTG
0,12015,1610612746,LAC,LA Clippers,11500001,2015-10-02,LAC vs. DEN,W,238,103,...,100.608,2.348741,0.21978,102.44,0.502733,14.864865,1.331408,102.247261,100.800138,93.949643
1,12015,1610612740,NOP,New Orleans Pelicans,11500003,2015-10-03,NOP @ IND,W,242,110,...,92.544,1.684939,0.242718,119.28,0.4611,10.810811,2.604167,98.862545,102.968488,99.224179
2,12015,1610612754,IND,Indiana Pacers,11500003,2015-10-03,IND vs. NOP,L,241,105,...,93.4656,1.953613,0.206522,105.64,0.496971,13.815789,2.981347,97.513693,102.564704,97.68067
3,12015,1610612762,UTA,Utah Jazz,11500004,2015-10-04,UTA @ LAL,W,240,90,...,94.848,1.950687,0.465753,94.12,0.478113,18.518519,3.225806,93.5616,97.534332,79.11118
4,12015,1610612747,LAL,Los Angeles Lakers,11500004,2015-10-04,LAL vs. UTA,L,238,71,...,92.7744,1.696526,0.144444,99.24,0.357719,10.576923,1.936134,94.328471,75.28332,95.42956
5,12015,1610612761,TOR,Toronto Raptors,11500005,2015-10-04,TOR vs. LAC,W,241,93,...,88.2432,2.52934,0.39726,88.84,0.523413,8.870968,2.172765,91.184863,98.011938,106.443073
6,12015,1610612748,MIA,Miami Heat,11500006,2015-10-04,MIA vs. CHA,L,240,77,...,87.6672,1.977848,0.17284,89.8,0.428731,15.833333,2.181818,89.3376,84.607947,105.485232
7,12015,1610612757,POR,Portland Trail Blazers,11500007,2015-10-05,POR vs. SAC,L,266,105,...,90.3552,2.082576,0.111111,106.04,0.495096,14.285714,2.033083,88.418165,99.395674,82.356416
8,12015,1610612758,SAC,Sacramento Kings,11500007,2015-10-05,SAC @ POR,W,265,109,...,102.4512,1.85482,0.197802,103.32,0.527487,14.880952,1.986097,95.220408,101.087666,108.506944
9,12015,1610612744,GSW,Golden State Warriors,11500009,2015-10-05,GSW vs. TOR,W,240,95,...,103.9104,1.781388,0.257143,80.56,0.589623,20.714286,1.462607,96.864,105.769916,126.923899


In [6]:
# code below updates the games.csv with the new features added
# and saves it to games_updated.cs

games_csv = games
games_csv.to_csv("games_updated.csv")

In [None]:
#Data Visualizaiton
plt.figure(figsize=(13,13))
sns.heatmap(games.corr(),cmap='coolwarm',vmin=-1)

In [None]:
# ~ TO-DO: 
#  > simple statistics for each feature; mean, median, nth quartile, ...
#  > simple histogram; just to show basic statistics
#  > cleanup, normalize data (???)
#  > udpate document report