In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split

from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.static import teams

# Gets the box score of every single game (NBA + WNBA + others: nba.com doesn't seperate the leagues) 
# from 2014-2021 into a dataframe.
all_games_finder = leaguegamefinder.LeagueGameFinder()
all_games = all_games_finder.get_data_frames()[0]

# Creates empty dataframe with the same column names passed in from the dataframe containing all the games.
games = pd.DataFrame(all_games)

# Appends every game containing an NBA team in the all_games df to the games df. 
# Note: this double loop might legit get us 50% off... need to come up with a better implementation lmao. 
# It takes like 10 minutes... 
#nba_teams = teams.get_teams()
#for team in nba_teams:
#    temp_id = team['id']
#    for index, row in all_games.iterrows():
#        if row['TEAM_ID'] == temp_id:
#            games = games.append(row, ignore_index = True)

games.columns
games.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22021,1610612738,BOS,Boston Celtics,22100215,2021-11-17,BOS @ ATL,,180,81,...,0.909,8,21,29,18,8,1,10,12,-14.0
1,22021,1610612737,ATL,Atlanta Hawks,22100215,2021-11-17,ATL vs. BOS,,180,94,...,0.846,4,28,32,24,6,3,9,12,14.0
2,22021,1610612751,BKN,Brooklyn Nets,22100216,2021-11-17,BKN vs. CLE,,169,81,...,0.789,4,29,33,25,6,1,8,13,19.4
3,22021,1610612765,DET,Detroit Pistons,22100214,2021-11-17,DET vs. IND,,217,90,...,0.727,8,30,38,16,13,5,12,18,4.2
4,52021,1612709923,WCB,Windy City Bulls,2052100055,2021-11-17,WCB vs. MCC,,120,61,...,0.714,5,26,31,16,5,3,11,18,1.0


In [2]:
#This assumes Min is minutes played for each team
#sort the columns to pair each game with the two teams\
#!!!DONT RUN THIS BLOCK MORE THAN ONCE!!!
games = games.sort_values(by=['GAME_ID'])

gamesteam1 = games[games.index % 2 == 0]
gamesteam2 = games[games.index % 2 == 1]

gamesteam1 = gamesteam1.reset_index(drop=True)
gamesteam2 = gamesteam2.reset_index(drop=True)

games = gamesteam1.join(gamesteam2, rsuffix='_other')
print(games.columns)

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS',
       'SEASON_ID_other', 'TEAM_ID_other', 'TEAM_ABBREVIATION_other',
       'TEAM_NAME_other', 'GAME_ID_other', 'GAME_DATE_other', 'MATCHUP_other',
       'WL_other', 'MIN_other', 'PTS_other', 'FGM_other', 'FGA_other',
       'FG_PCT_other', 'FG3M_other', 'FG3A_other', 'FG3_PCT_other',
       'FTM_other', 'FTA_other', 'FT_PCT_other', 'OREB_other', 'DREB_other',
       'REB_other', 'AST_other', 'STL_other', 'BLK_other', 'TOV_other',
       'PF_other', 'PLUS_MINUS_other'],
      dtype='object')


In [None]:
#Things to drop later:
#team ID, not relevant and good teams might be identified by this => more bias
#Ironically, block percentage only really correlates to total blocks (maybe not, bc there's a few blocks that also correlate either negatively or positively to blkp)
#Maybe freethrough percent? It correlates to a few other things so I'm not sure If we should really. 
games.drop(columns=['SEASON_ID_other', 'GAME_DATE_other']) #irrelevant and basically an exact duplicate of that in the first column.

In [10]:
#Brett Owen feature chunk
#STLP = Steal Percentage = 100 * (steals * (team's minutes played / 5)) / (minutes played * opp possessions)
#games["STLP"] = 100 * (games["STL"] * (games["MIN"]/5))/ (games["MIN"] * games["POSS"]) #need POSS feature

games["BLKP"] = 100 * (games["BLK"] * (games["MIN"]/5))/ (games["MIN"] * (games["FGA_other"] - games["FG3A_other"])) 

#Turnover Percentage (TOV)             | TOV / (FGA + 0.44 * FTA + TOV)     TOV = turnovers
games["TOVP"] = games["TOV"] / (games["FGA"] + 0.44*games["FTA"] + games["TOV"])

#Offensive Rebound Percentage (ORB)    | ORB / (ORB + Opp DRB)              Opp = oppenent
games["ORBP"] = games["OREB"] / (games["OREB"] + games["DREB_other"])

#Defensive Rebound Percentage (DRB)    | DRB / (Opp ORB + DRB)              Opp = oppenent
games["DREB"] = games["DREB"] / (games["OREB_other"] + games["DREB"])

#(POSS)
# need to get the bloody parenthesis correctly done!
#games["POSS"] = 0.5*((games["FGA"] + 0.4*games["FTA"] - 1.07*(games["OREB"]/(games["OREB"] + games["OREB_other"])) * (games["FGA"] - games["FGM"]) + games["TOV"]) + games["FGA_other"] + 0.4*games["FTA_other"] - 1.07 * (games["OREB_other"] / (games["OREB_other"] + games["DREB"])) * (games["FGA_other"] - games["FGM_other]) + games["TOV_other"]))

# simpler POSS
games["POSS"] = 0.96*((games["FGA"]) + games["TOV"] + 0.44*games["FTA"] - games["OREB"])

#FTR
games["FTR"] = games["FTM"] / games["FGA"]

#TS (you'll need to calculate TSA as well)

games["TSA"] = games["FGA"] + 0.44*games["FTA"]
games["TS"] = games["PTS"]/(2*games["TSA"])

#ASTR
games["ASTR"] = 100*games["AST"]/ (((games["MIN"]/(games["MIN"]/5)) * games["FGM"]) - games["FGM"])

#TRB

#PACE

#PER

print(games.columns)
games.head(10)

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS',
       'SEASON_ID_other', 'TEAM_ID_other', 'TEAM_ABBREVIATION_other',
       'TEAM_NAME_other', 'GAME_ID_other', 'GAME_DATE_other', 'MATCHUP_other',
       'WL_other', 'MIN_other', 'PTS_other', 'FGM_other', 'FGA_other',
       'FG_PCT_other', 'FG3M_other', 'FG3A_other', 'FG3_PCT_other',
       'FTM_other', 'FTA_other', 'FT_PCT_other', 'OREB_other', 'DREB_other',
       'REB_other', 'AST_other', 'STL_other', 'BLK_other', 'TOV_other',
       'PF_other', 'PLUS_MINUS_other', 'BLKP', 'TOVP', 'ORBP', 'FTR', 'TSA',
       'TS', 'ASTR', 'POSS'],
      dtype='object')


Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,PF_other,PLUS_MINUS_other,BLKP,TOVP,ORBP,FTR,TSA,TS,ASTR,POSS
0,12015,1610612743,DEN,Denver Nuggets,11500001,2015-10-02,DEN @ LAC,L,240,96,...,15,7.0,0.677966,0.193603,0.318182,0.172414,95.8,0.501044,10.810811,100.608
1,12015,1610612753,ORL,Orlando Magic,11500002,2015-10-03,ORL vs. CHA,L,240,100,...,15,6.0,1.363636,0.109729,0.26087,0.179775,97.36,0.513558,14.102564,93.4656
2,12015,1610612754,IND,Indiana Pacers,11500003,2015-10-03,IND vs. NOP,L,241,105,...,22,5.0,4.507042,0.152439,0.333333,0.206522,105.64,0.496971,13.815789,102.3744
3,12015,1610612762,UTA,Utah Jazz,11500004,2015-10-04,UTA @ LAL,W,240,90,...,31,-19.0,1.19403,0.129486,0.26087,0.465753,94.12,0.478113,18.518519,92.2752
4,12015,1610612761,TOR,Toronto Raptors,11500005,2015-10-04,TOR vs. LAC,W,241,93,...,29,-20.0,2.142857,0.176187,0.243243,0.39726,88.84,0.523413,8.870968,94.8864
5,12015,1610612748,MIA,Miami Heat,11500006,2015-10-04,MIA vs. CHA,L,240,77,...,18,4.0,1.230769,0.134875,0.2,0.17284,89.8,0.428731,15.833333,91.008
6,12015,1610612766,CHA,Charlotte Hornets,11500006,2015-10-04,CHA @ MIA,W,240,90,...,19,-8.0,0.289855,0.154381,0.175,0.351351,87.64,0.513464,12.5,92.7744
7,12015,1610612757,POR,Portland Trail Blazers,11500007,2015-10-05,POR vs. SAC,L,266,105,...,22,8.0,1.509434,0.178239,0.365385,0.111111,106.04,0.495096,14.285714,105.6384
8,12015,12321,FEN,Istanbul Fenerbahce Sports Club,11500008,2015-10-05,FEN @ BKN,W,240,101,...,40,3.0,1.37931,0.176609,0.238095,0.210526,83.92,0.601764,17.567568,88.2432
9,12015,1610612751,BKN,Brooklyn Nets,11500008,2015-10-05,BKN vs. FEN,L,240,96,...,15,34.0,2.0,0.105445,0.254902,0.234568,93.32,0.514359,11.428571,87.6672


In [8]:
# code below updates the games.csv with the new features added
# and saves it to games_updated.cs

games_csv = games
games_csv.to_csv("games_updated.csv")

In [None]:
#Data Visualizaiton
plt.figure(figsize=(13,13))
sns.heatmap(games.corr(),cmap='coolwarm',vmin=-1)

In [None]:
# ~ TO-DO: 
#  > simple statistics for each feature; mean, median, nth quartile, ...
#  > simple histogram; just to show basic statistics
#  > cleanup, normalize data (???)
#  > udpate document report