In [152]:
from nba_api.stats.endpoints import playergamelog
from nba_api.stats.endpoints import playerfantasyprofile
from nba_api.stats.static import players
import time
import pandas as pd
#from scipy.stats import zscore

# Access the NBA API via the nba_api package.
-- players.get_active_players(): returns a list of dictionaries, one per active player, each including the player's id and full name

-- Create list of dataframes of each player's game logs using a loop. 

-- The game logs are generated by playergamelog.PlayerGameLog(). 

-- The get_data_frames() call retrieves the game log as a dataframe wrapped in a list, so access it with index 0.

-- pd.concat() concatenates the new list of player gamelog dfs into one big df

-- time.sleep(0.600) spaces out the requests so the calls don't time out; however, I don't want to wait 5 minutes every time I work on this, so I'm saving the df to a csv, which I'll read from hereafter, and commenting out the loop. Of course, I can do this only because the NBA season is over. Mid-season, I'd have to pull the data again each day.

In [153]:
# Fetch the active-player records from nba_api's static player data.
# Despite the name, later cells iterate over this as a sequence of per-player
# dicts and read each record's 'id' and 'full_name' keys.
player_dict = players.get_active_players()

In [154]:
# One-off download of every active player's 2020-21 regular-season game log.
# Intentionally left commented out: the result was saved to the file
# 'player_logs_2021', which the following cells read instead of re-querying
# the API. Uncomment to refresh the cache (the sleep paces the requests).
# plgs = []
# for i in range(len(player_dict)):
#     pid = player_dict[i]['id']
#     plog = playergamelog.PlayerGameLog(player_id = pid,
#                                           season = '2020-21',
#                                           season_type_all_star = 'Regular Season')
#     plogdf = plog.get_data_frames()[0]
#     time.sleep(0.600)
#     plgs.append(plogdf)
# plogs21 = pd.concat(plgs)
# plogs21.to_csv('player_logs_2021', index = False)

# Prepare player logs dataframe (plogs)

-- read in CSV (saved from earlier code)

-- create new dictionary from player_dict that just has player ids as keys and full names as values

-- use id_dict to add player names column to plogs

-- drop unnecessary column

-- use MATCHUP column to determine player's team, opponent, and whether the game was home or away

-- rename columns not consistent with convention

-- convert GAME_DATE to datetime

In [155]:
# Load the cached game logs (one row per player per game) from disk.
plogs = pd.read_csv('player_logs_2021')

# Lookup table: player id -> full name, built from the active-player records.
id_dict = {player['id']: player['full_name'] for player in player_dict}

# Attach player names; ids missing from id_dict map to NaN.
plogs['PLAYER'] = plogs['Player_ID'].map(id_dict)
plogs = plogs.drop(columns = 'VIDEO_AVAILABLE')

# Split MATCHUP once on its "vs." / "@" separator. The pattern is a raw string
# (the old "vs\.|\@" literal relied on invalid escape sequences) and it also
# consumes whitespace around the separator, so TEAM and OPPONENT no longer
# carry stray leading/trailing spaces.
# NOTE(review): assumes MATCHUP values look like 'AAA vs. BBB' or 'AAA @ BBB'
# -- confirm against the CSV.
matchup_parts = plogs['MATCHUP'].str.split(pat = r"\s*(?:vs\.|@)\s*", expand = True)
plogs['TEAM'] = matchup_parts[0].str.strip()
plogs['OPPONENT'] = matchup_parts[1].str.strip()

# An '@' in MATCHUP marks an away game; anything else is treated as home.
is_away = plogs['MATCHUP'].str.contains('@', regex = False)
plogs['HOME_AWAY'] = is_away.map({True: 'AWAY', False: 'HOME'})

# Bring the two mixed-case column names in line with the upper-case convention.
plogs = plogs.rename(columns = {'Player_ID': 'PLAYER_ID', 'Game_ID': 'GAME_ID'})
plogs['GAME_DATE'] = pd.to_datetime(plogs['GAME_DATE'])

# Prepare season averages dataframe (season) and season z-score dataframe (seasonz)

-- preset stats column names

-- create season df using groupby and mean

-- recalculate FG_PCT and FT_PCT to represent season averages rather than per-game averages

-- create seasonz by using the classic z-score formula (which handles NaNs better than scipy's zscore function).

-- _NOTE THAT FG_PCT AND FT_PCT MUST BE ADJUSTED BY VOLUME. RETURN TO THIS!!!_

-- pop PLAYER_ID and PLAYER out of index, convert some data types

In [156]:
# Stat columns that are averaged per player to build the season table.
stats = 'PTS','FGM','FGA', 'FG_PCT', 'FG3M', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'MIN', 'PLUS_MINUS'

# Per-player season averages.
# - list(stats): selecting groupby columns with a tuple is deprecated in pandas
#   and rejected by newer releases, so pass a list.
# - observed=True: the key columns are converted to categoricals further down,
#   so on a re-run of this cell the grouping would otherwise expand to every
#   PLAYER_ID x PLAYER combination and fill the table with empty rows.
season = plogs.groupby(['PLAYER_ID', 'PLAYER'], observed = True)[list(stats)].mean()

# Replace the averaged per-game percentages with made/attempted over the whole
# season (ratio of the per-game means; players with no attempts get NaN).
season['FG_PCT'] = season['FGM'] / season['FGA']
season['FT_PCT'] = season['FTM'] / season['FTA']

# Population z-score of each stat across players; NaNs are skipped by
# mean()/std() and simply propagate for the affected player.
seasonz = (season - season.mean()) / season.std(ddof = 0)

# Store the identifier-like columns of the game logs as categoricals.
to_convert = ['SEASON_ID', 'PLAYER_ID', 'GAME_ID', 'WL', 'PLAYER', 'TEAM', 'OPPONENT', 'HOME_AWAY']
plogs[to_convert] = plogs[to_convert].astype('category')

# Move PLAYER_ID / PLAYER out of the index and make them categorical in both
# season tables. Each table converts its OWN columns (the original read
# seasonz's columns when filling season, which only worked by index alignment).
to_convert = ['PLAYER_ID', 'PLAYER']
seasonz = seasonz.reset_index()
seasonz[to_convert] = seasonz[to_convert].astype('category')
season = season.reset_index()
season[to_convert] = season[to_convert].astype('category')

  season = plogs.groupby(['PLAYER_ID', 'PLAYER'])[stats].mean()


In [157]:
# DataFrame.info() writes its report directly to stdout and returns None, so
# wrapping it in print() only appended a stray "None" after each report.
plogs.info()
season.info()
seasonz.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19757 entries, 0 to 19756
Data columns (total 30 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   SEASON_ID   19757 non-null  category      
 1   PLAYER_ID   19757 non-null  category      
 2   GAME_ID     19757 non-null  category      
 3   GAME_DATE   19757 non-null  datetime64[ns]
 4   MATCHUP     19757 non-null  object        
 5   WL          19757 non-null  category      
 6   MIN         19757 non-null  int64         
 7   FGM         19757 non-null  int64         
 8   FGA         19757 non-null  int64         
 9   FG_PCT      19757 non-null  float64       
 10  FG3M        19757 non-null  int64         
 11  FG3A        19757 non-null  int64         
 12  FG3_PCT     19757 non-null  float64       
 13  FTM         19757 non-null  int64         
 14  FTA         19757 non-null  int64         
 15  FT_PCT      19757 non-null  float64       
 16  OREB        19757 non-