In [None]:
import pandas as pd
import numpy as np
import time

from tqdm.auto import tqdm
from basketball_reference_web_scraper.data import OutputType, OutputWriteOption
from basketball_reference_web_scraper.client import players_season_totals, players_advanced_season_totals

In [None]:
player_totals_basic = pd.DataFrame()
for i in range(2000, 2025):
    season_totals = pd.DataFrame(players_season_totals(season_end_year=i))
    season_totals['season_end'] = [i] * len(season_totals)
    player_totals_basic = pd.concat([player_totals_basic, season_totals], axis=0)
player_totals_basic.set_index('season_end', inplace=True)
player_totals_basic.to_csv('2000_2024_player_totals_basic.csv')

In [None]:
player_totals_advanced = pd.DataFrame()
for i in range(2000, 2025):
    season_totals = pd.DataFrame(players_advanced_season_totals(season_end_year=i))
    season_totals['season_end'] = [i] * len(season_totals)
    player_totals_advanced = pd.concat([player_totals_advanced, season_totals], axis=0)
player_totals_advanced.set_index('season_end', inplace=True)
player_totals_advanced.to_csv('2000_2024_player_totals_advanced.csv')

In [None]:
player_totals_basic = pd.read_csv('2000_2024_player_totals_basic.csv', index_col=0)
player_totals_basic

In [None]:
player_totals_basic_cleaned = player_totals_basic.groupby('season_end')
agg_functions = {'slug': 'last', 'positions': 'last', 'age': 'last', 'team': 'last'}
agg_functions.update(dict(zip(player_totals_basic.columns[5:], ['sum'] * len(player_totals_basic.columns[5:]))))
player_totals_basic_cleaned = player_totals_basic_cleaned.transform(lambda x: x).groupby(['season_end', 'name']).aggregate(agg_functions)
player_totals_basic_cleaned['made_two_point_field_goals'] = player_totals_basic_cleaned['made_field_goals'] - player_totals_basic_cleaned['made_three_point_field_goals']
player_totals_basic_cleaned['attempted_two_point_field_goals'] = player_totals_basic_cleaned['attempted_field_goals'] - player_totals_basic_cleaned['attempted_three_point_field_goals']
player_totals_basic_cleaned

In [None]:
player_totals_basic_pg = pd.DataFrame()
games_played = player_totals_basic_cleaned['games_played']
player_totals_basic_pg['MP/G'] = player_totals_basic_cleaned['minutes_played'] / games_played
player_totals_basic_pg['GS%'] = player_totals_basic_cleaned['games_started'] / games_played
player_totals_basic_pg['PS/G'] = player_totals_basic_cleaned['points'] / games_played
player_totals_basic_pg['FG/G'] = player_totals_basic_cleaned['made_field_goals'] / games_played
player_totals_basic_pg['FGA'] = player_totals_basic_cleaned['attempted_field_goals'] / games_played
player_totals_basic_pg['FG%'] = player_totals_basic_cleaned['made_field_goals'] / player_totals_basic_cleaned['attempted_field_goals']
player_totals_basic_pg['2P/G'] = player_totals_basic_cleaned['made_two_point_field_goals'] / games_played
player_totals_basic_pg['2PA/G'] = player_totals_basic_cleaned['attempted_two_point_field_goals'] / games_played
player_totals_basic_pg['2P%'] = player_totals_basic_cleaned['made_two_point_field_goals'] / player_totals_basic_cleaned['attempted_two_point_field_goals']
player_totals_basic_pg['3P/G'] = player_totals_basic_cleaned['made_three_point_field_goals'] / games_played
player_totals_basic_pg['3PA/G'] = player_totals_basic_cleaned['attempted_three_point_field_goals'] / games_played
player_totals_basic_pg['3P%'] = player_totals_basic_cleaned['made_three_point_field_goals'] / player_totals_basic_cleaned['attempted_three_point_field_goals']
player_totals_basic_pg['FT/G'] = player_totals_basic_cleaned['made_free_throws'] / games_played
player_totals_basic_pg['FTA/G'] = player_totals_basic_cleaned['attempted_free_throws'] / games_played
player_totals_basic_pg['FT%'] = player_totals_basic_cleaned['made_free_throws'] / player_totals_basic_cleaned['attempted_free_throws']
player_totals_basic_pg['ORB/G'] = player_totals_basic_cleaned['offensive_rebounds'] / games_played
player_totals_basic_pg['DRB/G'] = player_totals_basic_cleaned['defensive_rebounds'] / games_played
player_totals_basic_pg['TRB/G'] = (player_totals_basic_cleaned['offensive_rebounds'] + player_totals_basic_cleaned['defensive_rebounds']) / games_played
player_totals_basic_pg['AST/G'] = player_totals_basic_cleaned['assists'] / games_played
player_totals_basic_pg['STL/G'] = player_totals_basic_cleaned['steals'] / games_played
player_totals_basic_pg['BLK/G'] = player_totals_basic_cleaned['blocks'] / games_played
player_totals_basic_pg['TOV/G'] = player_totals_basic_cleaned['turnovers'] / games_played
player_totals_basic_pg

In [None]:
player_totals_advanced = pd.read_csv('2000_2024_player_totals_advanced.csv', index_col=0)
player_totals_advanced

In [None]:
player_totals_advanced_cleaned = player_totals_advanced.groupby('season_end')
agg_functions = {'slug': 'last', 'positions': 'last', 'age': 'last', 'team': 'last'}
agg_functions.update(dict(zip(player_totals_advanced.columns[5:], ['sum'] * len(player_totals_advanced.columns[5:]))))
player_totals_advanced_cleaned = player_totals_advanced_cleaned.transform(lambda x: x).groupby(['season_end', 'name']).aggregate(agg_functions)
player_totals_advanced_cleaned

In [84]:
player_totals_advanced_pg = pd.DataFrame()
games_played = player_totals_basic_cleaned['games_played']
player_totals_advanced_pg['PER'] = player_totals_advanced_cleaned['player_efficiency_rating']
player_totals_advanced_pg['TS%'] = player_totals_advanced_cleaned['true_shooting_percentage']
player_totals_advanced_pg['eFG%'] = (player_totals_basic_cleaned['made_two_point_field_goals'] + 1.5 * player_totals_basic_cleaned['made_three_point_field_goals']) / player_totals_basic_cleaned['attempted_field_goals']
player_totals_advanced_pg['3PAr'] = player_totals_basic_cleaned['attempted_three_point_field_goals'] / player_totals_basic_cleaned['attempted_field_goals']
player_totals_advanced_pg['FTr'] = player_totals_basic_cleaned['attempted_free_throws'] / player_totals_basic_cleaned['attempted_field_goals']
player_totals_advanced_pg['ORB%'] = player_totals_advanced_cleaned['offensive_rebound_percentage']
player_totals_advanced_pg['DRB%'] = player_totals_advanced_cleaned['defensive_rebound_percentage']
player_totals_advanced_pg['TRB%'] = player_totals_advanced_cleaned['total_rebound_percentage']
player_totals_advanced_pg['AST%'] = player_totals_advanced_cleaned['assist_percentage']
player_totals_advanced_pg['USG%'] = player_totals_advanced_cleaned['usage_percentage']
player_totals_advanced_pg['STL%'] = player_totals_advanced_cleaned['steal_percentage']
player_totals_advanced_pg['BLK%'] = player_totals_advanced_cleaned['block_percentage']
player_totals_advanced_pg['TOV%'] = player_totals_advanced_cleaned['turnover_percentage']
player_totals_advanced_pg

Unnamed: 0_level_0,Unnamed: 1_level_0,PER,TS%,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,USG%,STL%,BLK%,TOV%
season_end,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2000,A.C. Green,11.2,0.482,0.448320,0.010336,0.245478,9.0,17.9,13.5,5.9,11.0,1.4,0.7,11.0
2000,A.J. Bramlett,-0.4,0.190,0.190476,0.000000,0.000000,21.7,18.5,20.1,0.0,17.1,0.8,0.0,12.5
2000,Aaron McKie,13.2,0.497,0.448567,0.204047,0.246206,2.6,11.4,7.0,19.5,17.3,2.9,0.7,14.7
2000,Aaron Williams,17.7,0.572,0.522222,0.006667,0.446667,11.7,18.9,15.2,6.2,17.8,1.4,4.3,12.9
2000,Adam Keefe,7.6,0.463,0.407692,0.007692,0.276923,9.3,18.0,13.7,8.5,14.7,1.5,1.7,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024,Zach Collins,14.1,0.585,0.550162,0.352751,0.249191,7.7,17.1,12.3,18.6,21.4,1.0,2.6,17.2
2024,Zach LaVine,15.8,0.578,0.528169,0.450704,0.287324,1.1,16.1,8.4,17.3,24.1,1.3,1.0,10.5
2024,Zeke Nnaji,11.7,0.544,0.505952,0.178571,0.464286,13.1,12.0,12.6,6.5,16.1,1.2,5.2,17.2
2024,Ziaire Williams,9.9,0.526,0.490421,0.498084,0.183908,3.8,16.7,10.1,10.8,20.0,1.3,1.2,13.8


In [72]:
from nba_api.stats.endpoints import shotchartdetail
from nba_api.stats.static import players

def get_shot_data(player_full_name: str, season: str):
    player_dictionary = players.get_players()
    try:
        player_info = [player for player in player_dictionary if player['full_name'] == player_full_name][0]
    except Exception as e:
        return None
    player_id = player_info['id']

    player_shotlog = shotchartdetail.ShotChartDetail(team_id = 0, player_id = player_id, season_nullable=season, context_measure_simple = 'FGA', season_type_all_star = ['Regular Season'])

    player_df = player_shotlog.get_data_frames()[0]
    return player_df

In [74]:
shot_data_cleaned = pd.DataFrame()
shot_data = get_shot_data('LeBron James', season='2023-24')
shot_data

Unnamed: 0,GRID_TYPE,GAME_ID,GAME_EVENT_ID,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_NAME,PERIOD,MINUTES_REMAINING,SECONDS_REMAINING,...,SHOT_ZONE_AREA,SHOT_ZONE_RANGE,SHOT_DISTANCE,LOC_X,LOC_Y,SHOT_ATTEMPTED_FLAG,SHOT_MADE_FLAG,GAME_DATE,HTM,VTM
0,Shot Chart Detail,0022300015,7,2544,LeBron James,1610612747,Los Angeles Lakers,1,11,40,...,Center(C),16-24 ft.,17,-47,172,1,0,20231110,PHX,LAL
1,Shot Chart Detail,0022300015,11,2544,LeBron James,1610612747,Los Angeles Lakers,1,11,12,...,Center(C),Less Than 8 ft.,0,2,5,1,1,20231110,PHX,LAL
2,Shot Chart Detail,0022300015,20,2544,LeBron James,1610612747,Los Angeles Lakers,1,10,1,...,Left Side(L),8-16 ft.,12,-119,32,1,0,20231110,PHX,LAL
3,Shot Chart Detail,0022300015,36,2544,LeBron James,1610612747,Los Angeles Lakers,1,9,0,...,Center(C),Less Than 8 ft.,0,1,8,1,1,20231110,PHX,LAL
4,Shot Chart Detail,0022300015,72,2544,LeBron James,1610612747,Los Angeles Lakers,1,5,18,...,Left Side(L),8-16 ft.,12,-82,89,1,0,20231110,PHX,LAL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
669,Shot Chart Detail,0022301230,233,2544,LeBron James,1610612747,Los Angeles Lakers,2,7,6,...,Center(C),8-16 ft.,9,-1,91,1,1,20231207,LAL,NOP
670,Shot Chart Detail,0022301230,300,2544,LeBron James,1610612747,Los Angeles Lakers,2,2,30,...,Center(C),Less Than 8 ft.,1,10,2,1,1,20231207,LAL,NOP
671,Shot Chart Detail,0022301230,340,2544,LeBron James,1610612747,Los Angeles Lakers,3,11,24,...,Center(C),Less Than 8 ft.,3,-36,1,1,1,20231207,LAL,NOP
672,Shot Chart Detail,0022301230,380,2544,LeBron James,1610612747,Los Angeles Lakers,3,8,5,...,Center(C),24+ ft.,26,23,269,1,1,20231207,LAL,NOP


In [88]:
shot_data_aggregated = pd.DataFrame()
shot_data_aggregated['shot_made'] = shot_data.groupby('SHOT_ZONE_BASIC')['SHOT_MADE_FLAG'].sum()
shot_data_aggregated['shots_attempted'] = shot_data.groupby('SHOT_ZONE_BASIC')['SHOT_ATTEMPTED_FLAG'].sum()
shot_data_aggregated['shots_pct'] = shot_data_aggregated['shot_made'] / shot_data_aggregated['shots_attempted']
shot_data_aggregated = shot_data_aggregated.unstack().reorder_levels([1, 0]).sort_index()

KeyError: '2009'

In [98]:
player_totals_advanced_pg.groupby('season_end').get_group(2000).index.get_level_values('name').unique()
# player_totals_advanced_pg

Index(['A.C. Green', 'A.J. Bramlett', 'Aaron McKie', 'Aaron Williams',
       'Adam Keefe', 'Adonal Foyle', 'Adrian Griffin', 'Al Harrington',
       'Alan Henderson', 'Aleksandar Radojević',
       ...
       'Vonteego Cummings', 'Voshon Lenard', 'Wally Szczerbiak',
       'Walt Williams', 'Walter McCarty', 'Wayne Turner', 'Wesley Person',
       'Will Perdue', 'William Avery', 'Žan Tabak'],
      dtype='object', name='name', length=439)

In [99]:
shot_data_agg = pd.DataFrame()
for i in tqdm(range(8, 25)):
    season_start = f'200{i}' if i < 10 else f'20{i}'
    season_end = f'0{i+1}' if i < 10 else f'{i+1}'
    season_end_idx = f'20{season_end}'
    season = f'{season_start}-{season_end}'
    for player in player_totals_advanced_pg.groupby('season_end').get_group(int(season_end_idx)).index.get_level_values('name').unique():
        failed = 0
        while(True):
            try:
                player_shot_data = get_shot_data(player, season)
                if player_shot_data is None or len(player_shot_data) == 0:
                    print(f'Player data not found: {player}')
                    break
                player_shot_data_agg = pd.DataFrame()
                player_shot_data_agg['shot_made'] = player_shot_data.groupby('SHOT_ZONE_BASIC')['SHOT_MADE_FLAG'].sum()
                player_shot_data_agg['shots_attempted'] = player_shot_data.groupby('SHOT_ZONE_BASIC')['SHOT_ATTEMPTED_FLAG'].sum()
                player_shot_data_agg['shots_pct'] = player_shot_data_agg['shot_made'] / player_shot_data_agg['shots_attempted']
                player_shot_data_series = player_shot_data_agg.unstack().reorder_levels([1, 0]).sort_index()
                player_shot_data_agg = pd.DataFrame()
                player_shot_data_agg['METRICS'] = player_shot_data_series
                player_shot_data_agg['season_end'] = len(player_shot_data_agg) * [f'200{season_end}' if int(season_end) < 10 else f'20{season_end}']
                player_shot_data_agg['player'] = len(player_shot_data_agg) * [player]
                player_shot_data_agg.set_index(['player', 'season_end'], inplace=True, append=True)
                shot_data_agg = pd.concat([shot_data_agg, player_shot_data_agg])
                shot_data_agg.to_csv('2008_2024_player_shot_metrics.csv')
            except Exception as e:
                failed+=1
                print(f'Failed {failed} {"times" if failed > 1 else "time"}. Trying again...')
                print(f'Error displayed: {e}')
                time.sleep(30)
                continue
            break

  0%|          | 0/17 [00:00<?, ?it/s]

Player data not found: Alexis Ajinça
Player data not found: Anderson Varejão
Player data not found: Andris Biedriņš
Player data not found: Andrés Nocioni
Player data not found: C.J. Miles
Player data not found: D.J. Mbenga
Player data not found: D.J. White
Player data not found: Darko Miličić
Player data not found: Dee Brown
Player data not found: Donté Greene
Player data not found: Eduardo Nájera
Player data not found: Francisco García
Player data not found: Goran Dragić
Player data not found: Hedo Türkoğlu
Player data not found: J.J. Hickson
Player data not found: J.J. Redick
Player data not found: J.R. Smith
Player data not found: José Calderón
Player data not found: Manu Ginóbili
Player data not found: Marko Jarić
Player data not found: Mickaël Piétrus
Player data not found: Nenad Krstić
Player data not found: Nenê
Player data not found: Peja Stojaković
Player data not found: Rasho Nesterović
Player data not found: Roger Mason
Player data not found: Roko Ukić
Player data not found:

KeyError: 20010