In [4]:
from nba_api.stats.endpoints import ShotChartDetail, LeagueGameLog, BoxScorePlayerTrackV3, BoxScoreAdvancedV3, playbyplayv3, SynergyPlayTypes, BoxScoreTraditionalV3
import pandas as pd
import time
#seasons = ['2020-21','2021-22','2022-23','2023-24']
seasons = ['2013-14', '2014-15', '2015-16', '2016-17', '2017-18', '2018-19', '2019-20','2020-21','2021-22','2022-23','2023-24']

# Data Fetching
Data gathered through the NBA API with the basic pattern for most endpoints being get the games in a season and query the endpoint for each game. Play type data is not done on a game by game basis so is retrived sepratly. 

In [5]:
def get_game_ids(season, season_type='Regular Season'):
    game_log = LeagueGameLog(season=season, season_type_all_star=season_type).get_data_frames()[0]
    game_ids = game_log['GAME_ID'].unique().tolist()
    return game_ids

def get_data(game_ids):
    play_by_play_data = []
    shotchart_data = []
    playertracking_data = []
    advanced_data = []
    box_data = []
    
    for game_id in game_ids:
        try:
            #standard box
            try:
                box = BoxScoreTraditionalV3(game_id = game_id).get_data_frames()[0]
                box['GAME_ID'] = game_id
                box_data.append(box)
            except Exception as e:
                print(f"Failed box traditional for {game_id}: {e}")
                time.sleep(1)
            
            #play by play
            try:
                play_by_play = playbyplayv3.PlayByPlayV3(game_id=game_id).get_data_frames()[0]
                play_by_play['GAME_ID'] = game_id
                play_by_play_data.append(play_by_play)
            except Exception as e:
                print(f"Failed PlayByPlayV2 for {game_id}: {e}")
                time.sleep(1)
                
            #ShotChartDetail
            try:
                shotchart = ShotChartDetail(game_id_nullable=game_id, team_id=0, player_id=0,context_measure_simple='FGA').get_data_frames()[0]
                shotchart['GAME_ID'] = game_id
                shotchart_data.append(shotchart)
            except Exception as e:
                print(f"Failed ShotChartDetail for {game_id}: {e}")
                time.sleep(1) 
            
            #BoxScorePlayerTrackV3
            try:
                player_tracking = BoxScorePlayerTrackV3(game_id=game_id).get_data_frames()[0]
                player_tracking['GAME_ID'] = game_id
                playertracking_data.append(player_tracking)
            except Exception as e:
                print(f"Failed BoxScorePlayerTrackV3 for {game_id}: {e}")
                time.sleep(1)
            
            #BoxScoreAdvancedV3
            try:
                advanced_boxscore = BoxScoreAdvancedV3(game_id=game_id).get_data_frames()[0]
                advanced_boxscore['GAME_ID'] = game_id
                advanced_data.append(advanced_boxscore)
            except Exception as e:
                print(f"Failed BoxScoreAdvancedV3 for {game_id}: {e}")
                time.sleep(1)
            
            # rate limiting
            time.sleep(0.6)
        
        except Exception as e:
            print(f"Unexpected error for game {game_id}: {e}")
            time.sleep(10) 
    
    # Concat dataframes only if data exists
    shotchart_df = pd.concat(shotchart_data, ignore_index=True) if shotchart_data else pd.DataFrame()
    playertracking_df = pd.concat(playertracking_data, ignore_index=True) if playertracking_data else pd.DataFrame()
    advanced_df = pd.concat(advanced_data, ignore_index=True) if advanced_data else pd.DataFrame()
    play_by_play_df = pd.concat(play_by_play_data, ignore_index=True) if play_by_play_data else pd.DataFrame()
    
    return shotchart_df, playertracking_df, advanced_df,play_by_play_df

for season in seasons:
    game_ids = get_game_ids(season = season)


    shotchart_df, playertracking_df,advanced_box_df,play_by_play_df = get_data(game_ids)
    
    #names
    shotchart_file = f"{season}_shotchart_data.csv"
    playertracking_file = f"{season}_playertracking_data.csv"
    advanced_box_file = f"{season}_advanced_box_data.csv"
    play_by_play_file = f'{season}_play_by_play.csv'
    
    shotchart_df.to_csv(shotchart_file, index=False)
    playertracking_df.to_csv(playertracking_file, index=False)
    advanced_box_df.to_csv(advanced_box_file,index=False)
    play_by_play_df.to_csv(play_by_play_file, index=False)
    


This takes a while NBA api rate limiting is brutal.

In [6]:

for season in seasons:
    game_ids = get_game_ids(season = season)


    shotchart_df, playertracking_df,advanced_box_df,play_by_play_df = get_data(game_ids)
    
    #names
    shotchart_file = f"{season}_shotchart_data.csv"
    playertracking_file = f"{season}_playertracking_data.csv"
    advanced_box_file = f"{season}_advanced_box_data.csv"
    play_by_play_file = f'{season}_play_by_play.csv'
    
    shotchart_df.to_csv(shotchart_file, index=False)
    playertracking_df.to_csv(playertracking_file, index=False)
    advanced_box_df.to_csv(advanced_box_file,index=False)
    play_by_play_df.to_csv(play_by_play_file, index=False)
    

Failed PlayByPlayV2 for 0021500192: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out.
Failed box traditional for 0021500241: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
Failed PlayByPlayV2 for 0021500242: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
Failed PlayByPlayV2 for 0021700079: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out.
Failed BoxScoreAdvancedV3 for 0021700120: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
Failed PlayByPlayV2 for 0021700319: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
Failed box traditional for 0021700370: HTTPSConnectionPool(host='stats.nba.com', port=443): Max retries exceeded with url: /stats/boxscoretraditionalv3?EndPeriod=0&EndRange=0&GameID=0021700370&RangeType=0&StartPeriod=0&StartRange=0 (Caused by NewConnectionError('<urllib3.connection.HTTPSC

KeyboardInterrupt: 

## To fix
Failed PlayByPlayV2 for 0022100301: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out.
Failed PlayByPlayV2 for 0022100506: Expecting value: line 1 column 1 (char 0)
Failed PlayByPlayV2 for 0022200707: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)

Failed PlayByPlayV2 for 0021500192: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out.
Failed box traditional for 0021500241: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
Failed PlayByPlayV2 for 0021500242: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
Failed PlayByPlayV2 for 0021700079: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out.
Failed BoxScoreAdvancedV3 for 0021700120: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
Failed PlayByPlayV2 for 0021700319: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
Failed box traditional for 0021700370: HTTPSConnectionPool(host='stats.nba.com', port=443): Max retries exceeded with url: /stats/boxscoretraditionalv3?EndPeriod=0&EndRange=0&GameID=0021700370&RangeType=0&StartPeriod=0&StartRange=0 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f6f23658b50>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))
Failed PlayByPlayV2 for 0021700370: HTTPSConnectionPool(host='stats.nba.com', port=443): Max retries exceeded with url: /stats/playbyplayv3?EndPeriod=0&GameID=0021700370&StartPeriod=0 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f6f23671790>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))
Failed ShotChartDetail for 0021700370: HTTPSConnectionPool(host='stats.nba.com', port=443): Max retries exceeded with url: /stats/shotchartdetail?AheadBehind=&ClutchTime=&ContextFilter=&ContextMeasure=FGA&DateFrom=&DateTo=&EndPeriod=&EndRange=&GameID=0021700370&GameSegment=&LastNGames=0&LeagueID=00&Location=&Month=0&OpponentTeamID=0&Outcome=&Period=0&PlayerID=0&PlayerPosition=&PointDiff=&Position=&RangeType=&RookieYear=&Season=&SeasonSegment=&SeasonType=Regular+Season&StartPeriod=&StartRange=&TeamID=0&VsConference=&VsDivision= (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f6f2365a210>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))
Failed BoxScorePlayerTrackV3 for 0021700370: HTTPSConnectionPool(host='stats.nba.com', port=443): Max retries exceeded with url: /stats/boxscoreplayertrackv3?GameID=0021700370 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f6f236721d0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))
Failed BoxScorePlayerTrackV3 for 0021700558: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
Failed ShotChartDetail for 0021700916: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
Failed PlayByPlayV2 for 0021800264: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out.
Failed ShotChartDetail for 0021900588: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
Failed PlayByPlayV2 for 0021900799: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out.
### Get playType data

In [7]:
league_id = '00'  # NBA
season_type = 'Regular Season'
per_mode = 'PerGame'
player_or_team = 'P'
play_types = ['Cut', 'Handoff', 'Isolation', 'Misc', 'OffScreen', 'Postup', 'PRBallHandler', 'PRRollman', 'OffRebound', 'Spotup', 'Transition']
type_groupings = ['offensive', 'defensive']

for season in seasons:
    synergy_data = [] 
    
    for type_grouping in type_groupings:
        for play_type in play_types:
            try:
                # Fetch the synergy data
                synergy = SynergyPlayTypes(
                    league_id=league_id,
                    per_mode_simple=per_mode,
                    player_or_team_abbreviation=player_or_team,
                    season_type_all_star=season_type,
                    season=season,
                    play_type_nullable=play_type,
                    type_grouping_nullable=type_grouping
                ).get_data_frames()[0]
                
                synergy_data.append(synergy)
                time.sleep(0.6)  # Delay for rate limiting

            except Exception as e:
                print(f"Failed Synergy for season: {season}, play type: {play_type}, type grouping: {type_grouping}: {e}")
                time.sleep(1)  # Delay before retrying on error

    
    synergy_df = pd.concat(synergy_data, ignore_index=True) if synergy_data else pd.DataFrame()

    # Save
    play_type_file = f'{season}_play_type.csv'
    synergy_df.to_csv(play_type_file, index=False)




        

  synergy_df = pd.concat(synergy_data, ignore_index=True) if synergy_data else pd.DataFrame()
  synergy_df = pd.concat(synergy_data, ignore_index=True) if synergy_data else pd.DataFrame()
  synergy_df = pd.concat(synergy_data, ignore_index=True) if synergy_data else pd.DataFrame()
  synergy_df = pd.concat(synergy_data, ignore_index=True) if synergy_data else pd.DataFrame()
  synergy_df = pd.concat(synergy_data, ignore_index=True) if synergy_data else pd.DataFrame()
  synergy_df = pd.concat(synergy_data, ignore_index=True) if synergy_data else pd.DataFrame()
  synergy_df = pd.concat(synergy_data, ignore_index=True) if synergy_data else pd.DataFrame()
  synergy_df = pd.concat(synergy_data, ignore_index=True) if synergy_data else pd.DataFrame()
  synergy_df = pd.concat(synergy_data, ignore_index=True) if synergy_data else pd.DataFrame()
  synergy_df = pd.concat(synergy_data, ignore_index=True) if synergy_data else pd.DataFrame()
  synergy_df = pd.concat(synergy_data, ignore_index=True) if

In [8]:
box = BoxScoreTraditionalV3(season='2013-14').get_data_frames()[0]

TypeError: BoxScoreTraditionalV3.__init__() got an unexpected keyword argument 'season'

In [11]:
asdf = "Failed PlayByPlayV2 for 0022100301: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out.Failed PlayByPlayV2 for 0022100506: Expecting value: line 1 column 1 (char 0)Failed PlayByPlayV2 for 0022200707: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)Failed PlayByPlayV2 for 0021500192: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out.Failed box traditional for 0021500241: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)Failed PlayByPlayV2 for 0021500242: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)Failed PlayByPlayV2 for 0021700079: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out.Failed BoxScoreAdvancedV3 for 0021700120: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)Failed PlayByPlayV2 for 0021700319: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)Failed box traditional for 0021700370: HTTPSConnectionPool(host='stats.nba.com', port=443): Max retries exceeded with url: /stats/boxscoretraditionalv3?EndPeriod=0&EndRange=0&GameID=0021700370&RangeType=0&StartPeriod=0&StartRange=0 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f6f23658b50>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))Failed PlayByPlayV2 for 0021700370: HTTPSConnectionPool(host='stats.nba.com', port=443): Max retries exceeded with url: /stats/playbyplayv3?EndPeriod=0&GameID=0021700370&StartPeriod=0 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f6f23671790>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))Failed ShotChartDetail for 0021700370: HTTPSConnectionPool(host='stats.nba.com', port=443): Max retries exceeded with url: /stats/shotchartdetail?AheadBehind=&ClutchTime=&ContextFilter=&ContextMeasure=FGA&DateFrom=&DateTo=&EndPeriod=&EndRange=&GameID=0021700370&GameSegment=&LastNGames=0&LeagueID=00&Location=&Month=0&OpponentTeamID=0&Outcome=&Period=0&PlayerID=0&PlayerPosition=&PointDiff=&Position=&RangeType=&RookieYear=&Season=&SeasonSegment=&SeasonType=Regular+Season&StartPeriod=&StartRange=&TeamID=0&VsConference=&VsDivision= (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f6f2365a210>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))Failed BoxScorePlayerTrackV3 for 0021700370: HTTPSConnectionPool(host='stats.nba.com', port=443): Max retries exceeded with url: /stats/boxscoreplayertrackv3?GameID=0021700370 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f6f236721d0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))Failed BoxScorePlayerTrackV3 for 0021700558: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)Failed ShotChartDetail for 0021700916: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)Failed PlayByPlayV2 for 0021800264: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out.Failed ShotChartDetail for 0021900588: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)Failed PlayByPlayV2 for 0021900799: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out."

In [12]:
import re

def parse_failed_ids(asdf):
    failed_ids = {
        "BoxScoreTraditional": [],
        "PlayByPlay": [],
        "ShotChartDetail": [],
        "BoxScorePlayerTrack": [],
        "BoxScoreAdvanced": []
    }
    
    # Define regex patterns for each category
    patterns = {
        "BoxScoreTraditional": r"Failed box traditional for (\d+):",
        "PlayByPlay": r"Failed PlayByPlayV2 for (\d+):",
        "ShotChartDetail": r"Failed ShotChartDetail for (\d+):",
        "BoxScorePlayerTrack": r"Failed BoxScorePlayerTrackV3 for (\d+):",
        "BoxScoreAdvanced": r"Failed BoxScoreAdvancedV3 for (\d+):"
    }
    
    for category, pattern in patterns.items():
        matches = re.findall(pattern, asdf)
        failed_ids[category].extend(matches)
    
    return failed_ids


failed_ids = parse_failed_ids(asdf)
print(failed_ids)


{'BoxScoreTraditional': ['0021500241', '0021700370'], 'PlayByPlay': ['0022100301', '0022100506', '0022200707', '0021500192', '0021500242', '0021700079', '0021700319', '0021700370', '0021800264', '0021900799'], 'ShotChartDetail': ['0021700370', '0021700916', '0021900588'], 'BoxScorePlayerTrack': ['0021700370', '0021700558'], 'BoxScoreAdvanced': ['0021700120']}


In [14]:
#GPT

import pandas as pd
import os
from time import sleep

# Define the fetch functions for each category
fetch_functions = {
    "BoxScoreTraditional": BoxScoreTraditionalV3,
    "PlayByPlay": playbyplayv3.PlayByPlayV3,
    "ShotChartDetail": ShotChartDetail,
    "BoxScorePlayerTrack": BoxScorePlayerTrackV3,
    "BoxScoreAdvanced": BoxScoreAdvancedV3
}

# Determine the season from the game ID
def get_season_from_game_id(game_id):
    year_prefix = game_id[:4]  # Extract the first four characters (e.g., "0022" for 2022)
    return f"{int(year_prefix) + 2000}"  # Adjust to the format (e.g., "2022")

# Function to fetch data with retries
def fetch_data_with_retries(fetch_function, game_id, **kwargs):
    for attempt in range(3):  # Number of retries
        try:
            data = fetch_function(game_id=game_id, **kwargs).get_data_frames()[0]
            return data
        except Exception as e:
            print(f"Attempt {attempt + 1} failed for game {game_id}: {e}")
            sleep(1)  # Delay between attempts
    print(f"Failed after 3 attempts for game {game_id}")
    return pd.DataFrame()

# Update the data files with missing data
def update_data_files(failed_ids):
    for category, game_ids in failed_ids.items():
        file_name = {
            "BoxScoreTraditional": "boxscoretraditional_data.csv",
            "PlayByPlay": "play_by_play.csv",
            "ShotChartDetail": "shotchart_data.csv",
            "BoxScorePlayerTrack": "boxscoreplayertrack_data.csv",
            "BoxScoreAdvanced": "advanced_box_data.csv"
        }.get(category, None)

        if not file_name:
            print(f"Unknown category: {category}")
            continue

        # Determine the season based on the game IDs
        first_game_id = game_ids[0]  # Use the first game ID to determine the season
        season = get_season_from_game_id(first_game_id)
        file_path = f"{season}_{file_name}"

        # Load existing data if the file exists
        existing_df = pd.read_csv(file_path) if os.path.exists(file_path) else pd.DataFrame()

        for game_id in game_ids:
            print(f"Fetching data for {category} game ID {game_id}")
            if category == "BoxScoreTraditional":
                new_data = fetch_data_with_retries(fetch_functions[category], game_id)
            elif category == "PlayByPlay":
                new_data = fetch_data_with_retries(fetch_functions[category], game_id)
            elif category == "ShotChartDetail":
                new_data = fetch_data_with_retries(fetch_functions[category], game_id, team_id=0, player_id=0, context_measure_simple='FGA')
            elif category == "BoxScorePlayerTrack":
                new_data = fetch_data_with_retries(fetch_functions[category], game_id)
            elif category == "BoxScoreAdvanced":
                new_data = fetch_data_with_retries(fetch_functions[category], game_id)
            else:
                print(f"Unknown category: {category}")
                continue

            if not new_data.empty:
                # Append new data to the existing DataFrame
                existing_df = pd.concat([existing_df, new_data], ignore_index=True)
                print(f"Appended data for game ID {game_id}")
            else:
                print(f"No data fetched for game ID {game_id}")

            # Optional: Avoid hitting rate limits
            sleep(1)

        # Save the updated DataFrame back to CSV
        existing_df.to_csv(file_path, index=False)
        print(f"Updated file saved: {file_path}")

update_data_files(failed_ids)


Fetching data for BoxScoreTraditional game ID 0021500241
Appended data for game ID 0021500241
Fetching data for BoxScoreTraditional game ID 0021700370
Appended data for game ID 0021700370
Updated file saved: 2021_boxscoretraditional_data.csv
Fetching data for PlayByPlay game ID 0022100301
Appended data for game ID 0022100301
Fetching data for PlayByPlay game ID 0022100506
Appended data for game ID 0022100506
Fetching data for PlayByPlay game ID 0022200707
Appended data for game ID 0022200707
Fetching data for PlayByPlay game ID 0021500192
Appended data for game ID 0021500192
Fetching data for PlayByPlay game ID 0021500242
Appended data for game ID 0021500242
Fetching data for PlayByPlay game ID 0021700079
Appended data for game ID 0021700079
Fetching data for PlayByPlay game ID 0021700319
Appended data for game ID 0021700319
Fetching data for PlayByPlay game ID 0021700370
Appended data for game ID 0021700370
Fetching data for PlayByPlay game ID 0021800264
Appended data for game ID 0021