In [2]:
!pip install nba_api


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.3[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
import pandas as pd
import numpy as np
from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.endpoints import playbyplayv3
import time

In [4]:
# Retrieve one row per regular-season game for each of the past three seasons.

seasons = ["2022-23", "2023-24", "2024-25"]  # format from parameters doc

all_games = []

for season in seasons:
    gamefinder = leaguegamefinder.LeagueGameFinder(
        season_nullable=season,
        season_type_nullable="Regular Season",
        league_id_nullable="00",
    )

    # Keep a single row per GAME_ID (presumably the finder returns one row per
    # team per game -- confirm) and tag every row with the season it came from.
    season_games = (
        gamefinder.get_data_frames()[0]
        .drop_duplicates(subset=["GAME_ID"])
        .assign(SEASON=season)
    )
    all_games.append(season_games)

game_id_df = pd.concat(all_games, ignore_index=True)

# The duplicate check used to be a bare expression in the middle of the cell,
# so its result was never shown. Enforce it instead: the SEASON lookup built
# from this frame later relies on GAME_ID being unique.
n_duplicate_ids = int(game_id_df["GAME_ID"].duplicated().sum())
assert n_duplicate_ids == 0, f"{n_duplicate_ids} duplicated GAME_ID values after concatenation"

game_id_df.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,SEASON
0,22022,1610612739,CLE,Cleveland Cavaliers,22201218,2023-04-09,CLE vs. CHA,L,241,95,...,11,33,44,27,9,5,16,24,-11.0,2022-23
1,22022,1610612763,MEM,Memphis Grizzlies,22201226,2023-04-09,MEM @ OKC,L,241,100,...,11,32,43,25,8,4,12,16,-15.0,2022-23
2,22022,1610612743,DEN,Denver Nuggets,22201227,2023-04-09,DEN vs. SAC,W,240,109,...,15,36,51,25,11,2,16,15,14.0,2022-23
3,22022,1610612748,MIA,Miami Heat,22201219,2023-04-09,MIA vs. ORL,W,241,123,...,7,37,44,30,10,3,18,20,13.0,2022-23
4,22022,1610612752,NYK,New York Knicks,22201220,2023-04-09,NYK vs. IND,L,241,136,...,19,34,53,29,8,8,15,24,-5.0,2022-23


In [None]:
from nba_api.stats.library.http import NBAStatsHTTP
import random

# --- Make nba_api look like a browser (helps reduce slow/blocked responses) ---
NBAStatsHTTP.headers.update({
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
    "Referer": "https://stats.nba.com/",
    "Origin": "https://stats.nba.com",
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
})

# Tunables for the download loop.
REQUEST_TIMEOUT_S = 120   # seconds allowed per HTTP request
MAX_ATTEMPTS = 2          # attempts per game before it is recorded as failed
FAIL_STREAK_LIMIT = 8     # consecutive failures that trigger a long cool-down

# Kept from the original cell. NOTE(review): the endpoint constructor takes its
# own `timeout` argument, so this class attribute alone does not appear to
# change the request timeout -- the value is therefore also passed explicitly
# to PlayByPlayV3 below.
NBAStatsHTTP.timeout = REQUEST_TIMEOUT_S

# GAME_ID -> SEASON lookup used to tag every play-by-play row.
season_map = dict(zip(game_id_df["GAME_ID"], game_id_df["SEASON"]))

pbp_frames = []        # one play-by-play DataFrame per successfully fetched game
failed_game_ids = []   # games that exhausted every attempt (previously dropped silently)
fail_streak = 0
progress = 0
total_num = len(game_id_df)
for i, gid in enumerate(game_id_df["GAME_ID"]):
    retries = 0

    if i % 20 == 0:
        progress = i / total_num * 100
        print(f"Current Progress: {progress}")

    while retries < MAX_ATTEMPTS:
        try:
            df_pbp = playbyplayv3.PlayByPlayV3(
                game_id=gid,
                timeout=REQUEST_TIMEOUT_S,
            ).get_data_frames()[0]
            df_pbp["SEASON"] = season_map.get(gid)
            pbp_frames.append(df_pbp)

            fail_streak = 0
            time.sleep(random.uniform(1.2, 2.2))  # human-like pacing
            break

        except Exception as e:
            # Best-effort download: log the error and retry instead of aborting the run.
            retries += 1
            fail_streak += 1
            print(f"Error for {gid} attempt {retries}: {type(e).__name__}: {e}")

            # exponential backoff + jitter
            time.sleep((2 ** retries) + random.uniform(0, 1.5))

            # if lots of failures in a row, cool down hard
            if fail_streak >= FAIL_STREAK_LIMIT:
                print("Failure streak detected — cooling down for 10–20s...")
                time.sleep(random.uniform(10, 20))
                fail_streak = 0
    else:
        # The while loop ended without `break`: every attempt for this game failed.
        failed_game_ids.append(gid)

print(f"Fetched {len(pbp_frames)} games; {len(failed_game_ids)} failed after {MAX_ATTEMPTS} attempts")

# Downstream cells expect `pbp_dfs` to be a single DataFrame (empty if nothing was fetched).
pbp_dfs = pd.concat(pbp_frames, ignore_index=True) if pbp_frames else pd.DataFrame()

In [10]:
# Coverage check: distinct games fetched vs. distinct games requested.
print(pbp_dfs['gameId'].nunique())
print(game_id_df['GAME_ID'].nunique())

# Game IDs that were requested but have no play-by-play rows. Sorted so the
# list order is reproducible (set difference order is arbitrary).
unfetched_games = sorted(set(game_id_df['GAME_ID'].unique()) - set(pbp_dfs['gameId'].unique()))
print(len(unfetched_games))

3463
3690
227


In [11]:
# Show the GAME_ID column on its own to inspect the ID values and their dtype.
game_id_df.loc[:, "GAME_ID"]

0       0022201218
1       0022201226
2       0022201227
3       0022201219
4       0022201220
           ...    
3685    0022400066
3686    0022400067
3687    0022400064
3688    0022400061
3689    0022400062
Name: GAME_ID, Length: 3690, dtype: str

In [12]:
# Preview the first five play-by-play rows.
pbp_dfs.iloc[:5]

Unnamed: 0,gameId,actionNumber,clock,period,teamId,teamTricode,personId,playerName,playerNameI,xLegacy,...,scoreAway,pointsTotal,location,description,actionType,subType,videoAvailable,shotValue,actionId,SEASON
0,22201218,2,PT12M00.00S,1,0,,0,,,0,...,0.0,0,,Start of 1st Period (1:11 PM EST),period,start,0,0,1,2022-23
1,22201218,4,PT12M00.00S,1,1610612739,CLE,1628386,Allen,J. Allen,0,...,,0,h,Jump Ball Allen vs. Williams: Tip to Maledon,Jump Ball,,1,0,2,2022-23
2,22201218,7,PT11M39.00S,1,1610612766,CHA,1631121,McGowens,B. McGowens,233,...,,0,v,MISS McGowens 3PT Jump Shot,Missed Shot,Jump Shot,1,3,3,2022-23
3,22201218,8,PT11M36.00S,1,1610612739,CLE,1628386,Allen,J. Allen,0,...,,0,h,Allen REBOUND (Off:0 Def:1),Rebound,Unknown,1,0,4,2022-23
4,22201218,9,PT11M33.00S,1,1610612739,CLE,201937,Rubio,R. Rubio,0,...,,0,h,Rubio Out of Bounds - Bad Pass Turnover Turnov...,Turnover,Out of Bounds - Bad Pass Turnover,1,0,5,2022-23


In [13]:
# Persist the raw play-by-play frame as a zipped CSV.
# NOTE: compression='zip' produces a ZIP archive even though the file name ends
# in .csv, so readers need to pass compression='zip' explicitly when loading it.
pbp_dfs.to_csv(
    'playbyplaydata.csv',
    compression='zip',
    index=False,
)

In [15]:
# Also keep an uncompressed copy of the raw play-by-play frame (row index omitted).
pbp_dfs.to_csv(
    'playbyplaydataraw.csv',
    index=False,
)

In [14]:
# Preview the first five rows of the game lookup table.
game_id_df.iloc[:5]

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,SEASON
0,22022,1610612739,CLE,Cleveland Cavaliers,22201218,2023-04-09,CLE vs. CHA,L,241,95,...,11,33,44,27,9,5,16,24,-11.0,2022-23
1,22022,1610612763,MEM,Memphis Grizzlies,22201226,2023-04-09,MEM @ OKC,L,241,100,...,11,32,43,25,8,4,12,16,-15.0,2022-23
2,22022,1610612743,DEN,Denver Nuggets,22201227,2023-04-09,DEN vs. SAC,W,240,109,...,15,36,51,25,11,2,16,15,14.0,2022-23
3,22022,1610612748,MIA,Miami Heat,22201219,2023-04-09,MIA vs. ORL,W,241,123,...,7,37,44,30,10,3,18,20,13.0,2022-23
4,22022,1610612752,NYK,New York Knicks,22201220,2023-04-09,NYK vs. IND,L,241,136,...,19,34,53,29,8,8,15,24,-5.0,2022-23


In [16]:
# Save the game lookup table as plain CSV (row index omitted).
# NOTE: GAME_ID values are zero-padded strings; when reading this file back,
# request a string dtype for that column or the leading zeros will be lost.
game_id_df.to_csv(
    'game_id_raw.csv',
    index=False,
)

In [None]:
# Preview the first ten play-by-play rows.
pbp_dfs.iloc[:10]

In [None]:
# Report the distinct values of every column that should hold standardized values.
# NOTE(review): `df` is not defined in the cells visible here -- presumably the
# cleaned play-by-play frame produced elsewhere in the notebook; confirm.
period_values = df['period'].unique()
print(f"All possible period values: {period_values}")
# 5th and 6th periods refer to OT1 and OT2

shot_result_values = df['shotResult'].unique()
print(f"All possible shotResult values: {shot_result_values}")

action_type_values = df['actionType'].unique()
print(f"All possible actionType values: {action_type_values}")

sub_type_values = df['subType'].unique()
print(f"All possible subType values: {sub_type_values}")

In [143]:
# Write the final outputs as zipped CSVs (row index omitted).
# NOTE: compression='zip' produces ZIP archives even though the file names end
# in .csv, so readers need to pass compression='zip' explicitly when loading them.
df.to_csv(
    'playbyplayfinal.csv',
    compression='zip',
    index=False,
)
game_id_df.to_csv(
    'game_id.csv',
    compression='zip',
    index=False,
)