In [34]:
!pip install nba_api



In [35]:
import pandas as pd
import numpy as np
from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.endpoints import playbyplayv3
import time

In [36]:
#Retrieving all game_ids from the past three seasons

seasons = ["2022-23", "2023-24", "2024-25"] # format from parameters doc

all_games = []

for season in seasons:
    gamefinder = leaguegamefinder.LeagueGameFinder(
        season_nullable=season,
        season_type_nullable="Regular Season",
        league_id_nullable="00"
    )
    
    game_id_df = gamefinder.get_data_frames()[0]
    game_id_df = game_id_df.drop_duplicates(subset=["GAME_ID"])
    game_id_df["SEASON"] = season
    
    all_games.append(game_id_df)
game_id_df = pd.concat(all_games, ignore_index=True)
game_id_df['GAME_ID'].duplicated().sum()
game_id_df.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,SEASON
0,22022,1610612744,GSW,Golden State Warriors,22201230,2023-04-09,GSW @ POR,W,240,157,58,96,0.604,27,49,0.551,14,16,0.875,9,49,58,47,13,6,15,18,56.0,2022-23
1,22022,1610612748,MIA,Miami Heat,22201219,2023-04-09,MIA vs. ORL,W,241,123,45,83,0.542,18,44,0.409,15,20,0.75,7,37,44,30,10,3,18,20,13.0,2022-23
2,22022,1610612764,WAS,Washington Wizards,22201222,2023-04-09,WAS vs. HOU,L,240,109,41,97,0.423,14,45,0.311,13,21,0.619,14,37,51,28,11,5,12,21,-5.0,2022-23
3,22022,1610612749,MIL,Milwaukee Bucks,22201221,2023-04-09,MIL @ TOR,L,240,105,38,82,0.463,12,34,0.353,17,21,0.81,7,31,38,28,5,2,14,13,-16.0,2022-23
4,22022,1610612762,UTA,Utah Jazz,22201228,2023-04-09,UTA @ LAL,L,240,117,51,101,0.505,10,35,0.286,5,6,0.833,7,38,45,31,4,4,11,16,-11.0,2022-23


In [98]:
from nba_api.stats.library.http import NBAStatsHTTP

# --- Make nba_api look like a browser (helps reduce slow/blocked responses) ---
NBAStatsHTTP.headers.update({
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
    "Referer": "https://stats.nba.com/",
    "Origin": "https://stats.nba.com",
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
})


NBAStatsHTTP.timeout = 120

season_map = dict(zip(game_id_df["GAME_ID"], game_id_df["SEASON"]))

pbp_dfs = []
fail_streak = 0

for gid in game_id_df["GAME_ID"]:
    retries = 0

    while retries < 6:
        try:
            df_pbp = playbyplayv3.PlayByPlayV3(game_id=gid).get_data_frames()[0]
            df_pbp["SEASON"] = season_map.get(gid)
            pbp_dfs.append(df_pbp)

            fail_streak = 0
            time.sleep(random.uniform(1.2, 2.2))  # human-like pacing
            break

        except Exception as e:
            retries += 1
            fail_streak += 1
            print(f"Error for {gid} attempt {retries}: {type(e).__name__}: {e}")

            # exponential backoff + jitter
            time.sleep((2 ** retries) + random.uniform(0, 1.5))

            # if lots of failures in a row, cool down hard
            if fail_streak >= 8:
                print("Failure streak detected — cooling down for 60–90s...")
                time.sleep(random.uniform(60, 90))
                fail_streak = 0

pbp_dfs = pd.concat(pbp_dfs, ignore_index=True) if pbp_dfs else pd.DataFrame()

Error for 0022200634 attempt 1: ReadTimeout: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
Error for 0022200634 attempt 2: ReadTimeout: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
Error for 0022200634 attempt 3: ReadTimeout: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
Error for 0022200634 attempt 4: ReadTimeout: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
Error for 0022200634 attempt 5: ReadTimeout: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
Error for 0022200634 attempt 6: ReadTimeout: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
Error for 0022200633 attempt 1: ReadTimeout: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
Error for 0022200633 attempt 2: ReadTimeout: HTTPSConnectionPool(host='stats

In [99]:
pbp_dfs.loc[pbp_dfs['gameId'] == '0022400606'].head()
print(pbp_dfs['gameId'].nunique())
print(game_id_df['GAME_ID'].nunique())

unfetched_games = list(set(game_id_df['GAME_ID'].unique()) - set(pbp_dfs['gameId'].unique()))
print(len(unfetched_games))

3630
3690
60


In [100]:
game_id_df["GAME_ID"]

0       0022201230
1       0022201219
2       0022201222
3       0022201221
4       0022201228
           ...    
3685    0022400067
3686    0022400071
3687    0022400064
3688    0022400062
3689    0022400061
Name: GAME_ID, Length: 3690, dtype: object

In [96]:
pbp_dfs.head()

Unnamed: 0,gameId,actionNumber,clock,period,teamId,teamTricode,personId,playerName,playerNameI,xLegacy,yLegacy,shotDistance,shotResult,isFieldGoal,scoreHome,scoreAway,pointsTotal,location,description,actionType,subType,videoAvailable,shotValue,actionId,SEASON


In [103]:
pbp_dfs.to_csv('playbyplay.csv', index=False, compression='zip')

In [105]:
game_id_df.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,SEASON
0,22022,1610612744,GSW,Golden State Warriors,22201230,2023-04-09,GSW @ POR,W,240,157,58,96,0.604,27,49,0.551,14,16,0.875,9,49,58,47,13,6,15,18,56.0,2022-23
1,22022,1610612748,MIA,Miami Heat,22201219,2023-04-09,MIA vs. ORL,W,241,123,45,83,0.542,18,44,0.409,15,20,0.75,7,37,44,30,10,3,18,20,13.0,2022-23
2,22022,1610612764,WAS,Washington Wizards,22201222,2023-04-09,WAS vs. HOU,L,240,109,41,97,0.423,14,45,0.311,13,21,0.619,14,37,51,28,11,5,12,21,-5.0,2022-23
3,22022,1610612749,MIL,Milwaukee Bucks,22201221,2023-04-09,MIL @ TOR,L,240,105,38,82,0.463,12,34,0.353,17,21,0.81,7,31,38,28,5,2,14,13,-16.0,2022-23
4,22022,1610612762,UTA,Utah Jazz,22201228,2023-04-09,UTA @ LAL,L,240,117,51,101,0.505,10,35,0.286,5,6,0.833,7,38,45,31,4,4,11,16,-11.0,2022-23


In [116]:
pbp_dfs.head(10)

Unnamed: 0,gameId,actionNumber,clock,period,teamId,teamTricode,personId,playerName,playerNameI,xLegacy,yLegacy,shotDistance,shotResult,isFieldGoal,scoreHome,scoreAway,pointsTotal,location,description,actionType,subType,videoAvailable,shotValue,actionId,SEASON
0,22201230,2,PT12M00.00S,1,0,,0,,,0,0,0,,0,0.0,0.0,0,,Start of 1st Period (3:41 PM EST),period,start,0,0,1,2022-23
1,22201230,4,PT12M00.00S,1,1610612757,POR,1628995,Knox II,K. Knox II,0,0,0,,0,,,0,h,Jump Ball Knox II vs. Looney: Tip to D. Green,Jump Ball,,1,0,2,2022-23
2,22201230,8,PT11M40.00S,1,1610612744,GSW,202691,Thompson,K. Thompson,94,231,25,Made,1,0.0,3.0,3,v,Thompson 25' 3PT Jump Shot (3 PTS) (Curry 1 AST),Made Shot,Jump Shot,1,3,3,2022-23
3,22201230,10,PT11M15.00S,1,1610612757,POR,1631101,Sharpe,S. Sharpe,-27,201,20,Made,1,2.0,3.0,5,h,Sharpe 20' Pullup Jump Shot (2 PTS) (Watford 1...,Made Shot,Pullup Jump shot,1,2,4,2022-23
4,22201230,11,PT11M08.00S,1,1610612744,GSW,202691,Thompson,K. Thompson,213,125,25,Made,1,2.0,6.0,8,v,Thompson 25' 3PT Jump Shot (6 PTS) (Curry 2 AST),Made Shot,Jump Shot,1,3,5,2022-23
5,22201230,96,PT10M50.00S,1,1610612757,POR,1630570,Watford,T. Watford,17,23,3,Missed,1,,,0,h,MISS Watford 3' Driving Layup,Missed Shot,Driving Layup Shot,1,2,6,2022-23
6,22201230,97,PT10M50.00S,1,1610612744,GSW,201939,Curry,S. Curry,0,0,0,,0,,,0,v,Curry REBOUND (Off:0 Def:1),Rebound,Unknown,1,0,7,2022-23
7,22201230,15,PT10M47.00S,1,1610612744,GSW,202691,Thompson,K. Thompson,-173,205,27,Missed,1,,,0,v,MISS Thompson 27' 3PT Running Jump Shot,Missed Shot,Running Jump Shot,1,3,8,2022-23
8,22201230,16,PT10M44.00S,1,1610612757,POR,1631101,Sharpe,S. Sharpe,0,0,0,,0,,,0,h,Sharpe REBOUND (Off:0 Def:1),Rebound,Unknown,1,0,9,2022-23
9,22201230,17,PT10M40.00S,1,1610612757,POR,1631101,Sharpe,S. Sharpe,9,298,30,Missed,1,,,0,h,MISS Sharpe 30' 3PT Running Pull-Up Jump Shot,Missed Shot,Running Pull-Up Jump Shot,1,3,10,2022-23


In [141]:
## Basic data processing

pbp_dfs.dtypes

# Issue - API returned unsorted rows
import re

def clock_transform(clock):
    clock = str(clock)
    m = re.search(r'PT(\d+)M([\d\.]+)S', clock)
    minutes = int(m.group(1))
    seconds = float(m.group(2))
    return minutes * 60 + seconds

pbp_dfs["period_seconds_remaining"] = pbp_dfs["clock"].apply(clock_transform)
pbp_dfs["period_len"] = np.where(pbp_dfs["period"] <= 4, 12 * 60, 5 * 60)
pbp_dfs["sec_elapsed_in_period"] = pbp_dfs["period_len"] - pbp_dfs["period_seconds_remaining"]
pbp_dfs["sec_elapsed_game"] = (
    np.where(pbp_dfs["period"] <= 4,
             (pbp_dfs["period"] - 1) * 12 * 60,
             4 * 12 * 60 + (pbp_dfs["period"] - 5) * 5 * 60)
    + pbp_dfs["sec_elapsed_in_period"]
)

# Removing unnecessary features
df = pbp_dfs.sort_values(['SEASON','gameId','period','sec_elapsed_game'], ascending=[True,True,True,True]).reset_index(drop=True)
df = df.drop(columns=['seconds_remaining', 'period_len','playerNameI','videoAvailable','actionNumber','description']) #actionId is more suitable
df.head()


Unnamed: 0,gameId,clock,period,teamId,teamTricode,personId,playerName,xLegacy,yLegacy,shotDistance,shotResult,isFieldGoal,scoreHome,scoreAway,pointsTotal,location,actionType,subType,shotValue,actionId,SEASON,period_seconds_remaining,sec_elapsed_in_period,sec_elapsed_game
0,22200001,PT12M00.00S,1,0,,0,,0,0,0,,0,0.0,0.0,0,,period,start,0,1,2022-23,720.0,0.0,0.0
1,22200001,PT12M00.00S,1,1610612738,BOS,201143,Horford,0,0,0,,0,,,0,h,Jump Ball,,0,2,2022-23,720.0,0.0,0.0
2,22200001,PT11M38.00S,1,1610612755,PHI,203954,Embiid,-118,50,13,Missed,1,,,0,v,Missed Shot,Turnaround Fadeaway shot,2,3,2022-23,698.0,22.0,22.0
3,22200001,PT11M38.00S,1,1610612738,BOS,1627759,Brown,0,0,0,,0,,,0,h,,,2,4,2022-23,698.0,22.0,22.0
4,22200001,PT11M35.00S,1,1610612755,PHI,200782,Tucker,0,0,0,,0,,,0,v,Rebound,Unknown,0,5,2022-23,695.0,25.0,25.0


In [142]:
# Checking all features that should have standardized values
print(f"All possible period values: {df['period'].unique()}")
# 5th and 6th periods refer to OT1 and OT2
print(f"All possible shotResult values: {df['shotResult'].unique()}")
print(f"All possible actionType values: {df['actionType'].unique()}")
print(f"All possible subType values: {df['subType'].unique()}")

All possible period values: [1 2 3 4 5 6]
All possible shotResult values: ['' 'Missed' 'Made']
All possible actionType values: ['period' 'Jump Ball' 'Missed Shot' '' 'Rebound' 'Turnover' 'Made Shot'
 'Foul' 'Free Throw' 'Instant Replay' 'Substitution' 'Timeout' 'Violation'
 'Ejection']
All possible subType values: ['start' '' 'Turnaround Fadeaway shot' 'Unknown'
 'Out of Bounds Lost Ball Turnover' 'Driving Floating Bank Jump Shot'
 'Driving Floating Jump Shot' 'Tip Layup Shot' 'Jump Shot' 'Shooting'
 'Free Throw 1 of 2' 'Free Throw 2 of 2' 'Driving Layup Shot'
 'Fadeaway Jump Shot' 'Running Layup Shot' 'Personal' 'Cutting Layup Shot'
 'Lost Ball' 'Offensive Charge' 'Offensive Foul Turnover'
 'Step Back Jump shot' 'Normal Rebound' 'Flagrant Type 1'
 'Overturn Ruling' 'Free Throw Flagrant 1 of 3'
 'Free Throw Flagrant 2 of 3' 'Free Throw Flagrant 3 of 3' 'Bad Pass'
 'Loose Ball' 'Regular' 'Traveling' 'Transition Take' 'Free Throw 1 of 1'
 'Driving Reverse Layup Shot' 'Running Jump Shot' 

In [143]:
df.to_csv('playbyplayfinal.csv', index=False, compression='zip')
game_id_df.to_csv('game_id.csv', index=False, compression='zip')