In [10]:
from nba_api.stats.endpoints import leaguegamefinder
import pandas as pd
from nba_api.stats.endpoints import playbyplayv3
from nba_api.stats.endpoints import BoxScoreTraditionalV2
import time
import random
from requests.exceptions import ReadTimeout, ConnectionError, RequestException, Timeout
from json.decoder import JSONDecodeError
import re

def is_int_convertible(x):
    """Return True when ``int(x)`` succeeds, False otherwise.

    Parameters
    ----------
    x : any
        Value to probe (e.g. a score-margin cell that may hold a numeric
        string, a non-numeric string, ``None`` or a float).

    Returns
    -------
    bool
        ``True`` if ``int(x)`` does not raise; ``False`` if it raises
        ``ValueError``, ``TypeError`` or ``OverflowError``.
    """
    try:
        int(x)
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric strings and NaN; TypeError: None and other
        # unsupported types; OverflowError: +/-infinity floats.
        return False
    return True

def get_minutes_from_v3_clock(clock_str):
    """Extract the whole-minutes component from a game-clock string.

    Two layouts are recognised:

    * a ``PT<minutes>M...`` duration string (e.g. ``"PT04M32.10S"`` -> 4)
    * a ``MM:SS`` string (e.g. ``"11:45"`` -> 11)

    Parameters
    ----------
    clock_str : any
        Clock value; anything that is not a ``str`` yields 0.

    Returns
    -------
    int
        The minutes component, or 0 when the value cannot be parsed
        (non-string input, no minutes part, or non-numeric minutes).
    """
    if not isinstance(clock_str, str):
        return 0

    # Check for PTxxM format
    match = re.search(r'PT(\d+)M', clock_str)
    if match:
        return int(match.group(1))

    # Check for MM:SS format
    if ':' in clock_str:
        minutes_part = clock_str.split(':')[0]
        try:
            return int(minutes_part)
        except ValueError:
            # e.g. ":30" or "ab:cd" -- treat like any other unparseable clock
            return 0

    return 0

# Retries transient network/server failures with exponential backoff and
# gives up (returning None) once the retry budget is exhausted.
def robust_fetch(endpoint_class, *, max_retries=20, base_delay=2, max_delay=60, **kwargs):
    """Call an endpoint class and return its first data frame, retrying on network errors.

    Parameters
    ----------
    endpoint_class : callable
        Endpoint class (or any callable) that is invoked as
        ``endpoint_class(**kwargs)`` and whose result exposes
        ``get_data_frames()``.
    max_retries : int, keyword-only
        Maximum number of attempts before giving up (default 20).
    base_delay : int or float, keyword-only
        Seconds to wait after the first failed attempt (default 2). The wait
        doubles after each subsequent failure.
    max_delay : int or float, keyword-only
        Upper bound for the wait between attempts (default 60).
    **kwargs
        Forwarded unchanged to ``endpoint_class``.

    Returns
    -------
    object or None
        ``endpoint.get_data_frames()[0]`` on success -- i.e. the data frame
        itself, NOT the endpoint object, so callers must not call
        ``get_data_frames()`` on the result again. ``None`` if every attempt
        failed or an unexpected (non-network) exception occurred.
    """
    delay = base_delay

    for attempt in range(max_retries):
        try:
            # Attempt the API call
            endpoint = endpoint_class(**kwargs)
            # Force a data access to ensure the request actually finished
            return endpoint.get_data_frames()[0]

        except (ReadTimeout, ConnectionError, Timeout, RequestException, JSONDecodeError) as e:
            # If any network/server error occurs:
            print(f"  [!] API Error on attempt {attempt+1}: {e}")

            # No point sleeping (up to max_delay seconds) when no retry follows.
            if attempt + 1 >= max_retries:
                break

            print(f"  [i] Retrying in {delay} seconds...")
            time.sleep(delay)

            # Exponential backoff: wait longer next time (2s -> 4s -> 8s...),
            # capped at max_delay so a long outage does not stall for hours.
            delay = min(delay * 2, max_delay)

        except Exception as e:
            # Unexpected errors (e.g. a KeyError raised while the library parses
            # the response) are not retried: repeating the call would most
            # likely fail the same way.
            print(f"  [!!!] Critical Unknown Error: {e}")
            break

    return None  # All attempts failed, or a non-retryable error occurred

In [11]:
##.............data structures to be used.........##

# Accumulator for scraped results: one dict per game, appended progressively.
df_list = []

# Empty frame that declares the per-game output columns.
df = pd.DataFrame(
    columns=[
        'GAME_ID', 'TEAM1', 'TEAM2', 'DATE',
        'PF', 'CLUTCH', 'HIGHLIGHTS', 'PERFORMANCE',
    ]
)

In [12]:
##.............Get data on Regular season games.........##

# Query the league game finder for the 2023-24 regular season (league id '00').
gamefinder = leaguegamefinder.LeagueGameFinder(
    season_nullable='2023-24',
    league_id_nullable='00',
    season_type_nullable='Regular Season',
)

# First result frame; later cells read two rows per GAME_ID from it (one per team).
games = gamefinder.get_data_frames()[0]

# Distinct game ids, in order of first appearance.
game_ids = games['GAME_ID'].drop_duplicates().tolist()

In [13]:
# Running maxima of each raw metric over all processed games
# (presumably consumed by a later normalisation step -- not shown in this cell).
maxClutch = 0
maxHighlight = 0
maxPerformance = 0

print(f"Starting ingestion for {len(game_ids)} games...")

for i, game in enumerate(game_ids):
    print(f"Processing Game {i+1}/{len(game_ids)}: ID {game}") # Progress indicator

    game_dict = {}

    try:
        # Look up this game's rows once instead of re-filtering `games` for
        # every field. The code below relies on two rows per game (one per team).
        game_rows = games[games["GAME_ID"] == game]
        if len(game_rows) < 2:
            print(f"  [x] Expected 2 team rows for {game}, found {len(game_rows)}. Skipping.")
            continue
        first_team = game_rows.iloc[0]
        second_team = game_rows.iloc[1]

        game_dict['GAME_ID'] = game
        game_dict['TEAM1'] = str(first_team["TEAM_ABBREVIATION"])
        game_dict['TEAM2'] = str(second_team["TEAM_ABBREVIATION"])
        game_dict['DATE'] = str(first_team["GAME_DATE"])
        game_dict['PF'] = first_team["PF"] + second_team["PF"]

        # Fetch PlayByPlay.
        # robust_fetch already returns the first data frame (or None), so the
        # result must be None-checked and must NOT be unwrapped again.
        pbp = robust_fetch(playbyplayv3.PlayByPlayV3, game_id=game)

        if pbp is None:
            print(f"  [x] Failed to fetch PBP for {game}. Skipping.")
            continue

        # --- A. Clutch Calculation ---
        # NOTE(review): column names below assume the PlayByPlayV3 schema
        # ('period', 'clock', 'scoreHome', 'scoreAway', 'description') --
        # confirm against the installed nba_api version.
        period = pbp['period'].max()
        if period == 5:
            # Single overtime
            game_dict['CLUTCH'] = 0.95
        elif period >= 6:
            # Two or more overtimes
            game_dict['CLUTCH'] = 1.0
        else:
            # Score margin per event; rows without a parseable score become NaN
            # and are ignored. Unlike the V2 'SCOREMARGIN' text column, a tied
            # score is a numeric 0 here and therefore counts as clutch.
            home_score = pd.to_numeric(pbp['scoreHome'], errors='coerce')
            away_score = pd.to_numeric(pbp['scoreAway'], errors='coerce')
            margin = home_score - away_score

            # Safety check: only trust rows whose clock is in a parseable layout,
            # because get_minutes_from_v3_clock returns 0 for anything else.
            clock_text = pbp['clock'].astype(str)
            clock_ok = clock_text.str.contains(r'PT\d+M|:', regex=True)
            minutes_left = pbp['clock'].apply(get_minutes_from_v3_clock)

            # Last 5 minutes of the 4th period with the margin within 5 points
            clutch_mask = (
                (pbp['period'] >= 4) &
                clock_ok &
                (minutes_left < 5) &
                margin.between(-5, 5)  # NaN margins evaluate to False
            )
            clutch_count = int(clutch_mask.sum())

            game_dict['CLUTCH'] = clutch_count
            if clutch_count > maxClutch:
                maxClutch = clutch_count

        # --- B. Performance Calculation (ROBUST) ---
        # Same contract as above: the return value is the first data frame
        # of the box-score endpoint, or None on failure.
        stats_df = robust_fetch(BoxScoreTraditionalV2, game_id=game)

        if stats_df is None:
            print(f"  [x] Failed to fetch BoxScore for {game}. Skipping.")
            continue

        # NOTE(review): the box-score player frame is documented with a 'TO'
        # turnover column while other endpoints use 'TOV'; accept either.
        tov_col = 'TOV' if 'TOV' in stats_df.columns else 'TO'

        # Per-player Game Score (Hollinger formula)
        stats_df['GAME_SCORE'] = (
            stats_df['PTS'] + 0.4 * stats_df['FGM'] - 0.7 * stats_df['FGA'] -
            0.4 * (stats_df['FTA'] - stats_df['FTM']) + 0.7 * stats_df['OREB'] +
            0.3 * stats_df['DREB'] + stats_df['STL'] + 0.7 * stats_df['AST'] +
            0.7 * stats_df['BLK'] - 0.4 * stats_df['PF'] - stats_df[tov_col]
        )

        # Tiered bonus per outstanding individual game score; NaN scores
        # (all comparisons False) contribute nothing.
        performance_sum = 0.0
        for gs in stats_df['GAME_SCORE']:
            if gs >= 70.0: performance_sum += 0.95
            elif gs >= 60.0: performance_sum += 0.9
            elif gs >= 50.0: performance_sum += 0.8
            elif gs >= 40.0: performance_sum += 0.7

        game_dict['PERFORMANCE'] = performance_sum
        if performance_sum > maxPerformance:
            maxPerformance = performance_sum

        # --- C. Highlights Calculation ---
        # Single description column in the V3 frame (see NOTE above).
        description = pbp['description'].fillna('').astype(str)

        # Made dunks: description mentions a dunk and is not a miss
        is_dunk = description.str.contains("Dunk", case=False)
        is_miss = description.str.contains("MISS", case=False)
        dunks = int((is_dunk & ~is_miss).sum())

        # Made threes come from the per-team game rows, not from play-by-play
        threes = int(first_team["FG3M"] + second_team["FG3M"])

        blocks = int(description.str.contains("BLOCK", case=False).sum())

        # calculate highlights/minutes (48 = minutes in a regulation game)
        total_highlights = dunks + (0.75 * threes) + blocks
        highlights_density = total_highlights / 48

        game_dict['HIGHLIGHTS'] = highlights_density
        if highlights_density > maxHighlight:
            maxHighlight = highlights_density

        df_list.append(game_dict)

        # Standard sleep between games to stay polite to the API
        time.sleep(random.uniform(0.5, 1.0))

    except Exception as e:
        print(f"  [!!!] Critical Error processing game {game}: {e}")
        # We continue to the next game even if this one crashes logic
        continue

Starting ingestion for 1230 games...
Processing Game 1/1230: ID 0022301192
  [!!!] Critical Unknown Error: 'resultSet'
  [x] Failed to fetch PBP for 0022301192. Skipping.
Processing Game 2/1230: ID 0022301197
  [!!!] Critical Unknown Error: 'resultSet'
  [x] Failed to fetch PBP for 0022301197. Skipping.
Processing Game 3/1230: ID 0022301190
  [!!!] Critical Unknown Error: 'resultSet'
  [x] Failed to fetch PBP for 0022301190. Skipping.
Processing Game 4/1230: ID 0022301194
  [!!!] Critical Unknown Error: 'resultSet'
  [x] Failed to fetch PBP for 0022301194. Skipping.
Processing Game 5/1230: ID 0022301199
  [!!!] Critical Unknown Error: 'resultSet'
  [x] Failed to fetch PBP for 0022301199. Skipping.
Processing Game 6/1230: ID 0022301186
  [!!!] Critical Unknown Error: 'resultSet'
  [x] Failed to fetch PBP for 0022301186. Skipping.
Processing Game 7/1230: ID 0022301196
  [!!!] Critical Unknown Error: 'resultSet'
  [x] Failed to fetch PBP for 0022301196. Skipping.
Processing Game 8/1230: I

KeyboardInterrupt: 