In [3]:
from nba_api.stats.endpoints import leaguegamefinder
import pandas as pd
from nba_api.stats.endpoints import playbyplayv3
from nba_api.stats.endpoints import BoxScoreTraditionalV3
import time
import random
from requests.exceptions import ReadTimeout, ConnectionError, RequestException, Timeout
from json.decoder import JSONDecodeError
import re

def is_int_convertible(x):
    try:
        int(x)
        return True
    except (ValueError, TypeError):
        return False

def get_minutes_from_v3_clock(clock_str):
    if not isinstance(clock_str, str): return 0

    # Check for PTxxM format
    match = re.search(r'PT(\d+)M', clock_str)
    if match:
        return int(match.group(1))

    # Check for MM:SS format
    if ':' in clock_str:
        return int(clock_str.split(':')[0])

    return 0

# It will keep trying until it succeeds or you manually stop it.
def robust_fetch(endpoint_class, **kwargs):
    max_retries = 20  # High number, effectively infinite for short hiccups
    base_delay = 2    # Start waiting 2 seconds

    for attempt in range(max_retries):
        try:
            # Attempt the API call
            endpoint = endpoint_class(**kwargs)
            # Force a data access to ensure the request actually finished
            return endpoint.get_data_frames()[0]

        except (ReadTimeout, ConnectionError, Timeout, RequestException, JSONDecodeError) as e:
            # If any network/server error occurs:
            print(f"  [!] API Error on attempt {attempt+1}: {e}")
            print(f"  [i] Retrying in {base_delay} seconds...")

            time.sleep(base_delay)

            # Exponential Backoff: Wait longer next time (2s -> 4s -> 8s...)
            # Cap at 60 seconds to avoid waiting forever
            base_delay = min(base_delay * 2, 60)

        except Exception as e:
            # Catch unexpected Python errors (like syntax inside the API lib)
            print(f"  [!!!] Critical Unknown Error: {e}")
            break # Break so you don't infinite loop on a code bug

    return None # Return None if we completely failed after 20 tries

In [4]:
##.............data structures to be used.........##

#list of dictionary to append scraped data progressively
df_list=[] #every dict in this list represents a game

df= pd.DataFrame(columns=['GAME_ID',
                          'TEAM1',
                          'TEAM2', 'DATE', 'PF', 'CLUTCH', 'HIGHLIGHTS', 'PERFORMANCE'])

In [5]:
##.............Get data on Regular season games.........##
gamefinder = leaguegamefinder.LeagueGameFinder(season_nullable='2023-24',
                                              league_id_nullable='00',
                                              season_type_nullable='Regular Season')
games = gamefinder.get_data_frames()[0]
game_ids = games['GAME_ID'].unique().tolist() # get a list of distinct game_ids

In [None]:
print(f"Starting ingestion for {len(game_ids)} games...")

for i, game in enumerate(game_ids[962:], start=962):
    print(f"Processing Game {i + 1}/{len(game_ids)}: ID {game}")  # Progress indicator

    game_dict = {}

    try:
        game_dict['GAME_ID'] = game
        game_dict['TEAM1'] = str(games[games["GAME_ID"] == game].iloc[0]["TEAM_ABBREVIATION"])
        game_dict['TEAM2'] = str(games[games["GAME_ID"] == game].iloc[1]["TEAM_ABBREVIATION"])
        game_dict['DATE'] = str(games[games["GAME_ID"] == game].iloc[0]["GAME_DATE"])
        game_dict['PF'] = games[games["GAME_ID"] == game].iloc[0]["PF"] + games[games["GAME_ID"] == game].iloc[1]["PF"]

        #Fetch PlayByPlay
        pbp = robust_fetch(playbyplayv3.PlayByPlayV3, game_id=game)

        if pbp is None:
            print(f"  [x] Failed to fetch PBP for {game}. Skipping.")
            continue

        # --- A. Clutch Calculation ---
        period = pbp['period'].unique().max()
        if period == 5:
            game_dict['CLUTCH'] = 0.95
        elif period >= 6:
            game_dict['CLUTCH'] = 1.0
        else:
            #           Filter for 4th quarter +
            pbp_valid = pbp[pbp['period'] >= 4].copy()

            # Filter for last 5 minutes using helper (We apply the function to the 'clock' column)
            pbp_valid['MINUTES_REMAINING'] = pbp_valid['clock'].apply(get_minutes_from_v3_clock)
            pbp_valid = pbp_valid[pbp_valid['MINUTES_REMAINING'] < 5]

            # Calculate Margin Manually (Home - Away), Ensure we treat by forward fill if needed
            pbp_valid['scoreHome'] = pd.to_numeric(pbp_valid['scoreHome'], errors='coerce')
            pbp_valid['scoreAway'] = pd.to_numeric(pbp_valid['scoreAway'], errors='coerce')

            pbp_valid[['scoreHome', 'scoreAway']] = pbp_valid[['scoreHome', 'scoreAway']].ffill()
            pbp_valid = pbp_valid.dropna(subset=['scoreHome', 'scoreAway'])

            pbp_valid['margin'] = pbp_valid['scoreHome'] - pbp_valid['scoreAway']

            # Count possessions within +/- 5 points
            clutch_mask = (pbp_valid['margin'] <= 5) & (pbp_valid['margin'] >= -5)
            clutch_count = clutch_mask.sum()

            game_dict['CLUTCH'] = clutch_count

        # --- B. Performance Calculation ---
        stats_df = robust_fetch(BoxScoreTraditionalV3, game_id=game)

        if stats_df is None:
            print(f"  [x] Failed to fetch BoxScore for {game}. Skipping.")
            continue

        stats_df['GAME_SCORE'] = (
                stats_df['points'] + 0.4 * stats_df['fieldGoalsMade'] - 0.7 * stats_df['fieldGoalsAttempted'] -
                0.4 * (stats_df['freeThrowsAttempted'] - stats_df['freeThrowsMade']) + 0.7 * stats_df[
                    'reboundsOffensive'] +
                0.3 * stats_df['reboundsDefensive'] + stats_df['steals'] + 0.7 * stats_df['assists'] +
                0.7 * stats_df['blocks'] - 0.4 * stats_df['foulsPersonal'] - stats_df['turnovers']
        )

        performance_sum = 0.0
        for gs in stats_df['GAME_SCORE']:
            if gs >= 60.0:
                performance_sum += 0.95
            elif gs >= 50.0:
                performance_sum += 0.9
            elif gs >= 40.0:
                performance_sum += 0.8
            elif gs >= 30.0:
                performance_sum += 0.7

        game_dict['PERFORMANCE'] = performance_sum

        # --- C. Highlights Calculation ---
        pbp['description'] = pbp['description'].astype(str).str.upper()

        # Dunks: look for 'DUNK' in the single description column
        # Filter out 'MISS' to be safe, though usually 'Dunk' implies made in text
        if 'shotResult' in pbp.columns:
            dunks = pbp[
                (pbp['description'].str.contains("DUNK", case=False)) &
                (pbp['shotResult'] == 'Made')
                ].shape[0]
        else:
            # Fallback if specific column is missing
            dunks = pbp[
                (pbp['description'].str.contains("DUNK", case=False)) &
                (~pbp['description'].str.contains("MISS", case=False))
                ].shape[0]

        # Fix for threes: ensure we grab values safely
        threes = int(games[games["GAME_ID"] == game].iloc[0]["FG3M"] + games[games["GAME_ID"] == game].iloc[1]["FG3M"])

        blocks = int(games[games["GAME_ID"] == game].iloc[0]["BLK"] + games[games["GAME_ID"] == game].iloc[1]["BLK"])

        #calculate highlights/minutes
        total_highlights = dunks + (0.75 * threes) + blocks
        highlights_density = total_highlights / 48

        game_dict['HIGHLIGHTS'] = highlights_density

        df_list.append(game_dict)

        # Standard sleep
        time.sleep(random.uniform(0.5, 1.0))

    except Exception as e:
        print(f"  [!!!] Critical Error processing game {game}: {e}")
        # We continue to the next game even if this one crashes logic
        continue


Starting ingestion for 1230 games...
Processing Game 963/1230: ID 0022300273


In [25]:
len(df_list)

962

In [9]:
df_list

[{'GAME_ID': '0022301192',
  'TEAM1': 'PHI',
  'TEAM2': 'BKN',
  'DATE': '2024-04-14',
  'PF': 30,
  'CLUTCH': 0,
  'PERFORMANCE': 0.0,
  'HIGHLIGHTS': 0.7135416666666666},
 {'GAME_ID': '0022301197',
  'TEAM1': 'SAS',
  'TEAM2': 'DET',
  'DATE': '2024-04-14',
  'PF': 26,
  'CLUTCH': 0,
  'PERFORMANCE': 0.0,
  'HIGHLIGHTS': 0.6927083333333334},
 {'GAME_ID': '0022301190',
  'TEAM1': 'NYK',
  'TEAM2': 'CHI',
  'DATE': '2024-04-14',
  'PF': 38,
  'CLUTCH': 0.95,
  'PERFORMANCE': 0.0,
  'HIGHLIGHTS': 0.625},
 {'GAME_ID': '0022301194',
  'TEAM1': 'PHX',
  'TEAM2': 'MIN',
  'DATE': '2024-04-14',
  'PF': 44,
  'CLUTCH': 0,
  'PERFORMANCE': 0.7,
  'HIGHLIGHTS': 0.7239583333333334},
 {'GAME_ID': '0022301199',
  'TEAM1': 'LAC',
  'TEAM2': 'HOU',
  'DATE': '2024-04-14',
  'PF': 27,
  'CLUTCH': 27,
  'PERFORMANCE': 0.0,
  'HIGHLIGHTS': 1.0677083333333333},
 {'GAME_ID': '0022301186',
  'TEAM1': 'WAS',
  'TEAM2': 'BOS',
  'DATE': '2024-04-14',
  'PF': 29,
  'CLUTCH': 0,
  'PERFORMANCE': 0.7,
  'HIGHL