In [1]:
import pandas as pd
import os

In [None]:
import re
import time
from nba_api.stats.static import players, teams
from nba_api.stats.endpoints import playergamelog
from nba_api.stats.library.http import NBAStatsHTTP

from requests.exceptions import ReadTimeout, ConnectionError, RequestException
from nba_api.stats.library.parameters import SeasonTypeAllStar

custom_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Referer': 'https://www.nba.com/',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
}
# --- Config ---
OUTPUT_DIR = "./data"
SEASON = "2024-25"
WAIT_TIME = 0.7  # polite delay between requests (in seconds)
RESUME_FROM = "Precious Achiuwa"  # e.g., "Jaden Ivey" or None to start from the beginning

# --- Helpers ---
def sanitize_filename(name):
    return re.sub(r"[^\w\-]", "_", name)

def fetch_player_gamelogs(player_id, player_name, season):
    try:
        gamelog = playergamelog.PlayerGameLog(player_id=player_id, season=season)
        df = gamelog.get_data_frames()[0]
        df['Player_Name'] = player_name
        return df
    except Exception as e:
        print(f"Error fetching logs for {player_name}: {e}")
        return pd.DataFrame()

def get_weird_named_players():
    """
    Return only those NBA players whose full_name contains:
      - an apostrophe (O'Neal, D'Angelo)
      - a hyphen (Jean-François)
      - a dot (Jr., Sr.)
      - or a suffix Jr, Sr, II, III, IV, etc.
    """
    all_players = players.get_active_players()
    pattern = re.compile(
        r"[\'\-\.]"                # any apostrophe, hyphen, or dot
        r"|(?:\s(?:Jr|Sr|II|III|IV|V|VI))$"  # OR ends with space+Jr/Sr/II/…
        , re.IGNORECASE
    )
    return [p for p in all_players if pattern.search(p["full_name"])]

def sort_players_by_last_name(player_list):
    return sorted(player_list, key=lambda p: p['full_name'].split()[-1].lower())

def sort_playoff_players_by_last_name(player_list):
    return sorted(player_list, key=lambda name: name.split()[-1].lower())


def read_active_players_to_csv():
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    all_players = players.get_active_players()
    # all_players = get_weird_named_players()
    sorted_players = sort_players_by_last_name(all_players)

    resume_reached = RESUME_FROM is None

    for idx, player in enumerate(sorted_players):
        full_name = player['full_name']
        resume_last = RESUME_FROM.split()[-1].lower()
        current_last = full_name.split()[-1].lower()

        if not resume_reached:
            if current_last <= resume_last:
                print(f"[{idx}] Skipping {full_name}")
                continue
            else:
                resume_reached = True

        print(f"[{idx}] Fetching: {full_name}")
        df = fetch_player_gamelogs(player['id'], full_name, SEASON)

        if not df.empty:
            filename = os.path.join(OUTPUT_DIR, f"{sanitize_filename(full_name)}.csv")
            df.to_csv(filename, index=False)
            print(f"  → Saved {len(df)} rows to {filename}")
        # else:
            # print(f"  → No games for {full_name}")
           
        time.sleep(WAIT_TIME)

    print("Done.")



def append_playoffs_players():
    all_players = players.get_active_players()
    all_players = get_weird_named_players()
    player_lookup = {p['full_name']: p['id'] for p in all_players}

    existing_files = [f for f in os.listdir(OUTPUT_DIR) if f.endswith('.csv')]
    existing_names = [f.replace('.csv', '').replace('_', ' ') for f in existing_files]
    sorted_names = sort_playoff_players_by_last_name(existing_names)

    resume_reached = RESUME_FROM is None
    if RESUME_FROM:
        resume_last = RESUME_FROM.split()[-1].lower()

    for idx, full_name in enumerate(sorted_names):
        current_last = full_name.split()[-1].lower()

        if not resume_reached:
            if current_last <= resume_last:
                print(f"[{idx}] Skipping {full_name}")
                continue
            else:
                resume_reached = True

        if full_name not in player_lookup:
            print(f"[{idx}] Skipping {full_name} (not found in player list)")
            continue

        player_id = player_lookup[full_name]
        safe_name = sanitize_filename(full_name)
        regular_season_path = os.path.join(OUTPUT_DIR, f"{safe_name}.csv")

        print(f"[{idx}] Fetching playoffs for: {full_name}")

        try:
            regular_df = pd.read_csv(regular_season_path)
        except Exception as e:
            print(f"  ↪ Error reading file for {full_name}: {e}")
            continue

        playoff_df = fetch_player_playoff_gamelogs(player_id, full_name, SEASON)

        if not playoff_df.empty:
            combined_df = pd.concat([regular_df, playoff_df], ignore_index=True)
            combined_df.to_csv(regular_season_path, index=False)
            print(f"  → Appended {len(playoff_df)} playoff games. New total: {len(combined_df)} rows.")
        else:
            print(f"  → No playoff games to append for {full_name}")

        time.sleep(WAIT_TIME)

    print("All Done!")
def fetch_player_playoff_gamelogs(player_id, player_name, season, max_retries=3):
    delay = 1
    for attempt in range(max_retries):
        try:
            NBAStatsHTTP._nba_headers = custom_headers  # monkey-patch headers
            gamelog = playergamelog.PlayerGameLog(
                player_id=player_id,
                season=season,
                season_type_all_star=SeasonTypeAllStar.playoffs
            )
            df = gamelog.get_data_frames()[0]
            df['Player_Name'] = player_name
            return df
        except (ReadTimeout, ConnectionError, RequestException) as e:
            print(f"  ↪ Attempt {attempt + 1} failed for {player_name}: {e}")
            time.sleep(delay)
            delay *= 2
        except Exception as e:
            print(f"  ↪ Unexpected error for {player_name}: {e}")
            break
    return pd.DataFrame()






[0] Skipping Precious Achiuwa
[1] Skipping Steven Adams (not found in player list)
[2] Skipping Bam Adebayo (not found in player list)
[3] Skipping Ochai Agbaji (not found in player list)
[4] Skipping Santi Aldama (not found in player list)
[5] Skipping Trey Alexander (not found in player list)
[6] Fetching playoffs for: Nickeil Alexander-Walker
  → Appended 10 playoff games. New total: 102 rows.
[7] Skipping Grayson Allen (not found in player list)
[8] Skipping Jarrett Allen (not found in player list)
[9] Skipping Jose Alvarado (not found in player list)
[10] Skipping Kyle Anderson (not found in player list)
[11] Skipping Giannis Antetokounmpo (not found in player list)
[12] Skipping Cole Anthony (not found in player list)
[13] Skipping OG Anunoby (not found in player list)
[14] Skipping Deni Avdija (not found in player list)
[15] Skipping Deandre Ayton (not found in player list)
[16] Skipping LaMelo Ball (not found in player list)
[17] Skipping Lonzo Ball (not found in player list)
[

Run the next cell to append playoff data onto existing data in the ./data folder.

Run this next cell to INITIALLY get your data. this is the most time consuming, so I would recommend running it once, then keeping a copy stored so you don't have to run it again.

In [None]:
read_active_players_to_csv()

In [12]:
# append_playoffs_players()

In [10]:
playoff_teams = [
    "Cleveland Cavaliers",
    "Boston Celtics",
    "New York Knicks",
    "Indiana Pacers",
    "Milwaukee Bucks",
    "Detroit Pistons",
    "Orlando Magic",
    "Miami Heat",
    "Oklahoma City Thunder",
    "Houston Rockets",
    "Los Angeles Lakers",
    "Denver Nuggets",
    "Los Angeles Clippers",
    "Minnesota Timberwolves",
    "Golden State Warriors",
    "Memphis Grizzlies"
]

In [4]:
team_code_map = {
    'ATL': 'ATL',
    'BKN': 'BRK',
    'BOS': 'BOS',
    'CHA': 'CHO',
    'CHI': 'CHI',
    'DAL': 'DAL',
    'DEN': 'DEN',
    'DET': 'DET',
    'GSW': 'GSW',
    'HOU': 'HOU',
    'IND': 'IND',
    'LAC': 'LAC',
    'LAL': 'LAL',
    'MEM': 'MEM',
    'MIA': 'MIA',
    'MIL': 'MIL',
    'MIN': 'MIN',
    'NOP': 'NOP',
    'NYK': 'NYK',
    'OKC': 'OKC',
    'ORL': 'ORL',
    'PHI': 'PHI',
    'PHX': 'PHO',
    'POR': 'POR',
    'SAC': 'SAC',
    'SAS': 'SAS',
    'TOR': 'TOR',
    'UTA': 'UTA',
    'WAS': 'WAS'
}




In [5]:
def transform_nba_game_log(path_to_csv):
    df = pd.read_csv(path_to_csv)

    # Parse date
    df['Date'] = pd.to_datetime(df['GAME_DATE']).dt.strftime('%Y-%m-%d')

    # Extract Team and Opponent
    def parse_matchup(matchup):
        team, at_vs, opp = matchup.split()
        return (
            team_code_map.get(team, team),
            '@' if at_vs == '@' else '',
            team_code_map.get(opp, opp)
        )

    parsed = df['MATCHUP'].apply(parse_matchup)
    df['Team'] = parsed.apply(lambda x: x[0])
    df[''] = parsed.apply(lambda x: x[1])
    df['Opp'] = parsed.apply(lambda x: x[2])

    # Result column
    df['Result'] = df['WL'] + ' ' + df['PTS'].astype(str) + '-' + (df.groupby('Player_Name')['PTS'].shift(-1).fillna(df['PTS'])).astype(str)

    # Games Started: unknown, use '*'
    df['GS'] = '*'

    # Minutes
    df['MP'] = df['MIN']

    # Shooting stats
    df['FG'] = df['FGM']
    df['FG%'] = df['FG_PCT']
    df['3P'] = df['FG3M']
    df['3PA'] = df['FG3A']
    df['3P%'] = df['FG3_PCT']

    df['2P'] = df['FGM'] - df['FG3M']
    df['2PA'] = df['FGA'] - df['FG3A']
    df['2P%'] = df['2P'] / df['2PA']
    df['eFG%'] = (df['FGM'] + 0.5 * df['FG3M']) / df['FGA']

    df['FT'] = df['FTM']
    df['FTA'] = df['FTA']
    df['FT%'] = df['FT_PCT']

    df['ORB'] = df['OREB']
    df['DRB'] = df['DREB']
    df['TRB'] = df['REB']

    df['AST'] = df['AST']
    df['STL'] = df['STL']
    df['BLK'] = df['BLK']
    df['TOV'] = df['TOV']
    df['PF'] = df['PF']
    df['PTS'] = df['PTS']

    df['+/-'] = df['PLUS_MINUS']
    
    
    num_cols = ['FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%',
        'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB',
        'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', '+/-'
    ]
    for col in num_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    # Select final column order
    output_cols = [
        'Date', 'Team', '', 'Opp', 'Result', 'GS', 'MP',
        'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%',
        'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB',
        'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', '+/-'
    ]
    df.sort_values(by='Date', inplace=True)
    df_out = df[output_cols]

    return df_out


In [6]:
filepaths_and_players = [(os.path.join('./data', f), f.replace('.csv', '')) 
                         for f in os.listdir('./data') if f.endswith('.csv')]

In [7]:
print(filepaths_and_players)

[('./data/Jalen_Suggs.csv', 'Jalen_Suggs'), ('./data/Sam_Merrill.csv', 'Sam_Merrill'), ('./data/Tobias_Harris.csv', 'Tobias_Harris'), ('./data/Markieff_Morris.csv', 'Markieff_Morris'), ('./data/Pete_Nance.csv', 'Pete_Nance'), ('./data/Jaylen_Clark.csv', 'Jaylen_Clark'), ('./data/Collin_Gillespie.csv', 'Collin_Gillespie'), ('./data/De_Aaron_Fox.csv', 'De_Aaron_Fox'), ('./data/PJ_Hall.csv', 'PJ_Hall'), ('./data/Kyle_Filipowski.csv', 'Kyle_Filipowski'), ('./data/Malevy_Leons.csv', 'Malevy_Leons'), ('./data/Ayo_Dosunmu.csv', 'Ayo_Dosunmu'), ('./data/Kobe_Brown.csv', 'Kobe_Brown'), ('./data/James_Johnson.csv', 'James_Johnson'), ('./data/Jeff_Dowtin_Jr_.csv', 'Jeff_Dowtin_Jr_'), ('./data/Oso_Ighodaro.csv', 'Oso_Ighodaro'), ('./data/Dominick_Barlow.csv', 'Dominick_Barlow'), ('./data/Julian_Champagnie.csv', 'Julian_Champagnie'), ('./data/Karlo_Matković.csv', 'Karlo_Matković'), ('./data/Yuki_Kawamura.csv', 'Yuki_Kawamura'), ('./data/Devin_Carter.csv', 'Devin_Carter'), ('./data/Shaedon_Sharpe.cs

In [11]:
for filepath, player in filepaths_and_players:
    
    df = transform_nba_game_log(filepath)
    team = df["Team"].unique()
    team_code = team[len(team)-1]
    output_dir = f"../backend/data/player_game_data/{team_code}"
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(f"{output_dir}/{player}.csv", index=False)

Index(['Date', 'Team', 'Unnamed: 2', 'Opp', 'Result', 'GS', 'MP', 'FG', 'FGA',
       'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA',
       'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
       '+/-'],
      dtype='object')
