
Use Python and the NBA API to develop an advanced machine learning model that predicts a player's performance metrics for an upcoming game.


<h3 style="color:black;font-family:'Segoe UI Variable Display';font-size:20px;text-shadow:0.125px 0.25px 0.25px black;margin:0;font-weight:300;line-height:1;">Part 2.0</h3>
<h3 style="color:black;font-family:'Notes from Paris';font-size:20px;text-shadow:0.125px 0.25px 0.25px black;margin:0;line-height:1;">Part 2.0</h3>
<h3 style="color:black;font-family:'Juicy Advice Outline';font-size:20px;text-shadow:0.125px 0.25px 0.25px black;margin:0;">Part 2.0</h3>
<h3 style="color:black;font-family:'Mencken Std';font-size:20px;text-shadow:0.125px 0.25px 0.25px black;line-height:1;margin:0;">Part 2.0</h3>
<h3 style="color:black;font-family:'Digital-7';font-size:20px;text-shadow:0.125px 0.25px 0.25px black;line-height:1;margin:0;">Part 2.0</h3>
<h3 style="color:black;font-family:'Proxima Nova';font-size:20px;text-shadow:0.125px 0.25px 0.25px black;line-height:1;margin:0;">Part 2.0</h3>
<h3 style="color:black;font-family:'Barlow Condensed';font-size:20px;text-shadow:0.125px 0.25px 0.25px black;line-height:1;margin:0;">Part 2.0</h3>


<h3 style="color:black;font-family:'Lazy Crunch';font-size:20px;text-shadow:0.125px 0.25px 0.25px black;line-height:1;margin:0;">Part 2.0</h3>
<h3 style="color:black;font-family:'Abril Display';font-size:20px;text-shadow:0.125px 0.25px 0.25px black;margin:0;">Part 2.0</h3>



In [None]:
# main.py
import pandas as pd

from src.data_ingestion import (
    fetch_bulk_player_game_logs,
    get_player_game_logs,
    get_player_advanced_stats_parallel,
    get_opponent_stats      # ← add this line
)
from src.feature_engineering import feature_engineering_pipeline
from src.model_training import prepare_data, train_models, evaluate_model, DEFAULT_FEATURE_COLS
from src.utils import get_player_id, get_team_abbreviation_id_mapping
from src.aggregators import (
    add_opponent_position_allowed_pts,
    add_team_vs_opponent_allowed_pts
)


# --- Configuration ---
PLAYER_NAMES = [
    "Nikola Jokić", "Shai Gilgeous-Alexander", "Anthony Edwards"
]
season = '2024-25'

# --- Fetch & Process Data ---
print("=== Fetching Data ===")
bulk_logs_df = fetch_bulk_player_game_logs(season)
team_map = get_team_abbreviation_id_mapping()
opp_df = get_opponent_stats(season)

# Collect one processed frame per player and concatenate once after the loop;
# calling pd.concat inside the loop re-copies every previously added row.
player_frames = []

for name in PLAYER_NAMES:
    pid = get_player_id(name)
    if not pid:
        print(f"  Skipping {name}: player ID not found")
        continue

    logs = get_player_game_logs(pid, bulk_logs_df)
    if logs.empty:
        print(f"  Skipping {name}: no game logs")
        continue

    game_ids = logs['GAME_ID'].tolist()
    adv_stats = get_player_advanced_stats_parallel(pid, game_ids)
    if adv_stats.empty:
        print(f"  Skipping {name}: no advanced stats")
        continue

    logs['PLAYER_NAME'] = name
    merged = logs.merge(adv_stats, on=['GAME_ID', 'PLAYER_ID'], how='left')
    merged['PLAYER_ID'] = pid

    # Feature engineering
    processed = feature_engineering_pipeline(merged, team_map=team_map, opp_df=opp_df)
    player_frames.append(processed)
    print(f"  ✅ Processed data for {name}")

if not player_frames:
    # Fail here with a clear message instead of a KeyError inside the aggregators.
    raise RuntimeError("No player data could be processed; nothing to train on.")

all_player_data = pd.concat(player_frames, ignore_index=True)

# --- Apply Aggregator Features ---
print("=== Applying Aggregators ===")
all_player_data = add_opponent_position_allowed_pts(all_player_data)
all_player_data = add_team_vs_opponent_allowed_pts(all_player_data)

# --- Train & Evaluate Model ---
print("\n=== Training Model ===")
X_train_scaled, X_test_scaled, y_train, y_test, X_test_original = prepare_data(all_player_data)
best_model = train_models(X_train_scaled, y_train, X_test_scaled, y_test)
eval_df = evaluate_model(best_model, X_test_scaled, y_test, X_test_original)

# --- Next-Game Predictions ---
print("\n=== Predicting Next Game Points ===")
from src.prediction import predict_next_game
for name in PLAYER_NAMES:
    predict_next_game(name, DEFAULT_FEATURE_COLS, season)


=== Fetching Data ===
🔄 Loading bulk logs from cache: cache/bulk_logs_2024-25.parquet
  ✅ Processed data for Nikola Jokić
  ✅ Processed data for Shai Gilgeous-Alexander
  ✅ Processed data for Anthony Edwards
=== Applying Aggregators ===

=== Training Model ===

CatBoost Performance:
  RMSE: 8.27, MAE: 6.92, R2: 0.24

RandomForest Performance:
  RMSE: 8.41, MAE: 6.98, R2: 0.22

GradientBoosting Performance:
  RMSE: 8.71, MAE: 7.17, R2: 0.16

Ridge Performance:
  RMSE: 7.68, MAE: 6.25, R2: 0.35

BayesianRidge Performance:
  RMSE: 7.72, MAE: 6.40, R2: 0.34

Best model: Ridge with RMSE: 7.68

Evaluation on Test Data:
  RMSE: 7.68, MAE: 6.25, R2: 0.35

=== Predicting Next Game Points ===
No next game for Nikola Jokić


KeyError: "['PIE_AVG_LAST_5', 'USG_PCT_AVG_LAST_5', 'FG_PCT_AVG_LAST_5', 'OFF_RATING_AVG_LAST_5', 'PACE_PER40_AVG_LAST_5', 'OPPONENT_POSITION_ALLOWED_PTS', 'TEAM_VS_OPP_ALLOWED_PTS'] not in index"

In [None]:
import os
import time
import joblib
import warnings
from datetime import datetime, timedelta
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# NBA API
from nba_api.stats.endpoints import (
    playergamelog, boxscoreadvancedv2,
    leaguedashteamstats, scoreboardv2, commonplayerinfo,
    leaguegamefinder, boxscoretraditionalv2
)
from nba_api.stats.static import players, teams

# Scikit-Learn & Modeling
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import (
    RandomForestRegressor, GradientBoostingRegressor
)
from sklearn.linear_model import Ridge, BayesianRidge
from catboost import CatBoostRegressor

warnings.filterwarnings("ignore")



Developing a machine learning model to predict NBA player performance metrics like points involves several steps:

Data Collection: Gather historical and current season data using the NBA API, including advanced statistics such as Player Impact Estimate (PIE), Efficiency (EFF), Player Efficiency Rating (PER), trends, opponent data, and more.

Data Preprocessing: Clean and preprocess the data to prepare it for modeling.

Feature Engineering: Create features that capture the important aspects influencing player performance.

Model Training: Choose and train a suitable machine learning model.

Model Evaluation: Assess the model's performance and fine-tune as necessary.

Prediction: Use the trained model to predict future player performance.

-----




1. Utility functions

In [None]:

# =============================================================================
# 1. Utility & Helper Functions
# =============================================================================

def get_player_id(player_name):
    """Look up an NBA player ID by full name (case-insensitive exact match).

    Returns the ID of the first entry from ``players.get_players()`` whose
    ``full_name`` matches, or ``None`` when no entry matches.
    """
    for entry in players.get_players():
        if entry['full_name'].lower() == player_name.lower():
            return entry['id']
    return None

def get_team_abbreviation_id_mapping():
    """Build a ``{abbreviation: team_id}`` dict from ``teams.get_teams()``."""
    mapping = {}
    for entry in teams.get_teams():
        mapping[entry['abbreviation']] = entry['id']
    return mapping

def get_player_team_id(player_id):
    """Get the player's current team ID.

    Reads ``TEAM_ID`` from the first row of the first ``CommonPlayerInfo``
    data frame. Returns ``None`` when the lookup fails for any ordinary
    reason (request error, missing column, empty frame).
    """
    try:
        df = commonplayerinfo.CommonPlayerInfo(player_id=player_id).get_data_frames()[0]
        return int(df['TEAM_ID'].iloc[0])
    except Exception as exc:
        # Best-effort lookup: report the failure and let the caller handle None.
        # ``Exception`` (not a bare except) so Ctrl-C / SystemExit still propagate.
        print(f"Could not resolve team for player {player_id}: {exc}")
        return None

def get_team_name(team_id):
    """Return the full name of the team with the given ID.

    Falls back to ``'Unknown Team'`` when no entry of ``teams.get_teams()``
    has that ID.
    """
    for entry in teams.get_teams():
        if entry['id'] == team_id:
            return entry['full_name']
    return 'Unknown Team'


-----------------------------------

2. Data Fetching Functions

In [None]:
from nba_api.stats.endpoints import leaguegamelog

# Cache the bulk logs per season so you don’t hit the API over and over.
# Keyed by season string: a single shared slot would hand back the first
# season's logs to a later request for a different season.
_bulk_logs_cache = {}

def fetch_bulk_player_game_logs(season='2024-25'):
    """Return the league-wide player game log for ``season``.

    The first call for a season fetches the data through ``LeagueGameLog``
    (player mode, 60 s timeout), upper-cases the column names and stores the
    frame in ``_bulk_logs_cache``; later calls for the same season return the
    stored frame without another request.
    """
    cached = _bulk_logs_cache.get(season)
    if cached is None:
        print("Fetching bulk game logs…")
        lg = leaguegamelog.LeagueGameLog(season=season, player_or_team_abbreviation='P', timeout=60)
        cached = lg.get_data_frames()[0]
        cached.columns = cached.columns.str.upper()
        _bulk_logs_cache[season] = cached
    return cached

def get_player_game_logs(player_id, season='2024-25'):
    """Return one player's rows from the season-wide bulk game log.

    The full season is obtained via ``fetch_bulk_player_game_logs`` and
    filtered on ``PLAYER_ID``. The result is a copy of the matching rows;
    when nothing matches, a brand-new empty ``DataFrame`` (with no columns)
    is returned instead.
    """
    season_logs = fetch_bulk_player_game_logs(season)
    is_player = season_logs['PLAYER_ID'] == int(player_id)
    rows = season_logs[is_player].copy()
    # Column names are left exactly as they are in the bulk log.
    return pd.DataFrame() if rows.empty else rows


In [None]:

# =============================================================================
# 2. Data Fetching Functions
# =============================================================================

# def get_player_game_logs(player_id, season='2024-25'):
#     """Fetch player game logs for the given season."""
#     try:
#         gamelog = playergamelog.PlayerGameLog(player_id=player_id, season=season, timeout=60)
#         df = gamelog.get_data_frames()[0]
#         df.columns = df.columns.str.upper()
#         return df
#     except:
#         return pd.DataFrame()

def get_player_advanced_stats(player_id, season='2024-25'):
    """Fetch advanced stats for all games played by the player in the specified season.

    For every ``GAME_ID`` in the player's game log, the advanced box score is
    requested and the player's row(s) kept. Games whose request or parsing
    fails are reported and skipped.

    Returns a frame with the columns ``GAME_ID, PLAYER_ID, USG_PCT, PIE,
    TEAM_ID, OFF_RATING, PACE_PER40``, or an empty ``DataFrame`` when the
    player has no game log or no box score could be collected.
    """
    gamelog_df = get_player_game_logs(player_id, season)
    # An empty log comes back without columns, so indexing ['GAME_ID'] on it
    # would raise KeyError; bail out first.
    if gamelog_df.empty:
        return pd.DataFrame()

    wanted_cols = ['GAME_ID', 'PLAYER_ID', 'USG_PCT', 'PIE', 'TEAM_ID', 'OFF_RATING', 'PACE_PER40']
    pid = int(player_id)
    adv_stats = []
    for game_id in gamelog_df['GAME_ID']:
        try:
            boxscore = boxscoreadvancedv2.BoxScoreAdvancedV2(game_id=game_id, timeout=60)
            p_stats = boxscore.player_stats.get_data_frame()
            adv_stats.append(p_stats[p_stats['PLAYER_ID'] == pid])
        except Exception as exc:
            # Best-effort per game: one failed request must not abort the season.
            print(f"Skipping advanced stats for game {game_id}: {exc}")
        # Pause between requests (done whether or not the request succeeded).
        time.sleep(0.5)

    if not adv_stats:
        return pd.DataFrame()
    df = pd.concat(adv_stats, ignore_index=True)
    return df[wanted_cols]

def get_opponent_stats(season='2024-25'):
    """Fetch per-game team 'Defense' stats for the season.

    Returns the ``TEAM_ID``, ``DEF_RATING``, ``OPP_PTS_OFF_TOV`` and
    ``OPP_PTS_2ND_CHANCE`` columns of the first ``LeagueDashTeamStats`` frame.
    """
    endpoint = leaguedashteamstats.LeagueDashTeamStats(
        season=season,
        measure_type_detailed_defense='Defense',
        per_mode_detailed='PerGame',
        timeout=60,
    )
    team_stats = endpoint.get_data_frames()[0]
    keep = ['TEAM_ID', 'DEF_RATING', 'OPP_PTS_OFF_TOV', 'OPP_PTS_2ND_CHANCE']
    return team_stats[keep]


--------------------

3. Feature Engineering Functions

In [None]:
# =============================================================================
# 3. Feature Engineering
# =============================================================================

def compute_efficiency(df):
    """Add an ``EFF`` column to ``df`` in place and return ``df``.

    EFF = (PTS + REB + AST + STL + BLK) - missed field goals
          - missed free throws - TOV.
    """
    box_total = df['PTS'] + df['REB'] + df['AST'] + df['STL'] + df['BLK']
    missed_fg = df['FGA'] - df['FGM']
    missed_ft = df['FTA'] - df['FTM']
    df['EFF'] = box_total - missed_fg - missed_ft - df['TOV']
    return df

def compute_true_shooting_percentage(df):
    """Add a ``TS_PCT`` column to ``df`` in place and return ``df``.

    TS_PCT = PTS / (2 * (FGA + 0.44 * FTA)); rows whose denominator is
    exactly 0 get 0 instead of a division result.

    Computed with vectorised column arithmetic rather than a per-row
    ``apply``, so no temporary column is written to ``df`` and the result is
    always a float column.
    """
    denom = 2 * (df['FGA'] + 0.44 * df['FTA'])
    ratio = df['PTS'] / denom
    # Keep the ratio wherever the denominator is non-zero (a NaN denominator
    # compares as non-zero, so its NaN ratio is kept); use 0 otherwise.
    df['TS_PCT'] = ratio.where(denom != 0, 0.0)
    return df

def get_player_position(player_id, cache=None):
    """Return the player's primary position as 'G', 'F' or 'C' (or ``None``).

    Parameters:
        player_id: ID passed to ``CommonPlayerInfo``.
        cache: optional dict used as a lookup cache. When given, a stored
            value (including ``None``) is returned without a request and the
            new result is stored in it. When omitted, a throwaway dict is
            used, so nothing is remembered between calls.

    The position string is split on '-' and only the first part is
    considered. ``None`` is returned when that part is not Guard, Forward or
    Center, or when the lookup fails.
    """
    if cache is None:
        cache = {}
    if player_id in cache:
        return cache[player_id]

    pos = None
    try:
        info = commonplayerinfo.CommonPlayerInfo(player_id=player_id).get_data_frames()[0]
        raw_pos = info.get('POSITION', [''])[0]
        if isinstance(raw_pos, str):
            main_pos = raw_pos.split('-')[0].title()
            for label, code in (('Guard', 'G'), ('Forward', 'F'), ('Center', 'C')):
                if label in main_pos:
                    pos = code
                    break
    except Exception as exc:
        # Best-effort lookup: fall through with pos=None. ``Exception`` rather
        # than a bare except so KeyboardInterrupt / SystemExit still propagate.
        print(f"Could not resolve position for player {player_id}: {exc}")

    # Failed lookups are cached as None too, so they are not retried.
    cache[player_id] = pos
    return pos

In [None]:
def feature_engineering(player_df, adv_df, opp_df, team_map):
    """Build the per-game feature table for one player's game log.

    Parameters:
        player_df: game-log rows; must contain PLAYER_NAME, GAME_ID, PLAYER_ID,
            GAME_DATE, MATCHUP, MIN and the box-score columns used below.
            It is modified in place (EFF and TS_PCT are added to it).
        adv_df: advanced stats, merged on GAME_ID and PLAYER_ID.
        opp_df: opponent defensive stats keyed by TEAM_ID, or None/empty to
            skip the defensive-stat merge.
        team_map: dict mapping team abbreviation -> team ID.

    Returns:
        A new DataFrame sorted by PLAYER_NAME and GAME_DATE with rolling
        5-game averages/volatility, season average, HOME_GAME and REST_DAYS.
        All rolling values are computed from previous games only (shift(1)).
    """
    # 1) Basic computations
    player_df = compute_efficiency(player_df)
    player_df = compute_true_shooting_percentage(player_df)

    # 2) Merge advanced stats
    df = pd.merge(player_df, adv_df, on=['GAME_ID', 'PLAYER_ID'], how='left')
    # The opponent abbreviation is the last space-separated token of MATCHUP.
    df['OPPONENT_ABBREVIATION'] = df['MATCHUP'].str.split(' ').str[-1]
    df['OPPONENT_TEAM_ID'] = df['OPPONENT_ABBREVIATION'].map(team_map)

    # 3) Merge defensive stats
    if opp_df is not None and not opp_df.empty:
        # Key the opponent table on OPPONENT_TEAM_ID so the merge never brings
        # in the opponent's TEAM_ID. Otherwise, when both player_df and adv_df
        # carry TEAM_ID, the opponent's TEAM_ID lands next to TEAM_ID_x /
        # TEAM_ID_y and the rename below yields two columns named TEAM_ID.
        opp_keyed = opp_df.rename(columns={'TEAM_ID': 'OPPONENT_TEAM_ID'})
        df = pd.merge(df, opp_keyed, on='OPPONENT_TEAM_ID', how='left')
        for col in ['DEF_RATING', 'OPP_PTS_OFF_TOV', 'OPP_PTS_2ND_CHANCE']:
            df[col] = df[col].fillna(df[col].mean())

    # 4) Parse dates first, then sort: sorting unparsed date strings is only
    #    chronological for some string formats.
    df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE'], errors='coerce')
    df.sort_values(['PLAYER_NAME', 'GAME_DATE'], inplace=True)

    # 5) Convert minutes to float
    def parse_minutes(x):
        if isinstance(x, str) and ':' in x:
            mins, secs = x.split(':')
            return float(mins) + float(secs)/60
        return float(x) if pd.notna(x) else 0

    df['MIN'] = df['MIN'].apply(parse_minutes)
    # Zero attempts become NaN so the division yields NaN rather than inf.
    df['FG_PCT'] = df['FGM'] / df['FGA'].replace(0, np.nan)

    # 6) Compute rolling stats (previous games only, window of up to 5)
    rolling_cols = ['PIE', 'USG_PCT', 'PTS', 'REB', 'AST', 'EFF', 'TS_PCT', 'MIN', 'FG_PCT', 'OFF_RATING', 'PACE_PER40']
    for c in rolling_cols:
        df[f'{c}_AVG_LAST_5'] = (df.groupby('PLAYER_NAME')[c]
                                 .transform(lambda x: x.shift(1).rolling(window=5, min_periods=1).mean()))

    for c in ['PTS', 'USG_PCT', 'MIN']:
        df[f'{c}_VOL_LAST_5'] = (df.groupby('PLAYER_NAME')[c]
                                 .transform(lambda x: x.shift(1).rolling(window=5, min_periods=1).std()))

    # 7) Compute season average for PTS (expanding mean of previous games)
    df['PTS_SEASON_AVG'] = (df.groupby('PLAYER_NAME')['PTS']
                              .transform(lambda x: x.shift(1).expanding().mean()))

    # 8) Additional features
    df['HOME_GAME'] = df['MATCHUP'].apply(lambda x: 1 if 'vs.' in x else 0)
    df['REST_DAYS'] = df.groupby('PLAYER_NAME')['GAME_DATE'].diff().dt.days.fillna(0)

    # Rename and drop extraneous columns left by the player/advanced merge
    if 'TEAM_ID_x' in df.columns:
        df.rename(columns={'TEAM_ID_x': 'TEAM_ID'}, inplace=True)
    df.drop(columns=['TEAM_ID_y'], errors='ignore', inplace=True)

    # Forward/back fill to handle any missing rolling stats
    # (ffill()/bfill() replace the deprecated fillna(method=...)).
    df.ffill(inplace=True)
    df.bfill(inplace=True)

    return df

In [None]:

# =============================================================================
# 4. Aggregator Functions (Position & Team vs Opponent)
# =============================================================================

def add_player_position(df):
    """Write a 'POSITION' column (G, F, C) into ``df`` and drop unresolved rows.

    ``df`` gets the new column in place; the returned frame is ``df`` without
    the rows whose position could not be determined. One shared cache dict is
    used for all rows of this call.
    """
    position_cache = {}

    def _lookup(pid):
        return get_player_position(pid, position_cache)

    df['POSITION'] = df['PLAYER_ID'].apply(_lookup)
    return df.dropna(subset=['POSITION'])

def compute_position_allowed_pts(df):
    """Mean of PTS per (OPPONENT_TEAM_ID, POSITION) pair.

    Returns a frame with the two key columns plus
    'OPPONENT_POSITION_ALLOWED_PTS' holding the group mean.
    """
    keys = ['OPPONENT_TEAM_ID', 'POSITION']
    mean_pts = df.groupby(keys)['PTS'].mean()
    return mean_pts.reset_index(name='OPPONENT_POSITION_ALLOWED_PTS')

def add_opponent_position_allowed_pts(df):
    """Attach 'OPPONENT_POSITION_ALLOWED_PTS' to every row.

    Positions are added first (rows without a position are dropped), the
    per-opponent/position mean is computed from those rows, and the result is
    left-joined back on OPPONENT_TEAM_ID and POSITION.
    """
    with_positions = add_player_position(df)
    allowed = compute_position_allowed_pts(with_positions)
    return with_positions.merge(allowed, on=['OPPONENT_TEAM_ID', 'POSITION'], how='left')

def compute_team_vs_opponent_allowed_pts(df):
    """Mean of the PTS column per (TEAM_ID, OPPONENT_TEAM_ID) pair.

    Returns a frame with the two key columns plus 'TEAM_VS_OPP_ALLOWED_PTS'
    holding the mean PTS of the rows in each group.
    """
    keys = ['TEAM_ID', 'OPPONENT_TEAM_ID']
    mean_pts = df.groupby(keys)['PTS'].mean()
    mean_pts.name = 'TEAM_VS_OPP_ALLOWED_PTS'
    return mean_pts.reset_index()

def add_team_vs_opponent_allowed_pts(df):
    """Left-join 'TEAM_VS_OPP_ALLOWED_PTS' onto ``df`` by team and opponent."""
    team_vs_opp = compute_team_vs_opponent_allowed_pts(df)
    return df.merge(team_vs_opp, on=['TEAM_ID', 'OPPONENT_TEAM_ID'], how='left')


---------------------------------

4. Data Preparation Functions

In [None]:
# =============================================================================
# 5. Data Splitting & Preparation
# =============================================================================

# Feature columns fed to the models, in the order they are scaled.
DEFAULT_FEATURE_COLS = [
    'PIE_AVG_LAST_5', 'USG_PCT_AVG_LAST_5', 'EFF_AVG_LAST_5', 'TS_PCT_AVG_LAST_5',
    'DEF_RATING', 'OPP_PTS_OFF_TOV', 'OPP_PTS_2ND_CHANCE', 'HOME_GAME', 'REST_DAYS',
    'PTS_AVG_LAST_5', 'REB_AVG_LAST_5', 'AST_AVG_LAST_5', 'FG_PCT_AVG_LAST_5',
    'MIN_AVG_LAST_5', 'OFF_RATING_AVG_LAST_5', 'PACE_PER40_AVG_LAST_5', 'PTS_SEASON_AVG',
    'OPPONENT_POSITION_ALLOWED_PTS', 'TEAM_VS_OPP_ALLOWED_PTS',
    'PTS_VOL_LAST_5', 'USG_PCT_VOL_LAST_5', 'MIN_VOL_LAST_5',
    #'STARTERS_MISSING'  # newly added feature
]
# Same contents as DEFAULT_FEATURE_COLS, kept as a separate list object.
# Derived from it instead of repeating the literal so the two cannot drift.
feature_columns_list = list(DEFAULT_FEATURE_COLS)

In [None]:
def prepare_data(df, feature_cols=DEFAULT_FEATURE_COLS):
    """Split the feature table into scaled train/test sets for predicting PTS.

    Rows with a missing value in any of ``feature_cols`` are dropped. The
    last 20% of the remaining rows become the test set (no shuffling).
    When a GAME_DATE column is present the rows are first ordered by it, so
    the held-out rows are the most recent games; without that, rows
    concatenated player by player would put only the last players in the
    test set.

    Side effect: the fitted ``StandardScaler`` is saved to 'lib/scaler.pkl'
    (the 'lib' directory is created if needed).

    Returns:
        (X_train_scaled, X_test_scaled, y_train, y_test, X_test_original),
        where X_test_original is the unscaled test frame including the
        PLAYER_NAME column.
    """
    df = df.dropna(subset=feature_cols)
    if 'GAME_DATE' in df.columns:
        # Stable sort keeps the existing relative order of same-day games.
        df = df.sort_values('GAME_DATE', kind='mergesort')

    X = df[feature_cols].copy()
    X['PLAYER_NAME'] = df['PLAYER_NAME']  # Keep player_name for reference
    y = df['PTS']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    X_test_original = X_test.copy()  # Keep a copy of X_test before scaling

    # Fit the scaler on the training rows only; PLAYER_NAME is not a feature.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train.drop(columns=['PLAYER_NAME']))
    X_test_scaled = scaler.transform(X_test.drop(columns=['PLAYER_NAME']))

    os.makedirs('lib', exist_ok=True)
    joblib.dump(scaler, 'lib/scaler.pkl')
    return X_train_scaled, X_test_scaled, y_train, y_test, X_test_original


----------------------------

5. Model Training and Evaluation

In [None]:
# =============================================================================
# 6. Modeling & Evaluation
# =============================================================================
def train_and_evaluate_models(X_train, y_train, X_test, y_test):
    """Fit each candidate regressor and return the one with the lowest test RMSE.

    RMSE, MAE and R2 on the test set are printed for every candidate. On an
    RMSE tie the earlier candidate is kept.
    """
    # Lasso, ElasticNet, LassoLars and SGDRegressor were tried here before
    # and are currently left out.
    models = {
        'CatBoost': CatBoostRegressor(random_state=42, verbose=0),
        'RandomForest': RandomForestRegressor(random_state=42),
        'GradientBoosting': GradientBoostingRegressor(random_state=42),
        'Ridge': Ridge(),
        'BayesianRidge': BayesianRidge(),
    }

    def _score(y_true, y_pred):
        """Return (rmse, mae, r2) for one set of predictions."""
        return (
            np.sqrt(mean_squared_error(y_true, y_pred)),
            mean_absolute_error(y_true, y_pred),
            r2_score(y_true, y_pred),
        )

    best_model, best_rmse = None, float('inf')
    for name, estimator in models.items():
        estimator.fit(X_train, y_train)
        rmse, mae, r2 = _score(y_test, estimator.predict(X_test))
        print(f"\n{name} Performance:\n  RMSE: {rmse:.2f}, MAE: {mae:.2f}, R2: {r2:.2f}")
        if rmse < best_rmse:
            best_model, best_rmse = estimator, rmse

    print(f"\nBest model: {type(best_model).__name__} with RMSE: {best_rmse:.2f}")
    return best_model

def evaluate_model(model, X_test_scaled, y_test, X_test_original):
    """Score ``model`` on the test set and return a per-row evaluation frame.

    Prints RMSE, MAE and R2, then the first ten rows of the result. The
    returned frame is ``X_test_original`` (index reset) plus 'Actual_PTS',
    'Predicted_PTS' and 'Residual' (actual minus predicted).
    """
    preds = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, preds)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    print(f"\nEvaluation on Test Data:\n  RMSE: {rmse:.2f}, MAE: {mae:.2f}, R2: {r2:.2f}")

    # Assemble the evaluation table on a fresh 0..n-1 index.
    actual = y_test.reset_index(drop=True)
    eval_df = X_test_original.reset_index(drop=True).assign(
        Actual_PTS=actual,
        Predicted_PTS=preds,
    )
    eval_df['Residual'] = eval_df['Actual_PTS'] - eval_df['Predicted_PTS']

    print("\nSample Predictions:")
    print(eval_df[['PLAYER_NAME', 'Actual_PTS', 'Predicted_PTS', 'Residual']].head(10))
    return eval_df


--------------------------------

7. Model Prediction Functions

In [None]:
# =============================================================================
# 7. Next-Game Prediction Logic
# =============================================================================

def get_team_defensive_stats(team_id, season='2024-25'):
    """Return one team's per-game 'Defense' stats as a Series.

    The Series has the keys TEAM_ID, DEF_RATING, OPP_PTS_OFF_TOV and
    OPP_PTS_2ND_CHANCE, taken from the first row of the first
    ``LeagueDashTeamStats`` frame. If the lookup fails, a Series with the
    same keys and all-NaN values is returned instead.
    """
    stat_cols = ['TEAM_ID', 'DEF_RATING', 'OPP_PTS_OFF_TOV', 'OPP_PTS_2ND_CHANCE']
    try:
        df = leaguedashteamstats.LeagueDashTeamStats(
            team_id_nullable=team_id, season=season,
            measure_type_detailed_defense='Defense', per_mode_detailed='PerGame',
            timeout=60
        ).get_data_frames()[0]
        return df[stat_cols].iloc[0]
    except Exception as exc:
        # Best-effort: report and fall back to NaNs. ``Exception`` rather than
        # a bare except so KeyboardInterrupt / SystemExit still propagate.
        print(f"Could not fetch defensive stats for team {team_id}: {exc}")
        return pd.Series([np.nan] * len(stat_cols), index=stat_cols)


def get_next_game_info(player_team_id, max_days_ahead=14):
    """Find the team's next scheduled game, starting from today.

    Parameters:
        player_team_id: team ID to look for; ``None`` returns immediately.
        max_days_ahead: number of consecutive days (today included) whose
            scoreboards are checked. Defaults to 14.

    Returns:
        ``(game_date, opponent_team_id, home_game)`` for the first day with a
        game involving the team, where ``game_date`` is the ``datetime`` that
        was searched (it keeps the current time of day) and ``home_game`` is
        1 for a home game, 0 otherwise. ``(None, None, None)`` when no game
        is found.
    """
    # Without a team there is nothing to match; skip the scoreboard requests.
    if player_team_id is None:
        return None, None, None

    next_game_date = datetime.now()
    for _ in range(max_days_ahead):
        game_date_str = next_game_date.strftime('%Y-%m-%d')
        try:
            scoreboard = scoreboardv2.ScoreboardV2(game_date=game_date_str)
            games = scoreboard.game_header.get_data_frame()
            is_home = games['HOME_TEAM_ID'] == player_team_id
            is_away = games['VISITOR_TEAM_ID'] == player_team_id
            team_games = games[is_home | is_away]
            if not team_games.empty:
                next_game = team_games.iloc[0]
                home_game = 1 if next_game['HOME_TEAM_ID'] == player_team_id else 0
                opponent_team_id = next_game['VISITOR_TEAM_ID'] if home_game else next_game['HOME_TEAM_ID']
                return next_game_date, opponent_team_id, home_game
        except Exception as exc:
            # Best-effort per day: report and move on to the next date.
            # ``Exception`` so KeyboardInterrupt / SystemExit still propagate.
            print(f"Scoreboard lookup failed for {game_date_str}: {exc}")
        next_game_date += timedelta(days=1)

    return None, None, None

In [None]:
def prepare_features_for_prediction(player_id, player_name, season='2024-25', feature_cols=DEFAULT_FEATURE_COLS,
                                    df_agg_position=None, df_agg_team_vs_opp=None):
    """Build the feature vector for a player's next scheduled game.

    Parameters:
        player_id, player_name: the player to predict for.
        season: season string used for logs, advanced stats and team stats.
        feature_cols: columns, in order, that make up the feature vector.
        df_agg_position: frame with OPPONENT_TEAM_ID, POSITION and
            OPPONENT_POSITION_ALLOWED_PTS; when None that feature is not set.
        df_agg_team_vs_opp: frame with TEAM_ID, OPPONENT_TEAM_ID and
            TEAM_VS_OPP_ALLOWED_PTS; when None that feature is not set.

    Returns:
        ``(feature_vector, latest_data)`` where ``feature_vector`` has shape
        (1, len(feature_cols)) and ``latest_data`` is the underlying row, or
        ``(None, None)`` when logs, advanced stats, an upcoming game or any
        requested feature column is missing.
    """
    # 1) Pull player's logs and advanced stats.
    logs_df = get_player_game_logs(player_id, season)
    if logs_df.empty:
        print(f"No game logs for {player_name} in {season}.")
        return None, None
    logs_df['GAME_DATE'] = pd.to_datetime(logs_df['GAME_DATE'])
    logs_df.sort_values('GAME_DATE', ascending=False, inplace=True)
    logs_df['PLAYER_NAME'] = player_name

    # Team & next game
    p_team_id = get_player_team_id(player_id)
    next_game_date, opp_team_id, home_game = get_next_game_info(p_team_id)
    if not next_game_date:
        print(f"No upcoming game found for {player_name}.")
        return None, None

    # Opponent stats
    opp_stats = get_team_defensive_stats(opp_team_id, season)
    # Advanced stats
    adv_df = get_player_advanced_stats(player_id, season)
    # An empty result has no columns, so adv_df['GAME_ID'] below would raise.
    if adv_df.empty:
        print(f"No advanced stats for {player_name} in {season}.")
        return None, None

    # Build a small historical subset. Only rows with a valid date are kept,
    # and an explicit copy is passed on because feature_engineering adds
    # columns to its input in place.
    team_map = get_team_abbreviation_id_mapping()
    recent_logs = logs_df[logs_df['GAME_DATE'].notna()].copy()
    adv_df = adv_df[adv_df['GAME_ID'].isin(recent_logs['GAME_ID'])]
    final_df = feature_engineering(recent_logs, adv_df, None, team_map)

    if final_df.empty:
        return None, None

    # 2) Feature-engineer a 'latest_data' row (from last game).
    # 3) Insert upcoming game info: rest days, home/away, opponent stats.
    latest_data = final_df.iloc[-1].copy()
    latest_data['REST_DAYS'] = (next_game_date - latest_data['GAME_DATE']).days
    latest_data['HOME_GAME'] = home_game
    latest_data['GAME_DATE'] = next_game_date
    latest_data['OPPONENT_TEAM_ID'] = opp_team_id
    latest_data['DEF_RATING'] = opp_stats['DEF_RATING']
    latest_data['OPP_PTS_OFF_TOV'] = opp_stats['OPP_PTS_OFF_TOV']
    latest_data['OPP_PTS_2ND_CHANCE'] = opp_stats['OPP_PTS_2ND_CHANCE']

    # 4) Merge aggregator stats for position and team vs. opponent.
    # Position-based aggregator
    if df_agg_position is not None:
        pos = get_player_position(player_id)
        if not pos:  # fallback
            pos = 'G'
        same_pos = df_agg_position[df_agg_position['POSITION'] == pos]
        row_match = same_pos[same_pos['OPPONENT_TEAM_ID'] == opp_team_id]
        if not row_match.empty:
            latest_data['OPPONENT_POSITION_ALLOWED_PTS'] = row_match['OPPONENT_POSITION_ALLOWED_PTS'].iloc[0]
        else:
            # fallback to position average
            latest_data['OPPONENT_POSITION_ALLOWED_PTS'] = same_pos['OPPONENT_POSITION_ALLOWED_PTS'].mean()

    # Team vs Opp aggregator
    if df_agg_team_vs_opp is not None:
        row_match = df_agg_team_vs_opp[
            (df_agg_team_vs_opp['TEAM_ID'] == p_team_id) & (df_agg_team_vs_opp['OPPONENT_TEAM_ID'] == opp_team_id)
        ]
        if not row_match.empty:
            latest_data['TEAM_VS_OPP_ALLOWED_PTS'] = row_match['TEAM_VS_OPP_ALLOWED_PTS'].iloc[0]
        else:
            # fallback to the overall average
            latest_data['TEAM_VS_OPP_ALLOWED_PTS'] = df_agg_team_vs_opp['TEAM_VS_OPP_ALLOWED_PTS'].mean()

    # Build the final feature vector
    try:
        fv = latest_data[feature_cols].values.reshape(1, -1)
        return fv, latest_data
    except KeyError as e:
        print(f"Missing columns for {player_name}: {e}")
        return None, None



In [None]:
def predict_upcoming_points(player_name, season='2024-25', feature_cols=DEFAULT_FEATURE_COLS, 
                            df_agg_position=None, df_agg_team_vs_opp=None):
    """Predict and print a player's points for their next game.

    Loads the scaler from 'lib/scaler.pkl' and the model from
    'lib/player_points_model.pkl'. Returns the predicted value, or ``None``
    when the player name is unknown or no feature vector could be built.
    """
    player_id = get_player_id(player_name)
    if not player_id:
        print(f"Invalid player: {player_name}")
        return None

    feature_vector, latest_data = prepare_features_for_prediction(
        player_id,
        player_name,
        season,
        feature_cols=feature_cols,
        df_agg_position=df_agg_position,
        df_agg_team_vs_opp=df_agg_team_vs_opp,
    )
    if feature_vector is None:
        return None

    # Scale with the saved scaler, then predict with the saved model.
    scaler = joblib.load('lib/scaler.pkl')
    model = joblib.load('lib/player_points_model.pkl')
    pred = model.predict(scaler.transform(feature_vector))

    opp_name = get_team_name(latest_data['OPPONENT_TEAM_ID'])
    date_str = latest_data['GAME_DATE'].strftime('%Y-%m-%d')
    print(f"Predicted points for {player_name} on {date_str} vs {opp_name}: {pred[0]:.2f}")
    return pred[0]


-------

8. Feature Importance

-----

In [None]:
# =============================================================================
# 8. Example Usage 
# =============================================================================

player_names = [
    "LeBron James", "Stephen Curry", "Giannis Antetokounmpo", "Luka Dončić", 
    # "Kevin Durant", "Joel Embiid", "Victor Wembanyama", "Damian Lillard", "Anthony Davis", "Domantas Sabonis",
    "Jayson Tatum", "Nikola Jokić", "Shai Gilgeous-Alexander", "Karl-Anthony Towns",
    "Donovan Mitchell", "James Harden", "Anthony Edwards", "Jimmy Butler",
    # "Kyrie Irving", "De'Aaron Fox", #"Bronny James", "Zion Williamson", "Tyrese Maxey", 
    "Jalen Brunson", "Trae Young", "Pascal Siakam", "Jalen Green",
    # "Paolo Banchero" was misspelled "Bunchero", which made get_player_id
    # return None and the player drop out of the data without any message.
    "Darius Garland", "Jalen Williams", "Jaylen Brown",  "Paolo Banchero",
    "Norman Powell", "Alperen Şengün", "Ja Morant", "Jaren Jackson Jr.",
    ]

season = '2024-25'
opponent_stats = get_opponent_stats(season)
team_map = get_team_abbreviation_id_mapping()

# Collect one feature frame per player and concatenate once at the end.
player_frames = []
for p_name in player_names:
    p_id = get_player_id(p_name)
    if not p_id:
        print(f"Skipping {p_name}: player ID not found")
        continue

    # Check the game log before fetching advanced stats: the advanced-stats
    # fetch issues one request per game and cannot work without a log.
    p_gamelog = get_player_game_logs(p_id, season)
    if p_gamelog.empty:
        print(f"Skipping {p_name}: no game logs for {season}")
        continue
    adv_stats = get_player_advanced_stats(p_id, season)
    if adv_stats.empty:
        print(f"Skipping {p_name}: no advanced stats for {season}")
        continue

    p_gamelog['PLAYER_NAME'] = p_name
    merged_df = feature_engineering(p_gamelog, adv_stats, opponent_stats, team_map)
    player_frames.append(merged_df)

if not player_frames:
    # Fail here with a clear message instead of a KeyError inside the aggregators.
    raise RuntimeError("No player data could be collected; nothing to aggregate or train on.")

all_player_data = pd.concat(player_frames, ignore_index=True)

# 3) Position & Team aggregator
all_player_data = add_opponent_position_allowed_pts(all_player_data)
all_player_data = add_team_vs_opponent_allowed_pts(all_player_data)


In [None]:
# Split and scale the combined player data. The fifth return value is the
# unscaled test frame (it still has PLAYER_NAME), which evaluate_model needs.
# prepare_data also saves the fitted scaler to lib/scaler.pkl and creates the
# lib directory that the model is written to below.
X_train_scaled, X_test_scaled, y_train, y_test, X_test_original = prepare_data(all_player_data) 
# Fit every candidate model and keep the one with the lowest test RMSE.
best_model = train_and_evaluate_models(X_train_scaled, y_train, X_test_scaled, y_test)
# Persist the winner; predict_upcoming_points loads it from this path.
joblib.dump(best_model, 'lib/player_points_model.pkl')

# Build eval_df for residual analysis | after training:
eval_df = evaluate_model(best_model, X_test_scaled, y_test, X_test_original)


In [None]:
# 7) Predict upcoming game for each player
#    (requires the aggregator dataframes if we want position-based features;
#    all_player_data must already carry the POSITION column added by
#    add_opponent_position_allowed_pts)
df_agg_position = compute_position_allowed_pts(all_player_data)  # mean(PTS) per (OPPONENT_TEAM_ID, POSITION)
# mean(PTS) per (TEAM_ID, OPPONENT_TEAM_ID)
df_agg_team_opp = compute_team_vs_opponent_allowed_pts(all_player_data)
# Each call prints its own prediction (or the reason none was made); the
# return value is not used here.
for name in player_names:
    predict_upcoming_points(
        name, season, df_agg_position=df_agg_position, df_agg_team_vs_opp=df_agg_team_opp)

-------

-----

###### Evaluation Residuals



In [None]:

# Residual analysis: histogram (20 bins, with a KDE curve) of the 'Residual'
# column of eval_df, i.e. actual minus predicted points on the test set.
plt.figure(figsize=(8,5))
sns.histplot(eval_df['Residual'], kde=True, bins=20)
plt.title("Histogram of Residuals")
plt.show()


###### Starters Missing

###### Time Series Cross Validation



In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import numpy as np

def time_series_cv_evaluation(df, feature_cols, target_col='PTS', n_splits=5):
    """Evaluate a RandomForest with time-ordered cross-validation.

    Rows are sorted by GAME_DATE and split with ``TimeSeriesSplit``. For
    each fold a fresh scaler and a fresh RandomForestRegressor are fitted on
    the training part and scored on the validation part. Per-fold RMSE and
    R-squared are printed, followed by their averages.

    Returns the average RMSE over the folds.
    """
    ordered = df.sort_values(by='GAME_DATE').reset_index(drop=True)
    X_full = ordered[feature_cols].copy()
    y_full = ordered[target_col].values

    tscv = TimeSeriesSplit(n_splits=n_splits)
    rmse_scores, r2_scores = [], []

    for fold_number, (train_index, val_index) in enumerate(tscv.split(X_full), start=1):
        X_train = X_full.iloc[train_index]
        X_val = X_full.iloc[val_index]
        y_train = y_full[train_index]
        y_val = y_full[val_index]

        # The scaler is fitted on this fold's training rows only.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)

        # RandomForest is used as the example model; any regressor would do.
        model = RandomForestRegressor(random_state=42)
        model.fit(X_train_scaled, y_train)

        val_preds = model.predict(X_val_scaled)
        rmse = np.sqrt(mean_squared_error(y_val, val_preds))
        r2 = model.score(X_val_scaled, y_val)
        rmse_scores.append(rmse)
        r2_scores.append(r2)

        print(f"Fold {fold_number} RMSE = {rmse:.3f} R-Squared = {r2:.3f}")

    avg_rmse = np.mean(rmse_scores)
    avg_r2 = np.mean(r2_scores)
    print(f"\nAverage RMSE over {n_splits} folds: {avg_rmse:.3f}")
    print(f"\nAverage R-Squared over {n_splits} folds: {avg_r2:.3f}")
    return avg_rmse



In [None]:
# Run 5-fold time-series CV on the combined player data with the default
# feature set; per-fold and average scores are printed, and the returned
# average RMSE is shown as the cell result.
time_series_cv_evaluation(all_player_data, DEFAULT_FEATURE_COLS, target_col='PTS', n_splits=5)

----


###### Position and Role-Based Features:

Include data about the player's role (e.g., starter vs. bench), player position, and how the upcoming opponent typically defends that position.


In [None]:
# NOTE(review): `main` is imported under its generic name and will shadow any
# other `main` in this notebook session.
from lib.build_player_team_data import main
# Presumably builds (or loads) the merged player/team dataset for the season
# and uses `data_file` as its CSV location -- TODO confirm against the module.
df_player_full = main(season='2024-25', data_file="data/merged_player_team_dataset.csv")

------------------------------------



Hyperparameter Tuning and Bayesian Optimization:
Instead of a simple grid search, use advanced hyperparameter optimization methods (e.g., Bayesian optimization or Optuna) to find the best parameters for CatBoost, XGBoost, LightGBM, or neural networks.
Non-Linear Models and Neural Networks:
Consider deep learning approaches. A simple feed-forward neural network or LSTM/RNN if you structure your data as a time series could capture temporal dependencies more effectively.
Time-Series Aware Validation:

Ensure you use proper time-series cross-validation (e.g., TimeSeriesSplit) so the model isn't accidentally leaking future information.


Betting Lines or Market Data:
Market-based indicators (like the Vegas over/under for the game) can indirectly capture external knowledge about expected scoring environment.


Dimensionality Reduction and Feature Selection:

Feature Importance and Pruning:
Use model explainability tools (SHAP, feature_importances_) to identify less useful features and remove them.

-----

# Star Player Out

Handling Injuries / Roster Changes

If a key teammate is absent, a star player’s usage might spike. Consider adding a feature that tracks “number of typical starters missing” for a given game. That often impacts scoring opportunities.

Explainability

Tools like SHAP or feature importances from tree-based models can help you see which inputs drive the predictions. This can help you debug or refine features.



######  Pipeline Packaging

Once stable, you can wrap the entire pipeline in a script (or notebook) that runs daily and:
Pulls updated logs.
Retrains/updates model (if desired) or just re-scores with the existing model.
Outputs next-game predictions to a CSV or database.




--------------


# Injury Report

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

def scrape_espn_injuries(url="https://www.espn.com/nba/injuries", timeout=30):
    """Scrape the ESPN NBA injuries page into a DataFrame.

    Args:
        url: Page to fetch.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        DataFrame with columns
        [TEAM_NAME, PLAYER_NAME, POS, EST_RETURN, STATUS, COMMENT].
        The frame is empty (but keeps those columns) when the request fails,
        the server answers with a non-200 status, or no injury rows are found.
    """
    columns = ["TEAM_NAME", "PLAYER_NAME", "POS", "EST_RETURN", "STATUS", "COMMENT"]

    # 1) Add headers that mimic a real browser
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/107.0.0.0 Safari/537.36"
        )
    }

    # 2) Make the request. A timeout keeps a stalled connection from hanging
    #    the notebook, and network errors become the same "empty" result that
    #    a bad status code produces.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch {url}: {exc}")
        return pd.DataFrame(columns=columns)

    if response.status_code != 200:
        print(f"Failed to fetch {url}, status code: {response.status_code}")
        return pd.DataFrame(columns=columns)

    # 3) Parse HTML
    soup = BeautifulSoup(response.text, "html.parser")

    # 4) One <div class="ResponsiveTable Table__league-injuries"> per team.
    #    A CSS selector matches both classes regardless of their order or of
    #    extra classes; passing the two-class string to `class_` would only
    #    match that exact attribute value.
    team_tables = soup.select("div.ResponsiveTable.Table__league-injuries")

    all_rows = []  # one dict per injured player

    for table_div in team_tables:
        # Team name lives in <div class="Table__Title"><span class="injuries__teamName">
        title_div = table_div.find("div", class_="Table__Title")
        if not title_div:
            continue

        team_name_span = title_div.find("span", class_="injuries__teamName")
        if not team_name_span:
            continue
        team_name = team_name_span.get_text(strip=True)

        # Injury rows live in <table class="Table"><tbody class="Table__TBODY">
        table_tag = table_div.find("table", class_="Table")
        if not table_tag:
            continue

        tbody = table_tag.find("tbody", class_="Table__TBODY")
        if not tbody:
            continue

        for row in tbody.find_all("tr", class_="Table__TR"):
            # Expected cells: NAME, POS, EST. RETURN DATE, STATUS, COMMENT
            tds = row.find_all("td", class_="Table__TD")
            if len(tds) < 5:
                continue

            player_name, pos, est_return, status, comment = (
                td.get_text(strip=True) for td in tds[:5]
            )
            all_rows.append({
                "TEAM_NAME": team_name,
                "PLAYER_NAME": player_name,
                "POS": pos,
                "EST_RETURN": est_return,
                "STATUS": status,
                "COMMENT": comment,
            })

    # Passing `columns` keeps the schema stable even when no rows were parsed.
    return pd.DataFrame(all_rows, columns=columns)


import os  # local to this cell: needed to create the output directory

df_injury = scrape_espn_injuries("https://www.espn.com/nba/injuries")

# Add a DATA_DATE (also on an empty frame, so later cells see the same columns)
data_date = datetime.now().strftime("%Y-%m-%d")
df_injury["DATA_DATE"] = data_date

if df_injury.empty:
    # Nothing to persist; indexing `.iloc[0]` on an empty frame would raise.
    print("No data or scraping failed.")
else:
    # Save to CSV, creating the target directory on first run
    os.makedirs("data/injury_reports", exist_ok=True)
    csv_file = f"data/injury_reports/injury_report_{data_date}.csv"
    df_injury.to_csv(csv_file, index=False)
    print(f"Saved injury data to {csv_file}")



In [None]:
# Preview the first rows of the scraped injury report.
df_injury.head()

Star Player Identification

You can use “typical_starters_dict” or any advanced approach to identify who’s considered a “key player” for each team. Then, if they’re out, your model can see a bigger effect on the minutes or usage of the rest of the team.

Minutes vs. Points

Often, you’ll first build a minutes model (that uses IS_OUT, TEAM_HAS_STAR_OUT) and outputs MIN_PROJ. Then you feed MIN_PROJ + other features into your points model.
The partial historical injuries help that minutes model learn “When star is out, player X’s minutes jump by 5.”

------

----

4. Pipeline Automation & Data Storage
4.1 Scheduled Updates
Set up a daily or weekly job that:
Pulls fresh game logs and advanced box scores via the NBA API (or your own data store).
Updates your training dataset.
Optionally retrains or re-fits the model.
Generates new next-game predictions for each player.
4.2 Data Storage
Use a database (e.g., SQLite, PostgreSQL) or a Cloud Data Warehouse (BigQuery, Snowflake) to store:
Historical game logs
Player metadata (e.g., birthdate, draft year, position, injuries)
Team stats / synergy metrics
This central repository simplifies repeated queries and ensures you have a single source of truth.
4.3 Version Control & Logging
Version your model artifacts (e.g., model_v1.0.pkl, model_v1.1.pkl) and keep a record of:
Date of training
Hyperparameters
Performance metrics
Log daily predictions and compare them later to actual game results to measure real-time accuracy.

5. Perform a Residual Analysis
Generate a residual DataFrame: df_residuals = actual_points - predicted_points.
Plot histograms, scatter plots (residuals vs. minutes or usage), or groupby stats (residuals by team or position).
Look for patterns that might suggest missing features or systematic biases.

------

In [None]:
from lib.cleanup_script import remove_markdown_blocks_and_reformat

# Presumably reads the first path and writes a cleaned copy to the second --
# TODO confirm argument order against lib/cleanup_script.py.
remove_markdown_blocks_and_reformat("notebooks/test.py", "notebooks/test_cleaned.py")


------

In [None]:
from nba_api.stats.endpoints import teamplayerdashboard

def get_top_5_players_by_minutes(team_id, season='2024-25'):
    """Return the player IDs of a team's five highest minutes-per-game players.

    Args:
        team_id: NBA team ID passed to the TeamPlayerDashboard endpoint.
        season: Season string such as '2024-25'.

    Returns:
        list: Up to five PLAYER_ID values ordered by descending 'MIN'.
        An empty list is returned (and the reason printed) if the lookup
        fails for any reason, so callers can treat the team as "no data".
    """
    try:
        # teamplayerdashboard gives per-game stats for each player on the team
        dashboard = teamplayerdashboard.TeamPlayerDashboard(
            team_id=team_id,
            season=season,
            per_mode_detailed='PerGame',
            timeout=60
        )
        # Index 1 is used as the per-player table of the response.
        df_players = dashboard.get_data_frames()[1]
        df_players = df_players.sort_values('MIN', ascending=False)
        return df_players['PLAYER_ID'].head(5).tolist()
    except Exception as exc:
        # Best-effort lookup: keep the empty-list fallback, but do not trap
        # KeyboardInterrupt/SystemExit and do not hide why the call failed.
        print(f"[Warning] Top-5 lookup failed for team {team_id} ({season}): {exc}")
        return []

def build_team_starters_dict(unique_team_ids, season='2024-25'):
    """Map every team ID to its list of top-five player IDs by minutes.

    Args:
        unique_team_ids: Iterable of team IDs to look up.
        season: Season string forwarded to ``get_top_5_players_by_minutes``.

    Returns:
        dict: {TEAM_ID: [player_id, ...]} with one entry per supplied team ID.
    """
    return {
        team_id: get_top_5_players_by_minutes(team_id, season)
        for team_id in unique_team_ids
    }


In [None]:
from nba_api.stats.endpoints import boxscoretraditionalv2

def compute_starters_missing_for_games(df, team_starters, season='2024-25'):
    """Count, per game and team, how many top-5 players did not play.

    For each unique GAME_ID in ``df`` the traditional box score is fetched
    and every team's top-5 list is checked against the players who logged
    minutes in that game.

    Args:
        df: DataFrame with at least 'GAME_ID' and 'TEAM_ID' columns.
        team_starters: {TEAM_ID: [top-5 player IDs]}.
        season: Unused; kept so existing callers keep working.

    Returns:
        dict: {(GAME_ID, TEAM_ID): number_of_missing_starters}. Games whose
        box score cannot be fetched fall back to 0 for every team that
        ``df`` lists for that game.
    """
    # MIN values that mean "listed on the box score but never on the floor".
    zero_minute_values = ["0", "0:00", "00:00", "0.0", ""]

    missing_starters_dict = {}

    for game_id in df['GAME_ID'].unique().tolist():
        try:
            boxscore = boxscoretraditionalv2.BoxScoreTraditionalV2(game_id=game_id, timeout=60)
            box_df = boxscore.get_data_frames()[0]

            # A null MIN must count as "did not play": comparing only against
            # '0:00' lets NaN rows through (NaN != '0:00' is True).
            # NOTE(review): DNP/inactive rows are expected to carry a null MIN
            # in this endpoint -- confirm against a real response.
            minutes = box_df['MIN']
            played_mask = minutes.notna() & ~minutes.astype(str).str.strip().isin(zero_minute_values)
            players_who_played = set(box_df.loc[played_mask, 'PLAYER_ID'].tolist())

            for tid in box_df['TEAM_ID'].unique():
                # Teams without a known top-5 list contribute 0 (sum over an empty list).
                top_5 = team_starters.get(tid, [])
                missing_starters_dict[(game_id, tid)] = sum(
                    player_id not in players_who_played for player_id in top_5
                )

        except Exception as exc:
            # Best-effort: report the failure and assume nobody was missing.
            print(f"[Warning] Box score unavailable for game {game_id}: {exc}")
            for tid in df.loc[df['GAME_ID'] == game_id, 'TEAM_ID'].unique():
                missing_starters_dict[(game_id, tid)] = 0

    return missing_starters_dict


In [None]:
def add_starters_missing_features(df, missing_starters_dict):
    """Add 'TEAM_STARTERS_MISSING' and 'OPP_STARTERS_MISSING' columns to ``df``.

    Each row looks up (GAME_ID, TEAM_ID) and (GAME_ID, OPPONENT_TEAM_ID) in
    ``missing_starters_dict``; keys that are absent default to 0.

    Args:
        df: DataFrame with 'GAME_ID', 'TEAM_ID' and 'OPPONENT_TEAM_ID' columns.
            It is modified in place.
        missing_starters_dict: {(GAME_ID, TEAM_ID): missing_count}.

    Returns:
        The same DataFrame object, with the two new columns.
    """
    lookup = missing_starters_dict.get
    game_ids = df['GAME_ID'].tolist()

    # Column-wise lookups avoid building a Series per row (as `df.apply(axis=1)`
    # does) and also work when the frame has no rows.
    df['TEAM_STARTERS_MISSING'] = [
        lookup((game_id, team_id), 0)
        for game_id, team_id in zip(game_ids, df['TEAM_ID'].tolist())
    ]
    df['OPP_STARTERS_MISSING'] = [
        lookup((game_id, opp_id), 0)
        for game_id, opp_id in zip(game_ids, df['OPPONENT_TEAM_ID'].tolist())
    ]
    return df


////////

In [None]:
import concurrent.futures
from nba_api.stats.endpoints import boxscoretraditionalv2

def fetch_boxscore(game_id):
    """Fetch the traditional box score for a single GAME_ID.

    Args:
        game_id: Game identifier passed to BoxScoreTraditionalV2.

    Returns:
        pd.DataFrame: The first data frame of the endpoint response, or an
        empty DataFrame (with the reason printed) if the call fails.
    """
    try:
        boxscore = boxscoretraditionalv2.BoxScoreTraditionalV2(game_id=game_id, timeout=60)
        return boxscore.get_data_frames()[0]
    except Exception as exc:
        # Best-effort fetch: keep the empty-frame fallback, but let
        # KeyboardInterrupt/SystemExit propagate and surface the cause.
        print(f"[Warning] BoxScore fetch failed for {game_id}: {exc}")
        return pd.DataFrame()

def compute_starters_missing_for_games(df, team_starters, season='2024-25', max_workers=8):
    """Count, per game and team, how many top-5 players did not play.

    Box scores for every unique GAME_ID in ``df`` are fetched in parallel
    with ``fetch_boxscore``; each team's top-5 list is then compared with
    the players who logged minutes in that game.

    Args:
        df (pd.DataFrame): DataFrame with columns including 'GAME_ID' and 'TEAM_ID'.
        team_starters (dict): {TEAM_ID -> [top5_player_ids]}
        season (str): Unused; kept so existing callers keep working.
        max_workers (int): Number of threads for parallel requests.

    Returns:
        dict: {(GAME_ID, TEAM_ID): missing_starters_count}. Any (game, team)
        pair present in ``df`` but not resolvable from a box score gets 0.
    """
    unique_game_ids = df['GAME_ID'].unique().tolist()

    # 1) Parallel fetch all boxscores for unique_game_ids
    boxscore_cache = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_gid = {
            executor.submit(fetch_boxscore, gid): gid for gid in unique_game_ids
        }
        for future in concurrent.futures.as_completed(future_to_gid):
            gid = future_to_gid[future]
            try:
                boxscore_cache[gid] = future.result()
            except Exception as exc:
                print(f'[Error] BoxScore fetch failed for {gid}: {exc}')
                boxscore_cache[gid] = pd.DataFrame()  # fallback to empty

    # MIN values that mean "listed on the box score but never on the floor".
    zero_minute_values = ["0", "0:00", "00:00", "0.0", ""]

    # 2) For each game, figure out how many top-5 starters didn't play
    missing_starters_dict = {}
    for game_id, box_df in boxscore_cache.items():
        if not box_df.empty:
            # A null MIN must count as "did not play": an `isin` check against
            # zero strings alone lets NaN rows through.
            # NOTE(review): DNP/inactive rows are expected to carry a null MIN
            # in this endpoint -- confirm against a real response.
            minutes = box_df['MIN']
            played_mask = minutes.notna() & ~minutes.astype(str).str.strip().isin(zero_minute_values)
            players_who_played = set(box_df.loc[played_mask, 'PLAYER_ID'].tolist())

            for tid in box_df['TEAM_ID'].unique():
                top5 = team_starters.get(tid, [])
                missing_starters_dict[(game_id, tid)] = sum(
                    pid not in players_who_played for pid in top5
                )

        # Default to 0 for every team df lists for this game that the box
        # score did not cover (empty fetch or incomplete data).
        for tid in df.loc[df['GAME_ID'] == game_id, 'TEAM_ID'].unique():
            missing_starters_dict.setdefault((game_id, tid), 0)

    return missing_starters_dict


In [None]:
# Model input columns, in the order they are fed to the estimator.
# NOTE(review): this rebinds any DEFAULT_FEATURE_COLS imported earlier in the
# notebook; the last two entries are the starters-missing features.
DEFAULT_FEATURE_COLS = [
    'PIE_AVG_LAST_5',
    'USG_PCT_AVG_LAST_5',
    'EFF_AVG_LAST_5',
    'TS_PCT_AVG_LAST_5',
    'DEF_RATING',
    'OPP_PTS_OFF_TOV',
    'OPP_PTS_2ND_CHANCE',
    'HOME_GAME',
    'REST_DAYS',
    'PTS_AVG_LAST_5',
    'REB_AVG_LAST_5',
    'AST_AVG_LAST_5',
    'FG_PCT_AVG_LAST_5',
    'MIN_AVG_LAST_5',
    'OFF_RATING_AVG_LAST_5',
    'PACE_PER40_AVG_LAST_5',
    'PTS_SEASON_AVG',
    'OPPONENT_POSITION_ALLOWED_PTS',
    'TEAM_VS_OPP_ALLOWED_PTS',
    'PTS_VOL_LAST_5',
    'USG_PCT_VOL_LAST_5',
    'MIN_VOL_LAST_5',
    'TEAM_STARTERS_MISSING',
    'OPP_STARTERS_MISSING',
]

In [None]:
##########################################################
# Below is the full code snippet that puts it all together
##########################################################
season = '2024-25'
opponent_stats = get_opponent_stats(season)
team_map = get_team_abbreviation_id_mapping()

# NOTE(review): `player_names`, `get_player_advanced_stats` and
# `feature_engineering` are not among the imports at the top of this file
# (which bring in `get_player_advanced_stats_parallel` and
# `feature_engineering_pipeline`) -- confirm they are defined in an earlier cell.

# Collect one engineered frame per player and concatenate once at the end;
# calling pd.concat inside the loop re-copies the accumulated data every time.
player_frames = []
for p_name in player_names:
    p_id = get_player_id(p_name)
    if not p_id:
        continue  # unknown player name
    p_gamelog = get_player_game_logs(p_id, season)
    adv_stats = get_player_advanced_stats(p_id, season)
    if p_gamelog.empty or adv_stats.empty:
        continue  # nothing to engineer features from

    p_gamelog['PLAYER_NAME'] = p_name
    merged_df = feature_engineering(p_gamelog, adv_stats, opponent_stats, team_map)
    player_frames.append(merged_df)

# pd.concat raises on an empty list, so keep the empty-frame result in that case.
all_player_data = (
    pd.concat(player_frames, ignore_index=True) if player_frames else pd.DataFrame()
)

# 3) Position & Team aggregator
all_player_data = add_opponent_position_allowed_pts(all_player_data)
all_player_data = add_team_vs_opponent_allowed_pts(all_player_data)

In [None]:
# 3) Build the dictionary of top-5 starters for each team
#    (one lookup per distinct TEAM_ID found in the assembled data)
unique_team_ids = all_player_data['TEAM_ID'].unique().tolist()
team_starters_dict = build_team_starters_dict(unique_team_ids, season='2024-25')

# 4) Compute how many starters are missing for each game/team
#    Result is keyed by (GAME_ID, TEAM_ID).
missing_starters_dict = compute_starters_missing_for_games(
    all_player_data, team_starters_dict, season='2024-25'
)

# 5) Add the new columns
#    (TEAM_STARTERS_MISSING and OPP_STARTERS_MISSING)
all_player_data = add_starters_missing_features(all_player_data, missing_starters_dict)


In [None]:
# Display the assembled frame to inspect the newly added columns.
all_player_data

In [None]:
# 6) Now do train/test split & modeling as usual
# NOTE(review): the imports at the top of this file bring in `train_models`,
# not `train_and_evaluate_models` -- confirm the latter is defined in an
# earlier cell, otherwise this raises NameError.
X_train_scaled, X_test_scaled, y_train, y_test, X_test_original = prepare_data(all_player_data)
best_model = train_and_evaluate_models(X_train_scaled, y_train, X_test_scaled, y_test)
eval_df = evaluate_model(best_model, X_test_scaled, y_test, X_test_original)

# Your further steps, such as predictions for upcoming games, residual analysis, etc.

In [None]:
# Persist the full feature table (without the index column) for reuse.
all_player_data.to_csv('data/all_player_data.csv', index=False)

-----