
Use Python and the NBA API to develop advanced machine learning model that predicts player performance metrics in upcoming game

______
-----------------------------


<h3 style="color:black;font-family:'Segoe UI Variable Display';font-size:40px;text-shadow:0.125px 0.25px 0.25px black;margin:0;font-weight:300;line-height:1;">Part 2.0</h3>
<h3 style="color:black;font-family:'Notes from Paris';font-size:40px;text-shadow:0.125px 0.25px 0.25px black;margin:0;line-height:1;">Part 2.0</h3>
<h3 style="color:black;font-family:'Juicy Advice Outline';font-size:40px;text-shadow:0.125px 0.25px 0.25px black;margin:0;">Part 2.0</h3>
<h3 style="color:black;font-family:'Mencken Std';font-size:40px;text-shadow:0.125px 0.25px 0.25px black;line-height:1;margin:0;">Part 2.0</h3>
<h3 style="color:black;font-family:'Digital-7';font-size:40px;text-shadow:0.125px 0.25px 0.25px black;line-height:1;margin:0;">Part 2.0</h3>
<h3 style="color:black;font-family:'Proxima Nova';font-size:40px;text-shadow:0.125px 0.25px 0.25px black;line-height:1;margin:0;">Part 2.0</h3>
<h3 style="color:black;font-family:'Barlow Condensed';font-size:40px;text-shadow:0.125px 0.25px 0.25px black;line-height:1;margin:0;">Part 2.0</h3>


<h3 style="color:black;font-family:'Lazy Crunch';font-size:40px;text-shadow:0.125px 0.25px 0.25px black;line-height:1;margin:0;">Part 2.0</h3>
<h3 style="color:black;font-family:'Abril Display';font-size:40px;text-shadow:0.125px 0.25px 0.25px black;margin:0;">Part 2.0</h3>



In [1]:
import os
import time
import joblib
import warnings
from datetime import datetime, timedelta

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from nba_api.stats.endpoints import (
    playergamelog,
    boxscoreadvancedv2,
    leaguedashteamstats,
    scoreboardv2,
    commonplayerinfo,
)
from nba_api.stats.static import players, teams

from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

warnings.filterwarnings("ignore")

Developing a machine learning model to predict NBA player performance metrics like points involves several steps:

Data Collection: Gather historical and current season data using the NBA API, including advanced statistics such as Player Impact Estimate (PIE), Efficiency (EFF), Player Efficiency Rating (PER), trends, opponent data, and more.

Data Preprocessing: Clean and preprocess the data to prepare it for modeling.

Feature Engineering: Create features that capture the important aspects influencing player performance.

Model Training: Choose and train a suitable machine learning model.

Model Evaluation: Assess the model's performance and fine-tune as necessary.

Prediction: Use the trained model to predict future player performance.

----------------

<h3 style="color:black;font-family:'Juicy Advice';font-size:40px;text-shadow:0.25px 0.25px 0.25px black;margin:0;">Part 1. Data Collection</h3>


1. Utility functions

In [2]:
# Function to get player ID from name
def get_player_id(player_name):
    nba_players = players.get_players()
    player = next((p for p in nba_players if p['full_name'].lower() == player_name.lower()), None)
    if player:
        return player['id']
    print(f"No player found with name {player_name}")
    return None

# Function to get team abbreviation to ID mapping
def get_team_abbreviation_id_mapping():
    """Create a mapping from team abbreviations to team IDs."""
    nba_teams = teams.get_teams()
    return {team['abbreviation']: team['id'] for team in nba_teams}

# Function to get the player's current team ID
def get_player_team_id(player_id):
    """Fetch the player's current team ID using the commonplayerinfo endpoint."""
    try:
        player_info = commonplayerinfo.CommonPlayerInfo(player_id=player_id).get_data_frames()[0]
        team_id = player_info['TEAM_ID'].iloc[0]
        return int(team_id)
    except Exception as e:
        print(f"Error fetching team ID for player {player_id}: {e}")
        return None

def get_team_name(team_id):
    nba_teams = teams.get_teams()
    team = next((team for team in nba_teams if team['id'] == team_id), None)
    return team['full_name'] if team else 'Unknown Team'


-----------------------------------

2. Data Fetching Functions

In [3]:
# Function to get player game logs
def get_player_game_logs(player_id, season='2024-25'):
    """Fetch player game logs for the season."""
    try:
        gamelog = playergamelog.PlayerGameLog(player_id=player_id, season=season, timeout=60)
        gamelog_df = gamelog.get_data_frames()[0]
        # Convert all column names to uppercase
        gamelog_df.columns = gamelog_df.columns.str.upper()
        return gamelog_df
    except Exception as e:
        print(f"Error fetching game logs for player {player_id}: {e}")
        return pd.DataFrame()

# Function to get player advanced stats
def get_player_advanced_stats(player_id, season='2024-25'):
    """Fetch advanced stats for all games played by the player in the specified season."""
    gamelog_df = get_player_game_logs(player_id, season)
    advanced_stats_list = []

    # Loop through each game and fetch advanced stats
    for game_id in gamelog_df['GAME_ID']:
        success = False
        attempts = 0
        while not success and attempts < 3:
            try:
                boxscore = boxscoreadvancedv2.BoxScoreAdvancedV2(game_id=game_id, timeout=60)
                player_stats = boxscore.player_stats.get_data_frame()
                # Filter stats for the specific player
                player_adv_stats = player_stats[player_stats['PLAYER_ID'] == int(player_id)]
                advanced_stats_list.append(player_adv_stats)
                success = True
            except Exception as e:
                attempts += 1
                print(f"Error fetching advanced stats for game {game_id}: {e}")
                time.sleep(2)
                continue
        time.sleep(1)  # Delay between requests

    if advanced_stats_list:
        # Concatenate all game stats into a single DataFrame
        advanced_stats_df = pd.concat(advanced_stats_list, ignore_index=True)
        
       #'START_POSITION', 'MIN', 'E_OFF_RATING', 'OFF_RATING', 'E_DEF_RATING', 'DEF_RATING', 
       # 'E_NET_RATING', 'NET_RATING', 'AST_PCT', 'AST_TOV', 'AST_RATIO', 'OREB_PCT', 'DREB_PCT', 
       # 'REB_PCT', 'TM_TOV_PCT', 'EFG_PCT', 'TS_PCT', 'E_USG_PCT', 'E_PACE', 'PACE', 'PACE_PER40', 'POSS'
       
        advanced_stats_df = advanced_stats_df[['GAME_ID', 'PLAYER_ID', 'USG_PCT', 'PIE', 'TEAM_ID', 
                                               'OFF_RATING', 'PACE_PER40']]
        return advanced_stats_df
    else:
        return pd.DataFrame()

# Function to get opponent stats
def get_opponent_stats(season='2024-25'):
    """Fetch defensive stats for all teams."""
    try:
        team_stats = leaguedashteamstats.LeagueDashTeamStats(
            season=season,
            measure_type_detailed_defense='Defense',
            per_mode_detailed='PerGame',
            timeout=60).get_data_frames()[0]
        return team_stats[[
            'TEAM_ID', 'DEF_RATING', 'OPP_PTS_OFF_TOV', 'OPP_PTS_2ND_CHANCE'
        ]]
    except Exception as e:
        print(f"Error fetching opponent stats: {e}")
        return pd.DataFrame()


Expand the Dataset: Include data from multiple players to enhance the model's generalizability.

Model Selection: Experiment with different algorithms like LightGBM, CatBoost, or ensemble methods to potentially improve performance.

Deployment: Set up a pipeline that updates the data and retrains the model regularly.

--------------------

3. Feature Engineering Functions

In [4]:
# Function to compute efficiency (EFF)
def compute_efficiency(player_gamelog):
    """Compute EFF for each game in the game log."""
    # Ensure all required columns are present
    # Calculate EFF
    player_gamelog['EFF'] = (player_gamelog['PTS'] + player_gamelog['REB'] +
                             player_gamelog['AST'] + player_gamelog['STL'] +
                             player_gamelog['BLK'] -
                             (player_gamelog['FGA'] - player_gamelog['FGM']) -
                             (player_gamelog['FTA'] - player_gamelog['FTM']) -
                             player_gamelog['TOV'])
    return player_gamelog

# Function to compute true shooting percentage (TS_PCT)
def compute_true_shooting_percentage(player_gamelog):
    """Compute TS_PCT for each game in the game log."""
    # Ensure all required columns are present
    # Avoid division by zero
    player_gamelog['TS_DENOM'] = 2 * (player_gamelog['FGA'] +
                                      0.44 * player_gamelog['FTA'])
    player_gamelog['TS_PCT'] = player_gamelog.apply(
        lambda row: row['PTS'] / row['TS_DENOM']
        if row['TS_DENOM'] != 0 else 0,
        axis=1)
    player_gamelog.drop(columns=['TS_DENOM'], inplace=True)
    return player_gamelog


In [5]:
def feature_engineering(player_gamelog, advanced_stats, opponent_stats, team_abbrev_to_id):
    """Generate features using advanced stats and opponent data."""
    # Compute EFF and TS_PCT
    player_gamelog = compute_efficiency(player_gamelog)
    player_gamelog = compute_true_shooting_percentage(player_gamelog)

    # Merge advanced stats into game logs on 'GAME_ID' and 'PLAYER_ID'
    player_data = pd.merge(
        player_gamelog,
        advanced_stats,
        on=['GAME_ID', 'PLAYER_ID'],
        how='left'
    )

    # Extract opponent team abbreviation
    player_data['OPPONENT_ABBREVIATION'] = player_data['MATCHUP'].apply(lambda x: x.split(' ')[-1])
    # Map team abbreviations to team IDs
    player_data['OPPONENT_TEAM_ID'] = player_data['OPPONENT_ABBREVIATION'].map(team_abbrev_to_id)

    if opponent_stats is not None:
        # Merge opponent defensive stats
        player_data = pd.merge(
            player_data,
            opponent_stats,
            left_on='OPPONENT_TEAM_ID',
            right_on='TEAM_ID',
            how='left'
        )
        # Handle missing values in opponent stats
        opponent_stat_cols = ['DEF_RATING', 'OPP_PTS_OFF_TOV', 'OPP_PTS_2ND_CHANCE']
        player_data[opponent_stat_cols] = player_data[opponent_stat_cols].fillna(player_data[opponent_stat_cols].mean())

    # Calculate recent performance trends
    player_data = player_data.sort_values(['PLAYER_NAME', 'GAME_DATE']).reset_index(drop=True)

    # Convert 'MIN' to numeric
    player_data['MIN'] = player_data['MIN'].apply(
        lambda x: float(x.split(':')[0]) + float(x.split(':')[1]) / 60
        if isinstance(x, str) else x
    )

    # Shooting efficiency
    player_data['FG_PCT'] = player_data['FGM'] / player_data['FGA']

    # Rolling averages for advanced stats with shift
    rolling_stats = [
        'PIE', 'USG_PCT', 'PTS', 'REB', 'AST', 'EFF',
        'TS_PCT', 'MIN', 'FG_PCT', 'OFF_RATING', 'PACE_PER40'
    ]
    for stat in rolling_stats:
        player_data[f'{stat}_AVG_LAST_5'] = player_data.groupby('PLAYER_NAME')[stat].transform(
            lambda x: x.shift(0).rolling(window=5, min_periods=1).mean()
         )
        
    # Calculate season averages
    cumulative_stats = ['PTS']
    for stat in cumulative_stats:
        player_data[f'{stat}_SEASON_AVG'] = player_data.groupby('PLAYER_NAME')[stat].transform(
            lambda x: x.shift(1).expanding().mean()
        )
        
        
    # Indicator for home games
    player_data['HOME_GAME'] = player_data['MATCHUP'].apply(lambda x: 1 if 'vs.' in x else 0)

    # Calculate rest days between games
    player_data['GAME_DATE'] = pd.to_datetime(player_data['GAME_DATE'])
    player_data['REST_DAYS'] = player_data.groupby('PLAYER_NAME')['GAME_DATE'].diff().dt.days.fillna(0)

    # Drop unnecessary columns
    columns_to_drop = ['TEAM_ID_y', 'TEAM_ID_x']
    player_data = player_data.drop(columns=columns_to_drop, errors='ignore')

    # Handle missing values
    player_data = player_data.fillna(method='ffill').fillna(method='bfill')

    return player_data


---------------------------------

4. Data Preparation Functions

In [6]:
# Function to prepare data for training
def prepare_data(player_data):
    """Prepare features and target variables."""
    features = player_data[[
        'PIE_AVG_LAST_5', 'USG_PCT_AVG_LAST_5', 'EFF_AVG_LAST_5',
        'TS_PCT_AVG_LAST_5', 'DEF_RATING', 'OPP_PTS_OFF_TOV',
        'OPP_PTS_2ND_CHANCE', 'HOME_GAME', 'REST_DAYS', 'PTS_AVG_LAST_5',
        'REB_AVG_LAST_5', 'AST_AVG_LAST_5', 'FG_PCT_AVG_LAST_5',
        'MIN_AVG_LAST_5', 
        'OFF_RATING_AVG_LAST_5', 'PACE_PER40_AVG_LAST_5',
        'PTS_SEASON_AVG'
    ]]

    # Include PLAYER_NAME for grouping if needed
    features['PLAYER_NAME'] = player_data['PLAYER_NAME']
    target = player_data['PTS']

    # Split the data into training and testing sets without shuffling
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, shuffle=False
    )

    # Feature scaling (excluding PLAYER_NAME)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train.drop(columns=['PLAYER_NAME']))
    X_test_scaled = scaler.transform(X_test.drop(columns=['PLAYER_NAME']))

    # Store the scaler for future use
    joblib.dump(scaler, 'lib/scaler.pkl')

    return X_train_scaled, X_test_scaled, y_train, y_test, X_test['PLAYER_NAME'].values


----------------------------

5. Model Training and Evaluation

In [7]:
def train_model(X_train, y_train):
    """Train the model with time series cross-validation and hyperparameter tuning."""
    # Define parameter grid for XGBoost
    param_grid = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
    xgb_model = XGBRegressor(random_state=42)

    # Initialize TimeSeriesSplit
    tscv = TimeSeriesSplit(n_splits=5)

    grid_search = GridSearchCV(
        estimator=xgb_model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=tscv,
        n_jobs=-1,
        verbose=1
    )
    grid_search.fit(X_train, y_train)
    print(f"Best parameters found: {grid_search.best_params_}")
    print(f"Best RMSE: {np.sqrt(-grid_search.best_score_):.2f}")
    best_model = grid_search.best_estimator_
    return best_model

def train_and_evaluate_models(X_train, y_train, X_test, y_test):
    """Train and evaluate different models."""
    models = {
        'XGBoost': XGBRegressor(random_state=42),
        'LightGBM': LGBMRegressor(random_state=42),
        'CatBoost': CatBoostRegressor(random_state=42, verbose=0)
    }

    best_model = None
    best_rmse = float('inf')

    for name, model in models.items():
        print(f"\nTraining {name}...")
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        mse = mean_squared_error(y_test, predictions)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)
        print(f"{name} Performance:")
        print(f"  RMSE: {rmse:.2f}")
        print(f"  MAE: {mae:.2f}")
        print(f"  R2 Score: {r2:.2f}")

        if rmse < best_rmse:
            best_rmse = rmse
            best_model = model

    print(f"\nBest model: {type(best_model).__name__} with RMSE: {best_rmse:.2f}")
    return best_model


6. Model Evaluation

In [8]:
# Function to evaluate the model
def evaluate_model(model, X_test, y_test, player_names):
    """Evaluate the model's performance on the test set."""
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    print(f"\nEvaluation on Test Data:")
    print(f"  RMSE: {rmse:.2f}")
    print(f"  MAE: {mae:.2f}")
    print(f"  R2 Score: {r2:.2f}")

    # Create a DataFrame for detailed evaluation
    eval_df = pd.DataFrame({
        'PLAYER_NAME': player_names,
        'Actual_PTS': y_test,
        'Predicted_PTS': predictions
    })
    print("\nSample Predictions:")
    print(eval_df.head(10))

    return predictions


--------------------------------

7. Model Prediction Functions


In [9]:
# Function to get team defensive stats
def get_team_defensive_stats(team_id, season='2024-25'):
    """Fetch defensive stats for a specific team."""
    try:
        team_stats = leaguedashteamstats.LeagueDashTeamStats(
            team_id_nullable=team_id,
            season=season,
            measure_type_detailed_defense='Defense',
            per_mode_detailed='PerGame',
            timeout=60
        ).get_data_frames()[0]
        return team_stats[['TEAM_ID', 'DEF_RATING', 'OPP_PTS_OFF_TOV', 'OPP_PTS_2ND_CHANCE']].iloc[0]
    except Exception as e:
        print(f"Error fetching stats for team {team_id}: {e}")
        return None


In [10]:
# Function to get the next game information
def get_next_game_info(player_team_id):
    """Fetch the next game date and opponent team ID for the player's team."""
    # Start from tomorrow
    next_game_date = datetime.now() + timedelta(days=1)
    max_days_ahead = 14  # Extend the search to the next 14 days

    for _ in range(max_days_ahead):
        game_date_str = next_game_date.strftime('%Y-%m-%d')
        try:
            scoreboard = scoreboardv2.ScoreboardV2(game_date=game_date_str)
            games = scoreboard.game_header.get_data_frame()
            # Filter games involving the player's team
            team_games = games[(games['HOME_TEAM_ID'] == player_team_id) | (games['VISITOR_TEAM_ID'] == player_team_id)]
            if not team_games.empty:
                next_game = team_games.iloc[0]
                opponent_team_id = next_game['VISITOR_TEAM_ID'] if next_game['HOME_TEAM_ID'] == player_team_id else next_game['HOME_TEAM_ID']
                # Determine if it's a home game
                home_game = 1 if next_game['HOME_TEAM_ID'] == player_team_id else 0
                return next_game_date, opponent_team_id, home_game
        except Exception as e:
            print(f"Error fetching games for {game_date_str}: {e}")
        
        # Increment the date by one day
        next_game_date += timedelta(days=1)
    
    print("No upcoming games found in the next 14 days.")
    return None, None, None


In [11]:
def prepare_features_for_prediction(player_id, player_name, season='2024-25'):
    """Prepare features for predicting the player's next game."""
    # Fetch the latest game logs
    player_gamelog = get_player_game_logs(player_id, season)
    
    # Sort game logs in descending order to have the most recent game first
    player_gamelog['GAME_DATE'] = pd.to_datetime(player_gamelog['GAME_DATE'])
    player_gamelog = player_gamelog.sort_values('GAME_DATE', ascending=False)
    player_gamelog['PLAYER_NAME'] = player_name  # Add PLAYER_NAME

    # Get the player's current team ID
    player_team_id = get_player_team_id(player_id)

    # Get the next game date and opponent team ID
    next_game_date, opponent_team_id, home_game = get_next_game_info(player_team_id)

    # Fetch opponent stats
    opponent_stats = get_team_defensive_stats(opponent_team_id, season)

    # Prepare player data up to the most recent game
    advanced_stats = get_player_advanced_stats(player_id, season)
    team_abbrev_to_id = get_team_abbreviation_id_mapping()

    # Use only games up to the most recent game date
    latest_game_date = player_gamelog['GAME_DATE'].iloc[0]
    player_data = player_gamelog[player_gamelog['GAME_DATE'] <= latest_game_date]
    advanced_stats = advanced_stats[advanced_stats['GAME_ID'].isin(player_data['GAME_ID'])]

    # Feature engineering without merging opponent stats
    player_data = feature_engineering(player_data, advanced_stats, None, team_abbrev_to_id)

    # Prepare the latest data point for prediction
    latest_data = player_data.iloc[-1].copy()
    print(latest_data)

    # Update features for the next game
    latest_data['REST_DAYS'] = (next_game_date - pd.to_datetime(latest_data['GAME_DATE'])).days
    latest_data['GAME_DATE'] = next_game_date  # Update to next game date

    # Update HOME_GAME indicator
    latest_data['HOME_GAME'] = home_game

    # Update 'OPPONENT_TEAM_ID' and 'OPPONENT_ABBREVIATION' to the upcoming opponent
    latest_data['OPPONENT_TEAM_ID'] = opponent_team_id
    latest_data['OPPONENT_ABBREVIATION'] = get_team_name(opponent_team_id)

    # Update opponent-specific features with upcoming opponent's stats
    latest_data['DEF_RATING'] = opponent_stats['DEF_RATING']
    latest_data['OPP_PTS_OFF_TOV'] = opponent_stats['OPP_PTS_OFF_TOV']
    latest_data['OPP_PTS_2ND_CHANCE'] = opponent_stats['OPP_PTS_2ND_CHANCE']

    # Prepare feature vector
    features = latest_data[[
        'PIE_AVG_LAST_5', 'USG_PCT_AVG_LAST_5', 'EFF_AVG_LAST_5',
        'TS_PCT_AVG_LAST_5', 'DEF_RATING', 'OPP_PTS_OFF_TOV',
        'OPP_PTS_2ND_CHANCE', 'HOME_GAME', 'REST_DAYS', 'PTS_AVG_LAST_5',
        'REB_AVG_LAST_5', 'AST_AVG_LAST_5', 'FG_PCT_AVG_LAST_5',
        'MIN_AVG_LAST_5', 
        'OFF_RATING_AVG_LAST_5',
        'PACE_PER40_AVG_LAST_5', 'PTS_SEASON_AVG'
    ]]

    return features.values.reshape(1, -1), latest_data





In [12]:
# Function to predict upcoming points
def predict_upcoming_points(player_name, season='2024-25'):
    player_id = get_player_id(player_name)
    # Prepare features for prediction
    feature_vector, latest_data = prepare_features_for_prediction(player_id, player_name, season)
    # Load the scaler and model
    try:
        scaler = joblib.load('lib/scaler.pkl')
        model = joblib.load('lib/player_points_model.pkl')
    except Exception as e:
        print(f"Error loading model or scaler: {e}")
        return

    # Scale the features
    feature_vector_scaled = scaler.transform(feature_vector)
    # Make prediction
    predicted_points = model.predict(feature_vector_scaled)

    # Retrieve the game date and opponent team ID
    game_date = latest_data['GAME_DATE'].strftime('%Y-%m-%d')
    opponent_team_id = latest_data['OPPONENT_TEAM_ID']
    opponent_team_name = get_team_name(opponent_team_id)

    print(f"Predicted points for {player_name} in the upcoming game on {game_date} against {opponent_team_name}: {predicted_points[0]:.2f}")

    return predicted_points[0]

--------

-------

8. Feature Importance



In [13]:
def plot_feature_importance(model, feature_names):
    """Plot the feature importance."""
    importance = model.feature_importances_
    feature_importance = pd.DataFrame({
        'feature': feature_names,
        'importance': importance
    }).sort_values(by='importance', ascending=False)

    plt.figure(figsize=(12, 8))
    sns.barplot(x='importance', y='feature', data=feature_importance)
    plt.title('Feature Importance')
    plt.xlabel('Importance Score')
    plt.ylabel('Features')
    plt.show()



------

In [14]:
# Define the list of player names
player_names = [
    "LeBron James",
    "Kevin Durant",
    "Stephen Curry",
    "Giannis Antetokounmpo",
    "Luka Dončić",
    "Joel Embiid",
    "Jayson Tatum",
    "Nikola Jokić",
    "Shai Gilgeous-Alexander",
    "Karl-Anthony Towns",
    "Victor Wembanyama",
    "Damian Lillard",
    "Donovan Mitchell",
    "Anthony Davis",
    "Domantas Sabonis",
    "James Harden",
    "Kyrie Irving", "Anthony Edwards", "Jimmy Butler",
    # Add more player names as desired
]

# Initialize empty DataFrame to store combined data
all_player_data = pd.DataFrame()

# Fetch opponent stats once (since it's common for all players)
season = '2024-25'
opponent_stats = get_opponent_stats(season)
team_abbrev_to_id = get_team_abbreviation_id_mapping()

# Loop through each player and fetch their data
for player_name in player_names:
    player_id = get_player_id(player_name)
    if not player_id:
        continue  # Skip if player ID not found

    # Fetch game logs
    player_gamelog = get_player_game_logs(player_id, season)
    advanced_stats = get_player_advanced_stats(player_id, season)
    # Add PLAYER_NAME to the DataFrame
    player_gamelog['PLAYER_NAME'] = player_name

    # Feature engineering
    player_data = feature_engineering(player_gamelog, advanced_stats, opponent_stats, team_abbrev_to_id)
    # Add to the combined DataFrame
    all_player_data = pd.concat([all_player_data, player_data], ignore_index=True)


# Prepare data
X_train_scaled, X_test_scaled, y_train, y_test, player_names_test = prepare_data(all_player_data)

# Train and evaluate models
best_model = train_and_evaluate_models(X_train_scaled, y_train, X_test_scaled, y_test)
# Save the best model
joblib.dump(best_model, 'lib/player_points_model.pkl')

# Evaluate the best model
evaluate_model(best_model, X_test_scaled, y_test, player_names_test)



Training XGBoost...
XGBoost Performance:
  RMSE: 6.89
  MAE: 5.31
  R2 Score: 0.30

Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002154 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 997
[LightGBM] [Info] Number of data points in the train set: 268, number of used features: 17
[LightGBM] [Info] Start training from score 25.723881
LightGBM Performance:
  RMSE: 6.88
  MAE: 5.73
  R2 Score: 0.30

Training CatBoost...
CatBoost Performance:
  RMSE: 6.68
  MAE: 5.38
  R2 Score: 0.34

Best model: CatBoostRegressor with RMSE: 6.68

Evaluation on Test Data:
  RMSE: 6.68
  MAE: 5.38
  R2 Score: 0.34

Sample Predictions:
      PLAYER_NAME  Actual_PTS  Predicted_PTS
268  James Harden          20      25.093952
269  James Harden          12      15.842785
270  James Harden          24      21.400862
271  James Harden          22      25.534822
272  James Harden          23      22.80935

array([25.09395201, 15.84278498, 21.40086171, 25.53482165, 22.80935072,
       23.0342528 , 34.95179246, 31.1144176 , 31.37306976, 30.46006167,
       29.55479457, 25.06431975, 25.01639604, 13.2775928 , 15.2775235 ,
       18.03243337, 21.00963891, 24.58182799, 29.90293838, 29.19239094,
       26.47623003, 29.75225345, 27.52603218, 24.23127772, 27.09113434,
       26.60464836, 25.55590283, 26.55978757, 20.26155996, 22.07565967,
       22.69723923, 27.1003364 , 25.27369206, 15.34601257, 17.21214556,
       17.98568278, 24.33740715, 29.6027623 , 30.39514887, 27.90097319,
       27.90428644, 29.88142441, 30.29890569, 27.39942967, 26.99090504,
       24.02752673, 22.21934328, 25.45245895, 24.41005272, 26.94368552,
       28.09924292, 26.93290268, 28.96423559, 22.35065071, 24.37653391,
       27.1141497 , 22.17793788, 16.58882347, 21.25578012, 25.08671599,
       27.20891729, 23.8786166 , 28.34886339, 22.77590872, 23.67211231,
       21.36009393, 21.03719161])

In [15]:
# Predict upcoming points for each player
for player_name in player_names:
    print("\n---")
    predict_upcoming_points(player_name, season)


---
SEASON_ID                              22024
PLAYER_ID                               2544
GAME_ID                           0022400318
GAME_DATE                2024-12-02 00:00:00
MATCHUP                            LAL @ MIN
WL                                         L
MIN                                       31
FGM                                        4
FGA                                       16
FG_PCT                                  0.25
FG3M                                       0
FG3A                                       4
FG3_PCT                                  0.0
FTM                                        2
FTA                                        4
FT_PCT                                   0.5
OREB                                       3
DREB                                       5
REB                                        8
AST                                        4
STL                                        0
BLK                                        0
TOV  