
Use Python and the NBA API to develop advanced machine learning model that predicts player performance metrics in upcoming game

______
-----------------------------


<h3 style="color:black;font-family:'Segoe UI Variable Display';font-size:40px;text-shadow:0.125px 0.25px 0.25px black;margin:0;font-weight:300;line-height:1;">Part 2.0</h3>
<h3 style="color:black;font-family:'Notes from Paris';font-size:40px;text-shadow:0.125px 0.25px 0.25px black;margin:0;line-height:1;">Part 2.0</h3>
<h3 style="color:black;font-family:'Juicy Advice Outline';font-size:40px;text-shadow:0.125px 0.25px 0.25px black;margin:0;">Part 2.0</h3>
<h3 style="color:black;font-family:'Mencken Std';font-size:40px;text-shadow:0.125px 0.25px 0.25px black;line-height:1;margin:0;">Part 2.0</h3>
<h3 style="color:black;font-family:'Digital-7';font-size:40px;text-shadow:0.125px 0.25px 0.25px black;line-height:1;margin:0;">Part 2.0</h3>
<h3 style="color:black;font-family:'Proxima Nova';font-size:40px;text-shadow:0.125px 0.25px 0.25px black;line-height:1;margin:0;">Part 2.0</h3>
<h3 style="color:black;font-family:'Barlow Condensed';font-size:40px;text-shadow:0.125px 0.25px 0.25px black;line-height:1;margin:0;">Part 2.0</h3>


<h3 style="color:black;font-family:'Lazy Crunch';font-size:40px;text-shadow:0.125px 0.25px 0.25px black;line-height:1;margin:0;">Part 2.0</h3>
<h3 style="color:black;font-family:'Abril Display';font-size:40px;text-shadow:0.125px 0.25px 0.25px black;margin:0;">Part 2.0</h3>



In [12]:
import os
import time
import joblib
import warnings
from datetime import datetime, timedelta

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from nba_api.stats.endpoints import (
    playergamelog,
    boxscoreadvancedv2,
    leaguedashteamstats,
    scoreboardv2,
    commonplayerinfo,
)
from nba_api.stats.static import players, teams

from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

warnings.filterwarnings("ignore")

Developing a machine learning model to predict NBA player performance metrics like points involves several steps:

Data Collection: Gather historical and current season data using the NBA API, including advanced statistics such as Player Impact Estimate (PIE), Efficiency (EFF), Player Efficiency Rating (PER), trends, opponent data, and more.

Data Preprocessing: Clean and preprocess the data to prepare it for modeling.

Feature Engineering: Create features that capture the important aspects influencing player performance.

Model Training: Choose and train a suitable machine learning model.

Model Evaluation: Assess the model's performance and fine-tune as necessary.

Prediction: Use the trained model to predict future player performance.

-----


<h3 style="color:black;font-family:'Juicy Advice';font-size:40px;text-shadow:0.25px 0.25px 0.25px black;margin:0;">Part 1. Data Collection</h3>


1. Utility functions

In [13]:

# Function to get player ID from name
def get_player_id(player_name):
    nba_players = players.get_players()
    player = next((p for p in nba_players if p['full_name'].lower() == player_name.lower()), None)
    return player['id'] if player else None

# Function to get team abbreviation to ID mapping
def get_team_abbreviation_id_mapping():
    nba_teams = teams.get_teams()
    return {team['abbreviation']: team['id'] for team in nba_teams}

# Function to get the player's current team ID
def get_player_team_id(player_id):
    try:
        player_info = commonplayerinfo.CommonPlayerInfo(player_id=player_id).get_data_frames()[0]
        return int(player_info['TEAM_ID'].iloc[0])
    except Exception as e:
        print(f"Error fetching team ID for player {player_id}: {e}")
        return None

def get_team_name(team_id):
    nba_teams = teams.get_teams()
    team = next((team for team in nba_teams if team['id'] == team_id), None)
    return team['full_name'] if team else 'Unknown Team'


-----------------------------------

2. Data Fetching Functions

In [14]:

# Function to get player game logs
def get_player_game_logs(player_id, season='2024-25'):
    try:
        gamelog = playergamelog.PlayerGameLog(player_id=player_id, season=season, timeout=60)
        gamelog_df = gamelog.get_data_frames()[0]
        gamelog_df.columns = gamelog_df.columns.str.upper()
        return gamelog_df
    except Exception as e:
        print(f"Error fetching game logs for player {player_id}: {e}")
        return pd.DataFrame()

# Function to get player advanced stats
def get_player_advanced_stats(player_id, season='2024-25'):
    gamelog_df = get_player_game_logs(player_id, season)
    advanced_stats_list = []

    for game_id in gamelog_df['GAME_ID']:
        for _ in range(3):
            try:
                boxscore = boxscoreadvancedv2.BoxScoreAdvancedV2(game_id=game_id, timeout=60)
                player_stats = boxscore.player_stats.get_data_frame()
                player_adv_stats = player_stats[player_stats['PLAYER_ID'] == int(player_id)]
                advanced_stats_list.append(player_adv_stats)
                break
            except Exception as e:
                print(f"Error fetching advanced stats for game {game_id}: {e}")
                time.sleep(2)
        time.sleep(1)

    if advanced_stats_list:
        advanced_stats_df = pd.concat(advanced_stats_list, ignore_index=True)
        return advanced_stats_df[['GAME_ID', 'PLAYER_ID', 'USG_PCT', 'PIE', 'TEAM_ID', 'OFF_RATING', 'PACE_PER40']]
    return pd.DataFrame()

# Function to get opponent stats
def get_opponent_stats(season='2024-25'):
    try:
        team_stats = leaguedashteamstats.LeagueDashTeamStats(
            season=season,
            measure_type_detailed_defense='Defense',
            per_mode_detailed='PerGame',
            timeout=60).get_data_frames()[0]
        return team_stats[['TEAM_ID', 'DEF_RATING', 'OPP_PTS_OFF_TOV', 'OPP_PTS_2ND_CHANCE']]
    except Exception as e:
        print(f"Error fetching opponent stats: {e}")
        return pd.DataFrame()


--------------------

3. Feature Engineering Functions

In [15]:

# Function to compute efficiency (EFF)
def compute_efficiency(player_gamelog):
    player_gamelog['EFF'] = (
        player_gamelog['PTS'] + player_gamelog['REB'] +
        player_gamelog['AST'] + player_gamelog['STL'] +
        player_gamelog['BLK'] -
        (player_gamelog['FGA'] - player_gamelog['FGM']) -
        (player_gamelog['FTA'] - player_gamelog['FTM']) -
        player_gamelog['TOV']
    )
    return player_gamelog

# Function to compute true shooting percentage (TS_PCT)
def compute_true_shooting_percentage(player_gamelog):
    player_gamelog['TS_DENOM'] = 2 * (player_gamelog['FGA'] + 0.44 * player_gamelog['FTA'])
    player_gamelog['TS_PCT'] = player_gamelog.apply(
        lambda row: row['PTS'] / row['TS_DENOM'] if row['TS_DENOM'] != 0 else 0,
        axis=1
    )
    player_gamelog.drop(columns=['TS_DENOM'], inplace=True)
    return player_gamelog

def feature_engineering(player_gamelog, advanced_stats, opponent_stats, team_abbrev_to_id):
    player_gamelog = compute_efficiency(player_gamelog)
    player_gamelog = compute_true_shooting_percentage(player_gamelog)

    player_data = pd.merge(
        player_gamelog,
        advanced_stats,
        on=['GAME_ID', 'PLAYER_ID'],
        how='left'
    )

    player_data['OPPONENT_ABBREVIATION'] = player_data['MATCHUP'].apply(lambda x: x.split(' ')[-1])
    player_data['OPPONENT_TEAM_ID'] = player_data['OPPONENT_ABBREVIATION'].map(team_abbrev_to_id)

    if opponent_stats is not None:
        player_data = pd.merge(
            player_data,
            opponent_stats,
            left_on='OPPONENT_TEAM_ID',
            right_on='TEAM_ID',
            how='left'
        )
        opponent_stat_cols = ['DEF_RATING', 'OPP_PTS_OFF_TOV', 'OPP_PTS_2ND_CHANCE']
        player_data[opponent_stat_cols] = player_data[opponent_stat_cols].fillna(player_data[opponent_stat_cols].mean())

    player_data = player_data.sort_values(['PLAYER_NAME', 'GAME_DATE']).reset_index(drop=True)

    player_data['MIN'] = player_data['MIN'].apply(
        lambda x: float(x.split(':')[0]) + float(x.split(':')[1]) / 60 if isinstance(x, str) else x
    )

    player_data['FG_PCT'] = player_data['FGM'] / player_data['FGA']

    rolling_stats = [
        'PIE', 'USG_PCT', 'PTS', 'REB', 'AST', 'EFF',
        'TS_PCT', 'MIN', 'FG_PCT', 'OFF_RATING', 'PACE_PER40'
    ]
    for stat in rolling_stats:
        player_data[f'{stat}_AVG_LAST_5'] = player_data.groupby('PLAYER_NAME')[stat].transform(
            lambda x: x.shift(0).rolling(window=5, min_periods=1).mean()
        )

    cumulative_stats = ['PTS']
    for stat in cumulative_stats:
        player_data[f'{stat}_SEASON_AVG'] = player_data.groupby('PLAYER_NAME')[stat].transform(
            lambda x: x.shift(1).expanding().mean()
        )

    player_data['HOME_GAME'] = player_data['MATCHUP'].apply(lambda x: 1 if 'vs.' in x else 0)

    player_data['GAME_DATE'] = pd.to_datetime(player_data['GAME_DATE'])
    player_data['REST_DAYS'] = player_data.groupby('PLAYER_NAME')['GAME_DATE'].diff().dt.days.fillna(0)

    columns_to_drop = ['TEAM_ID_y', 'TEAM_ID_x']
    player_data = player_data.drop(columns=columns_to_drop, errors='ignore')

    player_data = player_data.fillna(method='ffill').fillna(method='bfill')
    

    return player_data


---------------------------------

4. Data Preparation Functions

In [16]:
# Function to prepare data for training
def prepare_data(player_data):
    features = player_data[[
        'PIE_AVG_LAST_5', 'USG_PCT_AVG_LAST_5', 'EFF_AVG_LAST_5',
        'TS_PCT_AVG_LAST_5', 'DEF_RATING', 'OPP_PTS_OFF_TOV',
        'OPP_PTS_2ND_CHANCE', 'HOME_GAME', 'REST_DAYS', 'PTS_AVG_LAST_5',
        'REB_AVG_LAST_5', 'AST_AVG_LAST_5', 'FG_PCT_AVG_LAST_5',
        'MIN_AVG_LAST_5', 
        'OFF_RATING_AVG_LAST_5', 'PACE_PER40_AVG_LAST_5',
        'PTS_SEASON_AVG'
    ]]

    features['PLAYER_NAME'] = player_data['PLAYER_NAME']
    target = player_data['PTS']

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, shuffle=False
    )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train.drop(columns=['PLAYER_NAME']))
    X_test_scaled = scaler.transform(X_test.drop(columns=['PLAYER_NAME']))

    joblib.dump(scaler, 'lib/scaler.pkl')

    return X_train_scaled, X_test_scaled, y_train, y_test, X_test['PLAYER_NAME'].values


----------------------------

5. Model Training and Evaluation

In [17]:

def train_model(X_train, y_train):
    param_grid = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        
        'reg_lambda': [1, 5, 10],
        'reg_alpha': [0, 0.5, 1],
    }
    xgb_model = XGBRegressor(random_state=42)

    tscv = TimeSeriesSplit(n_splits=5)

    grid_search = GridSearchCV(
        estimator=xgb_model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=tscv,
        n_jobs=-1,
        verbose=1
    )
    grid_search.fit(X_train, y_train)
    print(f"Best parameters found: {grid_search.best_params_}")
    print(f"Best RMSE: {np.sqrt(-grid_search.best_score_):.2f}")
    return grid_search.best_estimator_

def train_and_evaluate_models(X_train, y_train, X_test, y_test):
    models = {
        'XGBoost': XGBRegressor(random_state=42),
        'LightGBM': LGBMRegressor(random_state=42),
        'CatBoost': CatBoostRegressor(random_state=42, verbose=0)
    }

    best_model = None
    best_rmse = float('inf')

    for name, model in models.items():
        print(f"\nTraining {name}...")
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        mse = mean_squared_error(y_test, predictions)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)
        print(f"{name} Performance:")
        print(f"  RMSE: {rmse:.2f}")
        print(f"  MAE: {mae:.2f}")
        print(f"  R2 Score: {r2:.2f}")

        if rmse < best_rmse:
            best_rmse = rmse
            best_model = model

    print(f"\nBest model: {type(best_model).__name__} with RMSE: {best_rmse:.2f}")
    return best_model

# Function to evaluate the model
def evaluate_model(model, X_test, y_test, player_names):
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    print(f"\nEvaluation on Test Data:")
    print(f"  RMSE: {rmse:.2f}")
    print(f"  MAE: {mae:.2f}")
    print(f"  R2 Score: {r2:.2f}")

    eval_df = pd.DataFrame({
        'PLAYER_NAME': player_names,
        'Actual_PTS': y_test,
        'Predicted_PTS': predictions
    })
    print("\nSample Predictions:")
    print(eval_df.head(10))

    return predictions


--------------------------------

7. Model Prediction Functions

In [18]:
# Function to get team defensive stats
def get_team_defensive_stats(team_id, season='2024-25'):
    try:
        team_stats = leaguedashteamstats.LeagueDashTeamStats(
            team_id_nullable=team_id,
            season=season,
            measure_type_detailed_defense='Defense',
            per_mode_detailed='PerGame',
            timeout=60
        ).get_data_frames()[0]
        return team_stats[['TEAM_ID', 'DEF_RATING', 'OPP_PTS_OFF_TOV', 'OPP_PTS_2ND_CHANCE']].iloc[0]
    except Exception as e:
        print(f"Error fetching stats for team {team_id}: {e}")
        return None

# Function to get the next game information
def get_next_game_info(player_team_id):
    next_game_date = datetime.now() + timedelta(days=1)
    max_days_ahead = 14

    for _ in range(max_days_ahead):
        game_date_str = next_game_date.strftime('%Y-%m-%d')
        try:
            scoreboard = scoreboardv2.ScoreboardV2(game_date=game_date_str)
            games = scoreboard.game_header.get_data_frame()
            team_games = games[(games['HOME_TEAM_ID'] == player_team_id) | (games['VISITOR_TEAM_ID'] == player_team_id)]
            if not team_games.empty:
                next_game = team_games.iloc[0]
                opponent_team_id = next_game['VISITOR_TEAM_ID'] if next_game['HOME_TEAM_ID'] == player_team_id else next_game['HOME_TEAM_ID']
                home_game = 1 if next_game['HOME_TEAM_ID'] == player_team_id else 0
                return next_game_date, opponent_team_id, home_game
        except Exception as e:
            print(f"Error fetching games for {game_date_str}: {e}")

        next_game_date += timedelta(days=1)

    print("No upcoming games found in the next 14 days.")
    return None, None, None

def prepare_features_for_prediction(player_id, player_name, season='2024-25'):
    player_gamelog = get_player_game_logs(player_id, season)
    player_gamelog['GAME_DATE'] = pd.to_datetime(player_gamelog['GAME_DATE'])
    player_gamelog = player_gamelog.sort_values('GAME_DATE', ascending=False)
    player_gamelog['PLAYER_NAME'] = player_name

    player_team_id = get_player_team_id(player_id)
    next_game_date, opponent_team_id, home_game = get_next_game_info(player_team_id)
    opponent_stats = get_team_defensive_stats(opponent_team_id, season)

    advanced_stats = get_player_advanced_stats(player_id, season)
    team_abbrev_to_id = get_team_abbreviation_id_mapping()

    latest_game_date = player_gamelog['GAME_DATE'].iloc[0]
    player_data = player_gamelog[player_gamelog['GAME_DATE'] <= latest_game_date]
    advanced_stats = advanced_stats[advanced_stats['GAME_ID'].isin(player_data['GAME_ID'])]

    player_data = feature_engineering(player_data, advanced_stats, None, team_abbrev_to_id)

    latest_data = player_data.iloc[-1].copy()
    print(latest_data)

    latest_data['REST_DAYS'] = (next_game_date - pd.to_datetime(latest_data['GAME_DATE'])).days
    latest_data['GAME_DATE'] = next_game_date

    latest_data['HOME_GAME'] = home_game

    latest_data['OPPONENT_TEAM_ID'] = opponent_team_id
    latest_data['OPPONENT_ABBREVIATION'] = get_team_name(opponent_team_id)

    latest_data['DEF_RATING'] = opponent_stats['DEF_RATING']
    latest_data['OPP_PTS_OFF_TOV'] = opponent_stats['OPP_PTS_OFF_TOV']
    latest_data['OPP_PTS_2ND_CHANCE'] = opponent_stats['OPP_PTS_2ND_CHANCE']

    features = latest_data[[
        'PIE_AVG_LAST_5', 'USG_PCT_AVG_LAST_5', 'EFF_AVG_LAST_5',
        'TS_PCT_AVG_LAST_5', 'DEF_RATING', 'OPP_PTS_OFF_TOV',
        'OPP_PTS_2ND_CHANCE', 'HOME_GAME', 'REST_DAYS', 'PTS_AVG_LAST_5',
        'REB_AVG_LAST_5', 'AST_AVG_LAST_5', 'FG_PCT_AVG_LAST_5',
        'MIN_AVG_LAST_5', 
        'OFF_RATING_AVG_LAST_5',
        'PACE_PER40_AVG_LAST_5', 'PTS_SEASON_AVG'
    ]]

    return features.values.reshape(1, -1), latest_data



In [19]:
# Function to predict upcoming points
def predict_upcoming_points(player_name, season='2024-25'):
    player_id = get_player_id(player_name)
    feature_vector, latest_data = prepare_features_for_prediction(player_id, player_name, season)
    try:
        scaler = joblib.load('lib/scaler.pkl')
        model = joblib.load('lib/player_points_model.pkl')
    except Exception as e:
        print(f"Error loading model or scaler: {e}")
        return

    feature_vector_scaled = scaler.transform(feature_vector)
    predicted_points = model.predict(feature_vector_scaled)

    game_date = latest_data['GAME_DATE'].strftime('%Y-%m-%d')
    opponent_team_id = latest_data['OPPONENT_TEAM_ID']
    opponent_team_name = get_team_name(opponent_team_id)

    print(f"Predicted points for {player_name} in the upcoming game on {game_date} against {opponent_team_name}: {predicted_points[0]:.2f}")

    return predicted_points[0]


-------

8. Feature Importance

-----

In [None]:
# Define the list of player names
player_names = [
    "LeBron James",
    "Kevin Durant",
    "Stephen Curry",
    "Giannis Antetokounmpo",
    "Luka Dončić",
    "Joel Embiid",
    "Jayson Tatum",
    "Nikola Jokić",
    "Shai Gilgeous-Alexander",
    "Karl-Anthony Towns",
    "Victor Wembanyama",
    "Damian Lillard",
    "Donovan Mitchell",
    "Anthony Davis",
    "Domantas Sabonis",
    "James Harden",
    "Kyrie Irving", "Anthony Edwards", "Jimmy Butler",
]

all_player_data = pd.DataFrame()
season = '2024-25'
opponent_stats = get_opponent_stats(season)
team_abbrev_to_id = get_team_abbreviation_id_mapping()

for player_name in player_names:
    player_id = get_player_id(player_name)
    if not player_id:
        continue

    player_gamelog = get_player_game_logs(player_id, season)
    advanced_stats = get_player_advanced_stats(player_id, season)
    player_gamelog['PLAYER_NAME'] = player_name

    player_data = feature_engineering(player_gamelog, advanced_stats, opponent_stats, team_abbrev_to_id)
    all_player_data = pd.concat([all_player_data, player_data], ignore_index=True)

X_train_scaled, X_test_scaled, y_train, y_test, player_names_test = prepare_data(all_player_data)

best_model = train_and_evaluate_models(X_train_scaled, y_train, X_test_scaled, y_test)
joblib.dump(best_model, 'lib/player_points_model.pkl')
evaluate_model(best_model, X_test_scaled, y_test, player_names_test)

In [None]:
for player_name in player_names:
    print("\n---")
    predict_upcoming_points(player_name, season)



---


KeyError: 'TEAM_ID'