In [17]:
import pandas as pd
import numpy as np
from nba_api.stats.endpoints import leaguedashplayerstats, leaguegamelog, boxscoretraditionalv2
from nba_api.stats.static import teams, players
from time import sleep
API_CALL_DELAY = 0.6

### Step 1: Data Collection with Player Impact Estimate (PIE) Calculation




The **Player Impact Estimate (PIE)** is an advanced metric to measure a player's overall statistical contribution against the total statistics in the games they play. It provides a holistic view of a player's impact, considering both positive and negative actions. The NBA's official definition of PIE is:

$$
\small
\text{PIE} = \frac{\text{Player's Stats} + \text{Team's Stats} - \text{OPPONENT's Stats}}{\text{TEAM's Total Stats} + \text{OPPONENT's Total Stats}}
$$




**1.1 Fetch Player Statistics & Game Logs**

Get basic statistics for all players in the season. 
We need all the games to compute team and opponent stats for PIE calculation.
Fetch detailed game logs for each player.

In [18]:
# Fetch player statistics and game logs for given season
def fetch_player_data(season='2024-25', season_type='Regular Season'):
    print("Fetching player statistics and game logs...")

    # Fetch player statistics
    player_stats = leaguedashplayerstats.LeagueDashPlayerStats(season=season, season_type_all_star=season_type, per_mode_detailed='PerGame')
    df_player_stats = player_stats.get_data_frames()[0]

    # Fetch player game logs
    player_game_logs = leaguegamelog.LeagueGameLog(season=season, season_type_all_star=season_type, player_or_team_abbreviation='P')
    df_player_game_logs = player_game_logs.get_data_frames()[0]

    return df_player_stats, df_player_game_logs




**1.2 Fetch Team Game Stats**

Fetch team statistics for each game to get the totals required for PIE.

In [19]:
def fetch_team_stats(season='2024-25', season_type='Regular Season'):
    print("Fetching team statistics...")
    
    # Get game IDs from team game logs
    team_game_logs = leaguegamelog.LeagueGameLog(season=season, season_type_all_star=season_type, player_or_team_abbreviation='T')
    df_team_game_logs = team_game_logs.get_data_frames()[0]
    game_ids = df_team_game_logs['GAME_ID'].unique()

    df_team_stats = pd.DataFrame()
    for game_id in game_ids:
        sleep(API_CALL_DELAY)
        try:
            boxscore = boxscoretraditionalv2.BoxScoreTraditionalV2(game_id=game_id)
            team_stats = boxscore.get_data_frames()[1]  # Team stats are in 2nd DataFrame
            df_team_stats = pd.concat([df_team_stats, team_stats], ignore_index=True)
        except Exception as e:
            print(f"Error fetching team stats for game {game_id}: {e}")
            continue

    return df_team_stats

In [20]:
df_player_stats, df_player_game_logs = fetch_player_data()
df_team_stats = fetch_team_stats()

Fetching player statistics and game logs...
Fetching team statistics...


Prepare data and calculate PIE for each player's game.

In [21]:
def prepare_and_calculate_pie(df_player_game_logs, df_team_stats):
    print("Preparing data and calculating PIE...")
    
    # Merge player logs with team stats
    df_player_team = pd.merge(df_player_game_logs, df_team_stats, on=['GAME_ID', 'TEAM_ID'], suffixes=('', '_TEAM'))

    # Prepare opponent team stats
    df_team_stats_opponent = df_team_stats.copy()
    df_team_stats_opponent.rename(columns={'TEAM_ID': 'OPPONENT_TEAM_ID'}, inplace=True)
    
    # Merge to get opponent stats
    df_player_full = pd.merge(df_player_team, df_team_stats_opponent, on='GAME_ID', suffixes=('', '_OPPONENT'))

    # Filter out own team stats
    df_player_full = df_player_full[df_player_full['TEAM_ID'] != df_player_full['OPPONENT_TEAM_ID']]

    # Calculate PIE components
    df_player_full['PIE_Numerator'] = (
        df_player_full['PTS'] + df_player_full['FGM'] - 
        df_player_full['FGA'] + df_player_full['FTM'] -
        df_player_full['FTA'] + df_player_full['OREB'] + 
        df_player_full['DREB'] + df_player_full['AST'] +
        df_player_full['STL'] + df_player_full['BLK'] - 
        df_player_full['PF'] - df_player_full['TOV']
    )
    
    # Calculate GAME Total Stats Denominator
    df_player_full['GAME_Total'] = (
        df_player_full['PTS_TEAM'] + df_player_full['PTS_OPPONENT'] +
        df_player_full['FGM_TEAM'] + df_player_full['FGM_OPPONENT'] -
        df_player_full['FGA_TEAM'] - df_player_full['FGA_OPPONENT'] +
        df_player_full['FTM_TEAM'] + df_player_full['FTM_OPPONENT'] -
        df_player_full['FTA_TEAM'] - df_player_full['FTA_OPPONENT'] +
        df_player_full['OREB_TEAM'] + df_player_full['OREB_OPPONENT'] +
        df_player_full['DREB_TEAM'] + df_player_full['DREB_OPPONENT'] +
        df_player_full['AST_TEAM'] + df_player_full['AST_OPPONENT'] +
        df_player_full['STL_TEAM'] + df_player_full['STL_OPPONENT'] +
        df_player_full['BLK_TEAM'] + df_player_full['BLK_OPPONENT'] -
        df_player_full['PF_TEAM'] - df_player_full['PF_OPPONENT'] -
        df_player_full['TO'] - df_player_full['TO_OPPONENT']
    )

    # Avoid division by zero
    df_player_full['PIE'] = df_player_full['PIE_Numerator'] / df_player_full['GAME_Total'].replace(0, np.nan)
    df_player_full['PIE'] = df_player_full['PIE'] * 100
    return df_player_full

In [22]:
# Prepare data and calculate PIE
df_pie_data = prepare_and_calculate_pie(df_player_game_logs, df_team_stats)
df_pie_data = df_pie_data.drop(['VIDEO_AVAILABLE', 'TEAM_NAME_TEAM', 'TEAM_ABBREVIATION_TEAM'], axis=1)

Preparing data and calculating PIE...


----


### Step 2: Data Preprocessing

Handle any remaining missing values.
Normalize or standardize the data as needed.
Perform feature engineering to create additional useful features.



Now that we've gathered the necessary data and calculated the Player Impact Estimate (PIE) for each player in Step 1, we proceed to Step 2: Data Preprocessing. This step involves cleaning the data, performing feature engineering, and normalizing the data to prepare it for model development.

#### 2.1 Data Cleaning


Convert Data Types if Necessary:

Ensure that numerical columns are of type float or int.
Convert date columns to datetime type

In [23]:
# Check data types
print("Data types of df_pie_data columns:")
print(df_pie_data.dtypes)

Data types of df_pie_data columns:
SEASON_ID               object
PLAYER_ID                int64
PLAYER_NAME             object
TEAM_ID                  int64
TEAM_ABBREVIATION       object
                        ...   
PTS_OPPONENT             int64
PLUS_MINUS_OPPONENT    float64
PIE_Numerator            int64
GAME_Total               int64
PIE                    float64
Length: 79, dtype: object


In [24]:
# Convert 'GAME_DATE' to datetime
df_pie_data['GAME_DATE'] = pd.to_datetime(df_pie_data['GAME_DATE'])

# Convert multiple columns to categorical
categorical_columns = ['PLAYER_ID', 'TEAM_ID', 'SEASON_ID', 'OPPONENT_TEAM_ID']
for col in categorical_columns:
    df_pie_data[col] = df_pie_data[col].astype('object') # .astype('category')



Impute Missing Values:

For numerical columns, we can fill missing values with the mean or median.
For categorical columns, we can fill missing values with the mode.


In [25]:
# Check for missing values in df_pie_data
missing_values = df_pie_data.isnull().sum()
print("Missing values in each column:")
print(missing_values[missing_values > 0])


Missing values in each column:
FG_PCT      405
FG3_PCT    1396
FT_PCT     3013
dtype: int64


In [26]:
# fill missing values in numerical columns
numerical_cols = df_pie_data.select_dtypes(include=['float64', 'int64']).columns
df_pie_data[numerical_cols] = df_pie_data[numerical_cols].fillna(0)

# For categorical columns, fill missing values with 'Unknown'
# categorical_cols = df_pie_data.select_dtypes(include=['object']).columns
# df_pie_data[categorical_cols] = df_pie_data[categorical_cols].fillna('Unknown')

2.1.3 Remove Duplicates
Check for and remove any duplicate records.

In [27]:
# Check for duplicates
duplicates = df_pie_data.duplicated()
print(f"Number of duplicate rows: {duplicates.sum()}")
# df_pie_data = df_pie_data.drop_duplicates()

Number of duplicate rows: 0


-----


## 2.2 Feature Engineering


### 2.2.1 Aggregate Player Statistics

Aggregate PIE and other statistics for each player over the season.

In [28]:
def aggregate_player_pie(df_pie_data):
    print("Aggregating PIE and other statistics...")
    
    df_pie_data['HOME'] = df_pie_data[['MATCHUP']].apply(lambda x: x.str.contains('vs.'))
    df_pie_data['W/L'] = df_pie_data[['WL']].apply(lambda x: (x == 'W')) # Encode 'WL' as Binary (Win = 1, Loss = 0)
    
    df_player_pie = df_pie_data.groupby(['PLAYER_ID', 'PLAYER_NAME']).agg({
        'PIE': 'mean', 'MIN': 'sum', 'PTS': 'sum', 
        'REB': 'sum', 'OREB': 'sum', 'DREB': 'sum',
        'AST': 'sum', 'STL': 'sum', 'BLK': 'sum', 
        'TOV': 'sum', 'PF': 'mean', 'GAME_ID': 'nunique', 
        'FGA': 'sum', 'FGM': 'sum', 'FTA': 'sum', 
        'FTM': 'sum', 'FG3A': 'sum', 'FG3M': 'sum',
        'PLUS_MINUS': 'sum', 'FANTASY_PTS': 'sum', 'HOME': 'mean', 'W/L': 'mean',
}).reset_index()

    df_player_pie.rename(columns={'GAME_ID': 'GAMES_PLAYED'}, inplace=True)    
    return df_player_pie.sort_values(by='PIE', ascending=False, ignore_index=True)

In [29]:
df_player_pie_aggregate = aggregate_player_pie(df_pie_data)
# df_player_pie_aggregate.to_csv('NBA_Player_PIE_2025.csv', index=False)

df_player_pie_aggregate.head()

Aggregating PIE and other statistics...


Unnamed: 0,PLAYER_ID,PLAYER_NAME,PIE,MIN,PTS,REB,OREB,DREB,AST,STL,...,FGA,FGM,FTA,FTM,FG3A,FG3M,PLUS_MINUS,FANTASY_PTS,HOME,W/L
0,203999,Nikola Jokić,16.624153,563,444,198,62,136,160,22,...,297,167,94,77,65,33,134,970.6,0.6,0.6
1,203507,Giannis Antetokounmpo,15.652736,597,560,203,36,167,113,9,...,368,224,178,109,14,3,8,1016.1,0.647059,0.529412
2,203076,Anthony Davis,14.070217,709,556,230,51,179,66,25,...,367,199,182,143,42,15,-28,1084.0,0.5,0.6
3,1628983,Shai Gilgeous-Alexander,13.485033,691,599,109,19,90,130,34,...,413,208,165,141,122,42,209,1033.8,0.5,0.75
4,1627734,Domantas Sabonis,12.899234,698,393,245,62,183,122,15,...,239,148,92,76,49,21,67,874.0,0.473684,0.421053


In [30]:
# Display top players by PIE
df_top_players_pie = df_player_pie_aggregate.sort_values(by='PIE', ascending=False)
print("\nTop 10 Players by PIE:")
print(df_top_players_pie[['PLAYER_NAME', 'PIE', 'GAMES_PLAYED']].head(30))


Top 10 Players by PIE:
                PLAYER_NAME        PIE  GAMES_PLAYED
0              Nikola Jokić  16.624153            15
1     Giannis Antetokounmpo  15.652736            17
2             Anthony Davis  14.070217            20
3   Shai Gilgeous-Alexander  13.485033            20
4          Domantas Sabonis  12.899234            19
5            Paolo Banchero  12.594644             5
6        Karl-Anthony Towns  12.567302            19
7         Victor Wembanyama  12.100549            17
8               Luka Dončić  11.924719            15
9              Jayson Tatum  11.694389            21
10       Isaiah Hartenstein  11.511774             5
11             Franz Wagner  11.441629            22
12           Alperen Sengun  11.318769            21
13           Damian Lillard  11.035651            16
14             James Harden  11.023407            22
15             Kevin Durant  10.859053            12
16             LeBron James  10.830437            21
17              LaMelo

#### 2.2.2 Create Per-Game and Per-Minute Statistics




Created per-game and per-minute statistics.
Calculated advanced efficiency metrics like TS%, eFG%, and AST/TOV Ratio.

In [31]:
# Per-game averages
df_player_pie_aggregate['PTS_per_Game'] = df_player_pie_aggregate['PTS'] / df_player_pie_aggregate['GAMES_PLAYED']
df_player_pie_aggregate['REB_per_Game'] = df_player_pie_aggregate['REB'] / df_player_pie_aggregate['GAMES_PLAYED']
df_player_pie_aggregate['AST_per_Game'] = df_player_pie_aggregate['AST'] / df_player_pie_aggregate['GAMES_PLAYED']
df_player_pie_aggregate['STL_per_Game'] = df_player_pie_aggregate['STL'] / df_player_pie_aggregate['GAMES_PLAYED']
df_player_pie_aggregate['BLK_per_Game'] = df_player_pie_aggregate['BLK'] / df_player_pie_aggregate['GAMES_PLAYED']
df_player_pie_aggregate['TOV_per_Game'] = df_player_pie_aggregate['TOV'] / df_player_pie_aggregate['GAMES_PLAYED']
df_player_pie_aggregate['PF_per_Game'] = df_player_pie_aggregate['PF'] /  df_player_pie_aggregate['GAMES_PLAYED']
# Per-minute averages

##### 2.2.3 Calculate Advanced Metrics



2.2.3 Calculate Efficiency Metrics
Create additional efficiency metrics such as True Shooting Percentage (TS%) and Assist-to-Turnover Ratio.

True Shooting Percentage (TS%)

In [32]:
# TS% = Points / (2 * (Field Goal Attempts + 0.44 * Free Throw Attempts))
df_player_pie_aggregate['TS%'] = df_player_pie_aggregate['PTS'] / (
    2 * (df_player_pie_aggregate['FGA'] + 0.44 * df_player_pie_aggregate['FTA'])
)

df_player_pie_aggregate['TS%'] = df_player_pie_aggregate['TS%'].replace([np.inf, -np.inf], np.nan).fillna(0)

Effective Field Goal Percentage (eFG%):



In [33]:
# eFG% = (FGM + 0.5 * FG3M) / FGA
df_player_pie_aggregate['eFG%'] = (df_player_pie_aggregate['FGM'] + 0.5 * df_player_pie_aggregate['FG3M']) / df_player_pie_aggregate['FGA']
df_player_pie_aggregate['eFG%'] = df_player_pie_aggregate['eFG%'].replace([np.inf, -np.inf], np.nan).fillna(0)

Assist-to-Turnover Ratio

In [34]:
df_player_pie_aggregate['AST_TOV_Ratio'] = df_player_pie_aggregate['AST'] / df_player_pie_aggregate['TOV'].replace(0, np.nan)
df_player_pie_aggregate['AST_TOV_Ratio'] = df_player_pie_aggregate['AST_TOV_Ratio'].fillna(0)

------


### 2.3.1. Incorporate Team and Opponent Statistics
 


#### Aggregate Opponent-Specific Statistics

1. Merge Team and Opponent Statistics:

Team Stats Aggregation: We calculate the mean of each team-related statistic for each player across all games they've played. This provides an average performance context for the player's team.

Opponent Stats Aggregation: Similarly, we calculate the mean of each opponent-related statistic faced by the player, giving insight into the quality of opponents.

Merging: We merge these aggregated stats back into df_player_aggregate using PLAYER_ID as the key. This enriches the player data with team and opponent performance metrics.



In [35]:
# Assuming df_pie_data contains team and opponent statistics per game

# 1. Aggregate Team Statistics
team_stats_cols = [
    'FG_PCT_TEAM', 'FG3_PCT_TEAM', 'FT_PCT_TEAM',
    'REB_TEAM', 'AST_TEAM', 'TOV_TEAM', 'FGA_TEAM', 'FGM_TEAM',
    'FTA_TEAM', 'FTM_TEAM', 'FG3A_TEAM', 'FG3M_TEAM',
    'PLUS_MINUS_TEAM'
]

opponent_stats_cols = [
    'FG_PCT_OPPONENT', 'FG3_PCT_OPPONENT', 'FT_PCT_OPPONENT',
    'REB_OPPONENT', 'AST_OPPONENT', 'TOV_OPPONENT', 'FGA_OPPONENT', 'FGM_OPPONENT',
    'FTA_OPPONENT', 'FTM_OPPONENT', 'FG3A_OPPONENT', 'FG3M_OPPONENT',
    'PLUS_MINUS_OPPONENT'
]

# Aggregate Team Stats per Player Aggregate
df_team_aggregate = df_pie_data.groupby(['PLAYER_ID', 'PLAYER_NAME']).agg({
    'FG_PCT_TEAM': 'mean',
    'FG3_PCT_TEAM': 'mean',
    'FT_PCT_TEAM': 'mean',
    'REB_TEAM': 'mean',
    'AST_TEAM': 'mean',
    'TO': 'mean',
    'FGA_TEAM': 'mean',
    'FGM_TEAM': 'mean',
    'FTA_TEAM': 'mean',
    'FTM_TEAM': 'mean',
    'FG3A_TEAM': 'mean',
    'FG3M_TEAM': 'mean',
    'PLUS_MINUS_TEAM': 'mean',
    
}).reset_index()

df_opponent_aggregate = df_pie_data.groupby(['PLAYER_ID', 'PLAYER_NAME']).agg({
    'FG_PCT_OPPONENT': 'mean',
    'FG3_PCT_OPPONENT': 'mean',
    'FT_PCT_OPPONENT': 'mean',
    'REB_OPPONENT': 'mean',
    'AST_OPPONENT': 'mean',
    'TO_OPPONENT': 'mean',
    'FGA_OPPONENT': 'mean',
    'FGM_OPPONENT': 'mean',
    'FTA_OPPONENT': 'mean',
    'FTM_OPPONENT': 'mean',
    'FG3A_OPPONENT': 'mean',
    'FG3M_OPPONENT': 'mean',
    'PLUS_MINUS_OPPONENT': 'mean'
}).reset_index()


#### Merge All Statistics:



In [36]:
# 2. Merge Aggregated Team Stats with Player Aggregated Data
# Merge player, team, and opponent stats

df_full_player_stats = pd.merge(
    df_player_pie_aggregate, df_team_aggregate.drop(columns=['PLAYER_NAME']), 
    on='PLAYER_ID', how='left')

df_full_player_stats = pd.merge(
    df_full_player_stats,
    df_opponent_aggregate.drop(columns=['PLAYER_NAME']),
    on='PLAYER_ID',
    how='left'
)

df_full_player_stats

Unnamed: 0,PLAYER_ID,PLAYER_NAME,PIE,MIN,PTS,REB,OREB,DREB,AST,STL,...,REB_OPPONENT,AST_OPPONENT,TO_OPPONENT,FGA_OPPONENT,FGM_OPPONENT,FTA_OPPONENT,FTM_OPPONENT,FG3A_OPPONENT,FG3M_OPPONENT,PLUS_MINUS_OPPONENT
0,203999,Nikola Jokić,16.624153,563,444,198,62,136,160,22,...,42.400000,30.400000,13.733333,92.400000,44.200000,20.333333,15.800000,38.400000,14.400000,-2.600000
1,203507,Giannis Antetokounmpo,15.652736,597,560,203,36,167,113,9,...,46.058824,25.411765,12.588235,90.823529,40.588235,20.470588,17.294118,39.294118,14.294118,-0.941176
2,203076,Anthony Davis,14.070217,709,556,230,51,179,66,25,...,43.250000,28.400000,13.200000,89.000000,42.750000,19.050000,15.050000,37.400000,13.500000,1.800000
3,1628983,Shai Gilgeous-Alexander,13.485033,691,599,109,19,90,130,34,...,49.200000,24.350000,18.000000,85.750000,36.150000,25.850000,19.000000,37.600000,12.700000,-10.000000
4,1627734,Domantas Sabonis,12.899234,698,393,245,62,183,122,15,...,41.684211,26.315789,14.105263,88.421053,40.894737,21.000000,17.000000,40.526316,15.315789,-0.105263
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486,1631230,Dominick Barlow,-0.444444,10,1,0,0,0,0,0,...,46.500000,24.500000,15.000000,94.500000,43.500000,16.500000,13.000000,46.500000,17.000000,7.500000
487,1630556,Kessler Edwards,-0.581964,9,0,0,0,0,0,0,...,47.750000,23.750000,14.500000,89.750000,36.750000,22.750000,17.250000,38.750000,11.500000,-9.750000
488,1642450,Daniss Jenkins,-0.649709,5,0,0,0,0,0,0,...,42.000000,29.500000,10.500000,91.500000,47.000000,16.500000,11.000000,36.000000,15.500000,21.000000
489,1631306,Cole Swider,-0.668924,13,0,2,0,2,1,0,...,43.000000,28.000000,14.500000,90.000000,45.500000,19.000000,15.500000,42.500000,14.500000,15.500000



2.2.2 Create Per-Game and Per-Minute Statistics

Create per-game or per-minute statistics to normalize player performance.

Home Feature: Calculates the proportion of games played at home. A higher value indicates more home games, which can influence player performance.

Performance Ratios: Ratios like FG_PCT_Ratio compare the player's team's performance against opponents. These can highlight how well the team performs relative to the competition.

Win/Loss Encoding: The proportion of games won (WL) provides context on team success, which can correlate with player performance.


In [37]:

# Create Team vs. Opponent Performance Ratios
df_full_player_stats['FG_PCT_Ratio'] = df_full_player_stats['FG_PCT_TEAM'] / df_full_player_stats['FG_PCT_OPPONENT']
df_full_player_stats['FG3_PCT_Ratio'] = df_full_player_stats['FG3_PCT_TEAM'] / df_full_player_stats['FG3_PCT_OPPONENT']
df_full_player_stats['FT_PCT_Ratio'] = df_full_player_stats['FT_PCT_TEAM'] / df_full_player_stats['FT_PCT_OPPONENT']
df_full_player_stats['REB_Ratio'] = df_full_player_stats['REB_TEAM'] / df_full_player_stats['REB_OPPONENT']
df_full_player_stats['AST_Ratio'] = df_full_player_stats['AST_TEAM'] / df_full_player_stats['AST_OPPONENT']
df_full_player_stats['TOV_Ratio'] = df_full_player_stats['TOV'] / df_full_player_stats['TO_OPPONENT']
df_full_player_stats['PLUS_MINUS_Ratio'] = df_full_player_stats['PLUS_MINUS_TEAM'] / df_full_player_stats['PLUS_MINUS_OPPONENT']

In [38]:
# Replace infinities and NaNs resulting from division by zero
df_full_player_stats.replace([np.inf, -np.inf], np.nan, inplace=True)
df_full_player_stats.fillna(0, inplace=True)

In [39]:
import lib.collapsible_display as stats_display

stats_display.display_collapsible_data_dict()

Variable,Description
PLAYER_ID,Unique Identifier for the Player
PLAYER_NAME,Name of the Player

Variable,Description
PIE,Player Impact Estimate

Variable,Description
MIN,Minutes Played
PTS,Points Scored
REB,Total Rebounds
OREB,Offensive Rebounds
DREB,Defensive Rebounds
AST,Assists
STL,Steals
BLK,Blocks
TOV,Turnovers
PF,Personal Fouls

Variable,Description
GAMES_PLAYED,Games Played
PTS_per_Game,Points per Game
REB_per_Game,Rebounds per Game
AST_per_Game,Assists per Game
STL_per_Game,Steals per Game
BLK_per_Game,Blocks per Game
TOV_per_Game,Turnovers per Game
PF_per_Game,Personal Fouls per Game

Variable,Description
TS%,True Shooting Percentage
eFG%,Effective Field Goal Percentage
AST_TOV_Ratio,Assist-to-Turnover Ratio

Variable,Description
HOME,Home/Away Indicator
W/L,Win/Loss Indicator

Variable,Description
FG_PCT_TEAM,Team Field Goal Percentage
FG3_PCT_TEAM,Team 3-Point Percentage
FT_PCT_TEAM,Team Free Throw Percentage
REB_TEAM,Total Team Rebounds
AST_TEAM,Total Team Assists
TO,Team Turnovers
FGA_TEAM,Team Field Goals Attempted
FGM_TEAM,Team Field Goals Made
FTA_TEAM,Team Free Throws Attempted
FTM_TEAM,Team Free Throws Made

Variable,Description
FG_PCT_OPPONENT,Opponent Field Goal Percentage
FG3_PCT_OPPONENT,Opponent 3-Point Percentage
FT_PCT_OPPONENT,Opponent Free Throw Percentage
REB_OPPONENT,Opponent Total Rebounds
AST_OPPONENT,Opponent Total Assists
TO_OPPONENT,Opponent Turnovers
FGA_OPPONENT,Opponent Field Goals Attempted
FGM_OPPONENT,Opponent Field Goals Made
FTA_OPPONENT,Opponent Free Throws Attempted
FTM_OPPONENT,Opponent Free Throws Made

Variable,Description
FG_PCT_Ratio,Field Goal Percentage Ratio (Team/Opponent)
FG3_PCT_Ratio,3-Point Percentage Ratio (Team/Opponent)
FT_PCT_Ratio,Free Throw Percentage Ratio (Team/Opponent)
REB_Ratio,Rebound Ratio (Team/Opponent)
AST_Ratio,Assist Ratio (Team/Opponent)
TOV_Ratio,Turnover Ratio (Team/Opponent)
PLUS_MINUS_Ratio,Plus/Minus Ratio (Team/Opponent)


----

## Feature Selection

I'm assessing VIF to tackle multicollinearity, implementing RFE with Random Forest, and analyzing feature importance from models. Regularization methods like Lasso are used to refine feature selection.

The goal is to select features that contribute the most to predicting your target variable (e.g., PIE - Player Impact Estimate) while avoiding redundancy and multicollinearity.

Exclude Identifiers and Non-Predictive Columns


In [40]:
# Exclude identifiers and target variable
df_features = df_full_player_stats.drop(columns=['PLAYER_ID', 'PLAYER_NAME', 'PIE'])


### 3.1 Correlation Analysis

In [41]:
# Compute correlation matrix
corr_matrix = df_features.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Identify features with correlation greater than 0.9
high_corr_features = [column for column in upper.columns if any(upper[column] > 0.9)]
print("Highly correlated features to consider dropping:")
print(high_corr_features)

# Find features with perfect correlation
perfect_corr_features = [column for column in upper.columns if any(upper[column] == 1)]
print("Features with perfect correlation:", perfect_corr_features)

Highly correlated features to consider dropping:
['PTS', 'DREB', 'FGA', 'FGM', 'FTA', 'FTM', 'FG3M', 'FANTASY_PTS', 'PTS_per_Game', 'AST_per_Game', 'BLK_per_Game', 'TOV_per_Game', 'eFG%', 'FTM_TEAM', 'PLUS_MINUS_TEAM', 'FTM_OPPONENT', 'PLUS_MINUS_OPPONENT', 'TOV_Ratio']
Features with perfect correlation: ['PLUS_MINUS_OPPONENT']


In [42]:
# Remove perfectly correlated features
features_to_drop = [
    'PTS', 'REB', 'OREB', 'DREB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
    'FGA', 'FGM', 'FTA', 'FTM', 'FG3A', 'FG3M', 
    "FANTASY_PTS", "PLUS_MINUS", "PLUS_MINUS_TEAM", # CHECK BACK WITH THIS LINE
    'FGA_TEAM', 'FGM_TEAM', 'FTA_TEAM', 'FTM_TEAM', 'FG3A_TEAM', 'FG3M_TEAM',
    'FGA_OPPONENT', 'FGM_OPPONENT', 'FTA_OPPONENT', 'FTM_OPPONENT', 'FG3A_OPPONENT', 'FG3M_OPPONENT'
]
df_features_copy = df_features.drop(columns=perfect_corr_features + features_to_drop)

###  3.2 Variance Inflation Factor (VIF)

> Handling Multicollinearity

Set a VIF Threshold: Typically, features with VIF > 5 or 10 are considered to exhibit multicollinearity and may be removed.


In [43]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Prepare the DataFrame for VIF calculation
X = df_features_copy.assign(constant=1)  # Add intercept term

# Calculate VIF for each feature
vif_data = pd.DataFrame()
vif_data['Feature'] = X.columns
vif_data['VIF'] = [
    variance_inflation_factor(X.values, i) for i in range(X.shape[1])
]

# Remove the intercept term from the results
vif_data = vif_data[vif_data['Feature'] != 'constant']

print("VIF values after cleaning:")
print(vif_data.head())

# Set a VIF threshold
vif_threshold = 10

# Identify features to keep
features_to_keep = vif_data[vif_data['VIF'] < vif_threshold]['Feature'].tolist()

print("Features selected after VIF thresholding:")
print(features_to_keep)

VIF values after cleaning:
        Feature        VIF
0           MIN  13.853085
1  GAMES_PLAYED   6.873454
2          HOME   1.451860
3           W/L   7.210980
4  PTS_per_Game   6.173802
Features selected after VIF thresholding:
['GAMES_PLAYED', 'HOME', 'W/L', 'PTS_per_Game', 'REB_per_Game', 'AST_per_Game', 'STL_per_Game', 'BLK_per_Game', 'PF_per_Game', 'AST_TOV_Ratio', 'TO', 'TO_OPPONENT', 'PLUS_MINUS_Ratio']


In [44]:
features_to_keep = ['W/L', 'PTS_per_Game', 'REB_per_Game', 'AST_per_Game', 'STL_per_Game', 'BLK_per_Game', 'PF_per_Game',
                    'AST_TOV_Ratio', 'TO', 'TO_OPPONENT']

### 3.3 Recursive Feature Elimination with Cross-Validation (RFECV)


> Split Data: Divide your data into training and testing sets.



In [45]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Define features and target
X = df_features_copy
y = df_full_player_stats['PIE']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Use RFECV with a model like Random Forest to select features based on model performance.


In [46]:
# Initialize the model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Initialize RFECV
rfecv = RFECV(estimator=model, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1, min_features_to_select=6)

# Fit RFECV
rfecv.fit(X_train, y_train)

# Get selected features
selected_features_RFECV = X.columns[rfecv.support_]

print("Optimal number of features:", rfecv.n_features_)
print("Selected features using RFECV:")
print(selected_features_RFECV)

Optimal number of features: 6
Selected features using RFECV:
Index(['MIN', 'PTS_per_Game', 'REB_per_Game', 'AST_per_Game', 'TS%',
       'AST_TOV_Ratio'],
      dtype='object')


### 3.4 Regularization Techniques

**Lasso Regression for Feature Selection**: 

Apply Lasso Regression (L1 regularization) to perform feature selection.



In [47]:
from sklearn.linear_model import LassoCV

# Initialize LassoCV
lasso = LassoCV(cv=5, random_state=42, max_iter=10000)

# Fit LassoCV
lasso.fit(X_train, y_train)

# Get coefficients
coef = pd.Series(lasso.coef_, index=X.columns)

# Select features with non-zero coefficients
selected_features_lasso = coef[coef != 0].index.tolist()

print("Selected features using Lasso Regression:")
print(selected_features_lasso)

Selected features using Lasso Regression:
['MIN', 'PTS_per_Game', 'REB_per_Game']


### 3.5 Feature Importance from Models

Train models that provide feature importance metrics.
Select features with high importance scores.

In [48]:
import lightgbm as lgb

# Prepare data
train_data = lgb.Dataset(X_train, label=y_train)

# Set parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'verbosity': -1,
    'seed': 42
}

# Train model
lgb_model = lgb.train(params, train_data, num_boost_round=100)

# Feature importance
importance = lgb_model.feature_importance(importance_type='gain')
feature_names = X_train.columns
feature_importance = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
feature_importance.sort_values(by='Importance', ascending=False, inplace=True)

# Select top N features
top_n = 15  # Adjust as needed
selected_features_lgb = feature_importance['Feature'].head(top_n).tolist()

print("Selected features by LightGBM:")
print(selected_features_lgb)


Selected features by LightGBM:
['PTS_per_Game', 'REB_per_Game', 'AST_per_Game', 'TS%', 'AST_TOV_Ratio', 'BLK_per_Game', 'FG_PCT_Ratio', 'STL_per_Game', 'eFG%', 'PF_per_Game', 'TO', 'MIN', 'AST_OPPONENT', 'FG_PCT_OPPONENT', 'AST_TEAM']


In [49]:
final_selected_features = list(set(selected_features_RFECV) | set(
    selected_features_lasso) | set(selected_features_lgb) | set(features_to_keep))

print("Final selected features:")
print(final_selected_features)

df_features_selected = df_features_copy[final_selected_features]

Final selected features:
['W/L', 'TO_OPPONENT', 'PF_per_Game', 'FG_PCT_OPPONENT', 'eFG%', 'STL_per_Game', 'TS%', 'BLK_per_Game', 'AST_per_Game', 'MIN', 'FG_PCT_Ratio', 'REB_per_Game', 'PTS_per_Game', 'TO', 'AST_TEAM', 'AST_OPPONENT', 'AST_TOV_Ratio']


#### Multiple Feature Selection Techniques

To ensure robust results, it's important to apply various feature selection methods:

**1. Correlation Analysis**: Calculate the correlation of each feature with the target (PIE). Drop features with very low correlation.


**2. Variance Inflation Factor (VIF)**: Calculate VIF to detect multicollinearity. Remove features with high VIF values (e.g., $\small\mathrm{VIF} \gt 5$).


**3. Recursive Feature Elimination with Cross-Validation (RFECV)**: Use RFECV with a model like Random Forest to select features based on model performance.


**4. Regularization Techniques**: Apply Lasso Regression (L1 regularization) to perform feature selection.

**5. Feature Importance from Models**: Train models that provide feature importance metrics. Select features with high importance scores.


**6. Domain Knowledge**: Leverage your understanding of basketball to keep features that are theoretically important.


------

### **2.5 Normalization**

**2.3.1 Scaling Numerical Features:** Use StandardScaler to scale numerical features.

In [50]:
from sklearn.preprocessing import StandardScaler

# Select numerical features
# numeric_features = player_stats.select_dtypes(include=[np.number]).columns.tolist()

# Exclude target variable if defined
# numeric_features.remove('Target_Variable')  # Replace with your target variable


# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the features
scaled_features = scaler.fit_transform(df_full_player_stats[final_selected_features])

# Create a DataFrame with scaled features
df_scaled = pd.DataFrame(scaled_features, columns=final_selected_features)

# Combine scaled features with player identifiers
df_preprocessed = pd.concat([df_full_player_stats[['PLAYER_ID', 'PLAYER_NAME', 'PIE']], df_scaled], axis=1)

### **Summary of Optimized Step 2**

- **Data Cleaning**:
  - Handled missing values efficiently.
  - Converted data types appropriately.
  - Removed duplicates and outliers.
  
- **Feature Engineering**:
  - Aggregated player, team, and opponent statistics.
  - Created per-game, per-minute, and advanced metrics.
  - Encoded categorical variables and created performance ratios.

- **Feature Selection**:
  - Removed highly correlated features to reduce multicollinearity.

- **Normalization**:
  - Scaled numerical features using `StandardScaler`.

- **Data Saving**:
  - Saved the preprocessed data for use in modeling.


- **Modular Code**: Consider breaking down your code into functions for reusability and clarity.


### **Next Steps**

With the optimized data preprocessing complete, you're ready to proceed to **Step 3: Model Development**. You can now:

- Select appropriate machine learning algorithms.
- Train models using the preprocessed data.
- Evaluate model performance.
- Interpret results and identify key features influencing player performance.



-----

## Step 3: Model Development


Next, we focus on creating an advanced machine learning pipeline to:

> Objective 1: Predict Player Ratings and/or Projected Stats in Regression Model

1. Player Ratings: Composite score representing overall performance (e.g., PIE).
2. Projected Stats: Individual player statistics for upcoming games, such as PTS, REB, or AST.

> Objective 2: Identify B|est Players and Key Features

1. Identify Best Players: Use model predictions and feature importance to rank players.
2. Determine Feature Importance: Analyze which features significantly influence player performance.



Define Objectives and Targets
Prepare Data for Modeling
Select Appropriate Machine Learning Algorithms
Train and Evaluate Models
Interpret Results and Feature Importance

### 4.1 Predicting Player Ratings (PIE)

Choose a Regression Algorithm: Start with algorithms like Linear Regression, Random Forest, or XGBoost.

Train the Model

Split the data into training and testing sets. Train the model on the training data.


In [51]:
# Update training and testing sets
X_train_final = X_train[final_selected_features]
X_test_final = X_test[final_selected_features]

# Initialize the model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train_final, y_train)

Make Predictions:

In [52]:
# Predict on the test set
y_pred = rf_model.predict(X_test_final)


Evaluate Performance:


Use appropriate metrics to evaluate model performance.
Adjust the model as necessary.


In [53]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"R² Score: {r2:.4f}")


MSE: 0.6226
MAE: 0.5333
R² Score: 0.9469


### Next Steps:

- **Feature Importance Analysis:** 
After training your models, analyze feature importances for each target variable to understand what influences each stat.

- **Hyperparameter Tuning:** 
Optimize the model using GridSearchCV or RandomizedSearchCV to find the best hyperparameters.

- **Model Selection:** 
Compare different algorithms (e.g., XGBoost, LightGBM) to see which performs best for each target stat.

- **Ensemble Methods:** Consider using ensemble methods to combine predictions from multiple models.


Model Evaluation
Cross-Validation: Use time-series cross-validation methods like TimeSeriesSplit.

Error Analysis: Analyze where the model performs well or poorly.

6.3. Feature Importance and Interpretation
Use SHAP values to interpret the model's predictions.



----------------------



## 4.2 Predicting Projected Stats


Repeat the same steps for each projected stat you want to predict (e.g., PTS_per_Game, REB_per_Game).

However, when we switched to predicting PTS_per_Game, we reused X_train and X_test without excluding PTS_per_Game from the features. This means that PTS_per_Game was still present in X_train and X_test, which would lead to data leakage.






In [54]:
# List of potential target variables (e.g., PTS_per_Game, REB_per_Game)
target_stats = ['PTS_per_Game', 'REB_per_Game', 'AST_per_Game']  # Add other stats as needed

for target_stat in target_stats:
    # Define features by excluding identifiers and the current target variable
    features = df_features_copy.drop(columns=  target_stats)
    
    # Define the target variable
    target = df_features_copy[target_stat]
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    
    # Train the model
    from sklearn.ensemble import RandomForestRegressor
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = rf_model.predict(X_test)
    
    # Evaluate the model
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"\nPredicting {target_stat}:")
    print(f"MSE: {mse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R² Score: {r2:.4f}")



Predicting PTS_per_Game:
MSE: 10.5123
MAE: 2.2418
R² Score: 0.8118

Predicting REB_per_Game:
MSE: 2.7070
MAE: 1.2281
R² Score: 0.6528

Predicting AST_per_Game:
MSE: 0.2402
MAE: 0.2642
R² Score: 0.9415


-------------------------



# Creating a Custom Feature Set for Predicting a Player's Points + Rebounds + Assists (PRA) for the Next Upcoming Game


Predicting a player's PRA (Points + Rebounds + Assists) for their next game involves leveraging historical data, player performance trends, and contextual information about upcoming matchups. We'll focus on creating an advanced feature set that considers all original variables and includes engineered features to enhance predictive power.


Data Preparation
Feature Engineering
Feature Selection
Model Development
Putting It All Together
Next Steps



### 5.1 Data Preparation: 

Before creating the feature set, ensure your data is properly prepared. 

- Load and Clean Data: Handle missing values, duplicates, and incorrect data types.
- Historical Game Data: Use player game logs that include individual stats and contextual information.
- Upcoming Game Data: Gather information about the next game, such as the opponent, venue, and any recent news affecting player availability.

### 5.2 Feature Engineering


We'll create advanced features by considering:

Player Performance Trends
Opponent Defense Strength
Home/Away Effects
Rest Days
Injuries and Player Availability
Advanced Metrics and Ratios



#### 5.2.1. Player Performance Trends


In [55]:
df_player_game_logs.columns

Index(['SEASON_ID', 'PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'TEAM_ABBREVIATION',
       'TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'FGM',
       'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT',
       'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
       'PLUS_MINUS', 'FANTASY_PTS', 'VIDEO_AVAILABLE'],
      dtype='object')

In [56]:
# Ensure data is sorted by player and game date
df_game_logs = df_pie_data.sort_values(by=['PLAYER_ID', 'GAME_DATE'])

# Create lag features for PTS, REB, AST
for lag in range(1, 4):  # Last 3 games
    df_game_logs[f'PTS_Lag{lag}'] = df_game_logs.groupby('PLAYER_ID')['PTS'].shift(lag)
    df_game_logs[f'REB_Lag{lag}'] = df_game_logs.groupby('PLAYER_ID')['REB'].shift(lag)
    df_game_logs[f'AST_Lag{lag}'] = df_game_logs.groupby('PLAYER_ID')['AST'].shift(lag)

# Rolling averages over the last 3 games
df_game_logs['PTS_RollingMean_3'] = df_game_logs.groupby('PLAYER_ID')['PTS'].rolling(window=3).mean().shift(1).reset_index(level=0, drop=True)
df_game_logs['REB_RollingMean_3'] = df_game_logs.groupby('PLAYER_ID')['REB'].rolling(window=3).mean().shift(1).reset_index(level=0, drop=True)
df_game_logs['AST_RollingMean_3'] = df_game_logs.groupby('PLAYER_ID')['AST'].rolling(window=3).mean().shift(1).reset_index(level=0, drop=True)


Form Features: Indicators of whether a player is on a hot streak.

In [57]:
# Difference between recent rolling mean and season average
df_game_logs['PTS_SeasonAvg'] = df_game_logs.groupby('PLAYER_ID')['PTS'].transform('mean')
df_game_logs['PTS_AboveAvg'] = df_game_logs['PTS_RollingMean_3'] - df_game_logs['PTS_SeasonAvg']


### 5.2.2. Opponent Defense Strength


> Calculate Team Defensive Stats

Compute the average points allowed, rebounds allowed, assists allowed, etc., by each team.

In [58]:
# Calculate defensive stats by aggregating opponent stats
team_defense_stats = df_pie_data.groupby('OPPONENT_TEAM_ID').agg({
    'PTS': 'mean',
    'REB': 'mean',
    'AST': 'mean',
    # Add other stats as needed
}).reset_index()

# Rename columns for clarity
team_defense_stats.rename(columns={
    'OPPONENT_TEAM_ID': 'TEAM_ID',
    'PTS': 'PTS_ALLOWED',
    'REB': 'REB_ALLOWED',
    'AST': 'AST_ALLOWED'
    # Add other stats as needed
}, inplace=True)

# Merge opponent defensive stats into player game logs
df_game_logs = df_game_logs.merge(
    team_defense_stats,
    left_on='OPPONENT_TEAM_ID',
    right_on='TEAM_ID',
    how='left'
)

In [59]:
# Key opponent defensive metrics
# For example, opponent's average points allowed per game, rebounds allowed, assists allowed
df_game_logs.rename(columns={
    'PTS_ALLOWED': 'OPP_PTS_ALLOWED',
    'REB_ALLOWED': 'OPP_REB_ALLOWED',
    'AST_ALLOWED': 'OPP_AST_ALLOWED'
}, inplace=True)

**2.3. Home/Away Effects**


Players may perform differently at home versus away games.

In [60]:
# Create a binary feature for home games
df_game_logs['HOME_GAME'] = df_game_logs['MATCHUP'].apply(lambda x: 1 if 'vs.' in x else 0)

**2.4. Rest Days**

The number of days since the player's last game can affect performance.

In [61]:
# Calculate days since last game
df_game_logs['GAME_DATE'] = pd.to_datetime(df_game_logs['GAME_DATE'])
df_game_logs['DAYS_SINCE_LAST_GAME'] = df_game_logs.groupby('PLAYER_ID')['GAME_DATE'].diff().dt.days

# Handle missing values for the first game
df_game_logs['DAYS_SINCE_LAST_GAME'] = df_game_logs['DAYS_SINCE_LAST_GAME'].fillna(df_game_logs['DAYS_SINCE_LAST_GAME'].mean())

- Assuming an 'INJURY_STATUS' column where 1 indicates injured, 0 otherwise
- Alternatively, use a 'GAMES_PLAYED' feature over a recent period


**2.6. Advanced Metrics and Ratios**

Include efficiency metrics and per-minute statistics.

In [62]:
# PER (Player Efficiency Rating), TS% (True Shooting Percentage), etc.
# For per-minute stats, divide totals by minutes played
# Replace zero minutes with a small value to avoid division by zero
df_game_logs['MIN'].replace(0, np.nan, inplace=True)

df_game_logs['PTS_per_MIN'] = df_game_logs['PTS'] / df_game_logs['MIN']
df_game_logs['REB_per_MIN'] = df_game_logs['REB'] / df_game_logs['MIN']
df_game_logs['AST_per_MIN'] = df_game_logs['AST'] / df_game_logs['MIN']
df_game_logs.fillna(0, inplace=True)  # Handle division by zero


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_game_logs['MIN'].replace(0, np.nan, inplace=True)


In [63]:
# TS% = Points / (2 * (Field Goal Attempts + 0.44 * Free Throw Attempts))
df_game_logs['TS%'] = df_game_logs['PTS'] / (2 * (df_game_logs['FGA'] + 0.44 * df_game_logs['FTA']))
df_game_logs['TS%'] = df_game_logs['TS%'].replace([np.inf, -np.inf], np.nan).fillna(0)

In [64]:
# eFG% = (FGM + 0.5 * FG3M) / FGA
df_game_logs['eFG%'] = (df_game_logs['FGM'] + 0.5 * df_game_logs['FG3M']) / df_game_logs['FGA']
df_game_logs['eFG%'] = df_game_logs['eFG%'].replace([np.inf, -np.inf], np.nan).fillna(0)


---

nhance Feature Engineering
Include Advanced Metrics:

Player Usage Rate (USG%): Reflects the player's involvement in offensive plays.
Player Efficiency Rating (PER): Measures a player's per-minute performance.
Opponent-Specific Defensive Metrics:
Defensive Matchup Data: Include information about the specific players who will defend your player.
Opponent's Defensive Rating Against Position: How well the opponent defends players of the same position.

In [65]:

# Define a function to compute cumulative defensive stats for each opponent
def compute_cumulative_opponent_stats(group):
    group = group.sort_values('GAME_DATE')
    group['OPP_PTS_ALLOWED_CUM'] = group['PTS_TEAM'].expanding().mean().shift(1)
    group['OPP_REB_ALLOWED_CUM'] = group['REB_TEAM'].expanding().mean().shift(1)
    group['OPP_AST_ALLOWED_CUM'] = group['AST_TEAM'].expanding().mean().shift(1)
    return group

# Apply the function to each opponent team
df = df_game_logs.groupby('OPPONENT_TEAM_ID').apply(compute_cumulative_opponent_stats).reset_index(drop=True)


# Replace NaN values with overall league averages or zeros
df['OPP_PTS_ALLOWED_CUM'].fillna(df['PTS_TEAM'].mean(), inplace=True)
df['OPP_REB_ALLOWED_CUM'].fillna(df['REB_TEAM'].mean(), inplace=True)
df['OPP_AST_ALLOWED_CUM'].fillna(df['AST_TEAM'].mean(), inplace=True)


  df = df_game_logs.groupby('OPPONENT_TEAM_ID').apply(compute_cumulative_opponent_stats).reset_index(drop=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['OPP_PTS_ALLOWED_CUM'].fillna(df['PTS_TEAM'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['OPP_REB_ALLOWED_CUM'].fillna(df['REB_TEAM'].me

**Player Usage Rate**

$$
\mathrm{USG}\% = \frac{(Player FGA + 0.44 * PLAYER FTA + PLAYER TOV) * (TEAM MINS PLAYED/5)}{PLAYER MIN PLAYED * (TEAM FGA + 0.44*TEAM FTA + TEAM TOV)}
$$

USG%=100× 
Player Minutes Played×(Team FGA+0.44×Team FTA+Team TOV)
(Player FGA+0.44×Player FTA+Player TOV)×(Team Minutes Played/5)
​


In [66]:
def parse_min_team(value):
    try:
        # Remove any extra characters and keep numeric part
        # Remove ':00' and convert to float
        if isinstance(value, str):
            value = value.replace(':00', '')
            minutes = float(value)
            return minutes
        else:
            return value
    except Exception as e:
        print(f"Error parsing MIN_TEAM value '{value}': {e}")
        return np.nan

df['MIN_TEAM'] = df['MIN_TEAM'].apply(parse_min_team)


In [67]:
# Calculate team possessions (denominator of the formula)
df['TEAM_POSS'] = df['FGA_TEAM'] + 0.44 * df['FTA_TEAM'] + df['TO']

# Calculate player possessions (numerator of the formula)
df['PLAYER_POSS'] = df['FGA'] + 0.44 * df['FTA'] + df['TOV']

# Now, calculate USG%
df['USG%'] = 100 * (df['PLAYER_POSS'] * (df['MIN_TEAM'] / 5)) / (df['MIN'] * df['TEAM_POSS'])

# Handle potential division by zero
df['USG%'] = df['USG%'].replace([np.inf, -np.inf], np.nan)
df['USG%'] = df['USG%'].fillna(0)



In [68]:

# Ensure 'USG%' is numeric
df['USG%'] = pd.to_numeric(df['USG%'], errors='coerce')

# Calculate rolling average of USG% over the last 5 games
df['USG%_Rolling3'] = df.groupby('PLAYER_ID')['USG%'].rolling(window=3).mean().shift(1).reset_index(level=0, drop=True)


---

In [69]:
# Ensure all required columns are numeric
numeric_cols = ['PTS', 'REB', 'AST', 'STL', 'BLK', 'FGM', 'FGA', 'FTM', 'FTA', 'TOV', 'MIN']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Handle missing values by filling with zeros
df[numeric_cols] = df[numeric_cols].fillna(0)

# Calculate missed field goals and free throws
df['Missed_FG'] = df['FGA'] - df['FGM']
df['Missed_FT'] = df['FTA'] - df['FTM']

# Calculate unadjusted PER (uPER)
df['uPER'] = (
    df['PTS'] +
    df['REB'] +
    df['AST'] +
    df['STL'] +
    df['BLK'] -
    df['Missed_FG'] -
    df['Missed_FT'] -
    df['TOV']
)

# Avoid division by zero
df['MIN'] = df['MIN'].replace(0, np.nan)

# Calculate PER per minute
df['PER'] = df['uPER'] / df['MIN']

# Handle infinite and NaN values
df['PER'] = df['PER'].replace([np.inf, -np.inf], np.nan)
df['PER'] = df['PER'].fillna(0)

# Calculate rolling average of USG% over the last 5 games
df['PER%_Rolling3'] = df.groupby('PLAYER_ID')['PER'].rolling(window=3).mean().shift(1).reset_index(level=0, drop=True)


---

In [70]:
df.columns.to_list()


['SEASON_ID',
 'PLAYER_ID',
 'PLAYER_NAME',
 'TEAM_ID_x',
 'TEAM_ABBREVIATION',
 'TEAM_NAME',
 'GAME_ID',
 'GAME_DATE',
 'MATCHUP',
 'WL',
 'MIN',
 'FGM',
 'FGA',
 'FG_PCT',
 'FG3M',
 'FG3A',
 'FG3_PCT',
 'FTM',
 'FTA',
 'FT_PCT',
 'OREB',
 'DREB',
 'REB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS',
 'PLUS_MINUS',
 'FANTASY_PTS',
 'TEAM_CITY',
 'MIN_TEAM',
 'FGM_TEAM',
 'FGA_TEAM',
 'FG_PCT_TEAM',
 'FG3M_TEAM',
 'FG3A_TEAM',
 'FG3_PCT_TEAM',
 'FTM_TEAM',
 'FTA_TEAM',
 'FT_PCT_TEAM',
 'OREB_TEAM',
 'DREB_TEAM',
 'REB_TEAM',
 'AST_TEAM',
 'STL_TEAM',
 'BLK_TEAM',
 'TO',
 'PF_TEAM',
 'PTS_TEAM',
 'PLUS_MINUS_TEAM',
 'OPPONENT_TEAM_ID',
 'TEAM_NAME_OPPONENT',
 'TEAM_ABBREVIATION_OPPONENT',
 'TEAM_CITY_OPPONENT',
 'MIN_OPPONENT',
 'FGM_OPPONENT',
 'FGA_OPPONENT',
 'FG_PCT_OPPONENT',
 'FG3M_OPPONENT',
 'FG3A_OPPONENT',
 'FG3_PCT_OPPONENT',
 'FTM_OPPONENT',
 'FTA_OPPONENT',
 'FT_PCT_OPPONENT',
 'OREB_OPPONENT',
 'DREB_OPPONENT',
 'REB_OPPONENT',
 'AST_OPPONENT',
 'STL_OPPONENT',
 'BLK_OPPO

In [71]:
df['POSS_OPPONENT'] = df['FGA_OPPONENT'] + 0.44 * df['FTA_OPPONENT'] - df['OREB_OPPONENT'] + df['TO_OPPONENT']
df['DEF_RTG'] = 100 * (df['PTS_OPPONENT'] / df['POSS_OPPONENT'])


df['DEF_RTG'] = df['DEF_RTG'].replace([np.inf, -np.inf], np.nan)
df['DEF_RTG'] = df['DEF_RTG'].fillna(df['DEF_RTG'].mean())


In [72]:
df = df.sort_values(['TEAM_ID_x', 'GAME_DATE'])

df['DEF_RTG_CUM'] = df.groupby('TEAM_ID_x')['DEF_RTG'].expanding().mean().reset_index(level=0, drop=True).shift(1)
df['DEF_RTG_ROLLING3'] = df.groupby('TEAM_ID_x')['DEF_RTG'].rolling(window=3).mean().reset_index(level=0, drop=True).shift(1)


In [73]:
df['OPP_PPG'] = df.groupby('TEAM_ID_x')['PTS_OPPONENT'].expanding().mean().reset_index(level=0, drop=True).shift(1)
df['OPP_PPG_ROLLING3'] = df.groupby('TEAM_ID_x')['PTS_OPPONENT'].rolling(window=3).mean().reset_index(level=0, drop=True).shift(1)


In [74]:
# Create interaction term
df['PTS_RollingMean_3_OPP_DEF_RTG'] = df['PTS_RollingMean_3'] / df['DEF_RTG_CUM']


In [75]:
# Ensure your DataFrame is sorted by 'PLAYER_ID' and 'GAME_DATE'
df = df.sort_values(['PLAYER_ID', 'GAME_DATE'])

# Create the 'PTS_NextGame' column
df['PTS_NextGame'] = df.groupby('PLAYER_ID')['PTS'].shift(-1)

# Drop rows where 'PTS_NextGame' is NaN
df = df.dropna(subset=['PTS_NextGame'])



In [76]:
features = [
    # Existing features
    'PTS_Lag1', 'PTS_Lag2', 'PTS_Lag3',
    'PTS_RollingMean_3', 
    'MIN',
    'FG_PCT', 
    'FGA', 
    'PLUS_MINUS',
    # Newly added opponent defensive stats
    'OPP_PTS_ALLOWED_CUM',
    'OPP_REB_ALLOWED_CUM',
    #'OPP_AST_ALLOWED_CUM',
    # Additional features
    'HOME_GAME',
    #'DAYS_SINCE_LAST_GAME',
    #'PTS_per_MIN', 
    'TS%',
    'eFG%',
    # Interaction term
    'PTS_RollingMean_3_OPP_PTS_ALLOWED',
    'USG%_Rolling3', 'USG%',
    'PER', 'PER%_Rolling3',
    'OPP_PPG', 'DEF_RTG_CUM', 'PTS_RollingMean_3_OPP_DEF_RTG',
    'PIE',
    #'Adjusted_PTS_Defender'
]

# Create the interaction term
df['PTS_RollingMean_3_OPP_PTS_ALLOWED'] = df['PTS_RollingMean_3'] / df['OPP_PTS_ALLOWED_CUM']


4.1. Define Feature Matrix and Target Variable

In [77]:
# Define the feature matrix
X = df[features]
# Define the target variable
y = df['PTS_NextGame']  # Ensure this is correctly shifted


In [78]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X[features] = scaler.fit_transform(X[features])

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
X[features] = imputer.fit_transform(X[features])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[features] = scaler.fit_transform(X[features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[features] = imputer.fit_transform(X[features])


In [79]:
# Split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [80]:
# Train your model
import xgboost as xgb

xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=500,
    learning_rate=0.01,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    #reg_alpha=0.1,
    #reg_lambda=1.0
)

xgb_model.fit(X_train, y_train)



In [81]:

# Evaluate your model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

y_pred = xgb_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Model Performance on Test Set:")
print(f"MSE: {mse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R² Score: {r2:.4f}")


Model Performance on Test Set:
MSE: 28.89
MAE: 4.12
R² Score: 0.3787


---

-------------


### **Further Enhancing Your Predictive Model**


- **Shot Location Data:**
  - **Shot Zones:** Include data on where the player takes their shots (e.g., paint, mid-range, three-point line).
  - **Shot Distance:** Use the average shot distance to capture shooting tendencies.

- **Clutch Performance:**

##### **1.2. Include Contextual Game Factors**

- **Game Pace:**
  - **Team Pace:** Average number of possessions per game for the player's team.
  - **Opponent Pace:** Average number of possessions per game for the opponent team.
  - **Adjusted Pace:** Combine both to estimate the expected number of possessions.

- **Vegas Betting Lines:**
  - **Over/Under Totals:** Expected total points in the game.
  - **Point Spread:** Indicates the expected competitiveness of the game.


- **Player's Health Status:**
  - **Injury Reports:** Whether the player is probable, questionable, or doubtful.
  - **Minutes Restrictions:** Any known limitations on playing time.

- **Team Injuries:**
  - **Key Teammate Absences:** The impact of missing teammates on the player's usage and performance.

- **Defensive Schemes:**
  - **Opponent's Defensive Style:** Zone vs. man-to-man defense.
  - **Double-Team Frequency:** How often the opponent double-teams high-scoring players.

- **Matchup Data:**
  - **Historical Performance Against Opponent:** The player's average stats against the upcoming opponent.
  - **Defensive Matchups:** If possible, include data on expected primary defenders.



- **Neural Networks:**
  - **Deep Learning Models:** Implement feed-forward neural networks for capturing complex patterns.
  - **Recurrent Neural Networks (RNNs):** Useful for sequential data; consider Long Short-Term Memory (LSTM) networks.

##### **2.3. Regularization Techniques**

- **Adjust Regularization Parameters:**
  - **L1 Regularization (Lasso):** Can help with feature selection.
  - **L2 Regularization (Ridge):** Helps reduce overfitting by penalizing large coefficients.


#### **3. Feature Selection and Engineering**

##### **3.1. Feature Importance Analysis**

- **Permutation Importance:**
  - Assess the decrease in model performance when a feature's values are shuffled.

- **SHAP Values:**
  - Use SHAP to understand the contribution of each feature to individual predictions.

##### **3.2. Reduce Multicollinearity**

- **Correlation Matrix:**
  - Remove or combine highly correlated features.

- **Principal Component Analysis (PCA):**
  - Reduce dimensionality while retaining most of the variance.

##### **3.3. Create Polynomial and Interaction Features**

- **Polynomial Features:**
  - Include squared or cubic terms to capture non-linear relationships.

- **Interaction Terms:**
  - Multiply features to capture the combined effect.



#### **7. Ensemble Models**

- **Stacking:**
  - Combine multiple models where the predictions of several base models are used as inputs for a higher-level model.

- **Blending:**
  - Similar to stacking but uses a holdout set to blend predictions.

##### **8.2. Automate Data Pipelines**

- **Data Collection Automation:**
  - Ensure new data is seamlessly integrated into the model training process.

- **Real-Time Predictions:**
  - Implement APIs or services to generate predictions in real-time if needed.




---

-------------

In [82]:
# Prepare upcoming game data
df_upcoming_games['GAME_DATE'] = pd.to_datetime(df_upcoming_games['GAME_DATE'])
df_upcoming_games = df_upcoming_games.sort_values(by='GAME_DATE')

# Merge with player statistics and opponent defensive stats
# Use the latest available data for each player

# Example:
# For each player in df_upcoming_games, retrieve their latest stats
latest_stats = df_game_logs.groupby('PLAYER_ID').last().reset_index()

# Merge latest stats with upcoming games
df_pred_data = df_upcoming_games.merge(latest_stats, on='PLAYER_ID', how='left')

# Prepare features
X_pred = df_pred_data[features]

# Preprocess features
X_pred = preprocessor.transform(X_pred)

# Make predictions
df_pred_data['PRA_Prediction'] = xgb_model.predict(X_pred)

# Display predictions
print(df_pred_data[['PLAYER_NAME', 'OPPONENT_TEAM_ID', 'GAME_DATE', 'PRA_Prediction']])


NameError: name 'df_upcoming_games' is not defined

In [83]:
# Rank players
df_pred_data = df_pred_data.sort_values(by='PRA_Prediction', ascending=False)

# Display top 10 players
top_10_players = df_pred_data[['PLAYER_NAME', 'PRA_Prediction']].head(10)
print("Top 10 Players Based on Predicted PRA for Next Game:")
print(top_10_players)


NameError: name 'df_pred_data' is not defined

----