In [24]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Loading the data

In [25]:
import pandas as pd
import time
from nba_api.stats.static import players
from nba_api.stats.endpoints import playergamelog
from requests.exceptions import ReadTimeout
from tqdm import tqdm
import seaborn as sns
import plotly.express as px



file_path = '/content/drive/MyDrive/nba_all_players_game_logs_2023-24.csv'
df = pd.read_csv(file_path)

print(df.head())


   SEASON_ID  Player_ID   Game_ID     GAME_DATE      MATCHUP WL  MIN  FGM  \
0      22023    1630173  22301190  Apr 14, 2024  NYK vs. CHI  W   19    2   
1      22023    1630173  22301175  Apr 12, 2024  NYK vs. BKN  W    8    2   
2      22023    1630173  22301167  Apr 11, 2024    NYK @ BOS  W   16    1   
3      22023    1630173  22301139  Apr 07, 2024    NYK @ MIL  W    5    0   
4      22023    1630173  22301119  Apr 05, 2024    NYK @ CHI  L   19    0   

   FGA  FG_PCT  ...  REB  AST  STL  BLK  TOV  PF  PTS  PLUS_MINUS  \
0    3   0.667  ...    5    2    0    1    2   3    4          -2   
1    2   1.000  ...    3    0    0    0    0   0    5           3   
2    6   0.167  ...    5    0    0    1    1   0    2          -9   
3    1   0.000  ...    0    0    0    0    0   1    0           5   
4    2   0.000  ...    4    1    1    0    1   4    0          -2   

   VIDEO_AVAILABLE       PLAYER_NAME  
0                1  Precious Achiuwa  
1                1  Precious Achiuwa  
2    

#  Basic info

In [26]:

print("Dataset shape:", df.shape)
print("\nColumn names:\n", df.columns)
print("\nData types:\n", df.dtypes)

Dataset shape: (23770, 28)

Column names:
 Index(['SEASON_ID', 'Player_ID', 'Game_ID', 'GAME_DATE', 'MATCHUP', 'WL',
       'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA',
       'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
       'PTS', 'PLUS_MINUS', 'VIDEO_AVAILABLE', 'PLAYER_NAME'],
      dtype='object')

Data types:
 SEASON_ID            int64
Player_ID            int64
Game_ID              int64
GAME_DATE           object
MATCHUP             object
WL                  object
MIN                  int64
FGM                  int64
FGA                  int64
FG_PCT             float64
FG3M                 int64
FG3A                 int64
FG3_PCT            float64
FTM                  int64
FTA                  int64
FT_PCT             float64
OREB                 int64
DREB                 int64
REB                  int64
AST                  int64
STL                  int64
BLK                  int64
TOV                  int64
PF        

In [27]:
# Quick look at the first 5 rows
print("\nFirst 5 rows:\n", df.head())


First 5 rows:
    SEASON_ID  Player_ID   Game_ID     GAME_DATE      MATCHUP WL  MIN  FGM  \
0      22023    1630173  22301190  Apr 14, 2024  NYK vs. CHI  W   19    2   
1      22023    1630173  22301175  Apr 12, 2024  NYK vs. BKN  W    8    2   
2      22023    1630173  22301167  Apr 11, 2024    NYK @ BOS  W   16    1   
3      22023    1630173  22301139  Apr 07, 2024    NYK @ MIL  W    5    0   
4      22023    1630173  22301119  Apr 05, 2024    NYK @ CHI  L   19    0   

   FGA  FG_PCT  ...  REB  AST  STL  BLK  TOV  PF  PTS  PLUS_MINUS  \
0    3   0.667  ...    5    2    0    1    2   3    4          -2   
1    2   1.000  ...    3    0    0    0    0   0    5           3   
2    6   0.167  ...    5    0    0    1    1   0    2          -9   
3    1   0.000  ...    0    0    0    0    0   1    0           5   
4    2   0.000  ...    4    1    1    0    1   4    0          -2   

   VIDEO_AVAILABLE       PLAYER_NAME  
0                1  Precious Achiuwa  
1                1  Precious

In [28]:
# Check for missing values
print("\nMissing values per column:\n", df.isnull().sum())


Missing values per column:
 SEASON_ID          0
Player_ID          0
Game_ID            0
GAME_DATE          0
MATCHUP            0
WL                 0
MIN                0
FGM                0
FGA                0
FG_PCT             0
FG3M               0
FG3A               0
FG3_PCT            0
FTM                0
FTA                0
FT_PCT             0
OREB               0
DREB               0
REB                0
AST                0
STL                0
BLK                0
TOV                0
PF                 0
PTS                0
PLUS_MINUS         0
VIDEO_AVAILABLE    0
PLAYER_NAME        0
dtype: int64


# Number of unique players

In [43]:
print("\nUnique players in dataset:", df['PLAYER_NAME'].nunique())


Unique players in dataset: 448


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23770 entries, 0 to 23769
Data columns (total 28 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   SEASON_ID        23770 non-null  int64  
 1   Player_ID        23770 non-null  int64  
 2   Game_ID          23770 non-null  int64  
 3   GAME_DATE        23770 non-null  object 
 4   MATCHUP          23770 non-null  object 
 5   WL               23770 non-null  object 
 6   MIN              23770 non-null  int64  
 7   FGM              23770 non-null  int64  
 8   FGA              23770 non-null  int64  
 9   FG_PCT           23770 non-null  float64
 10  FG3M             23770 non-null  int64  
 11  FG3A             23770 non-null  int64  
 12  FG3_PCT          23770 non-null  float64
 13  FTM              23770 non-null  int64  
 14  FTA              23770 non-null  int64  
 15  FT_PCT           23770 non-null  float64
 16  OREB             23770 non-null  int64  
 17  DREB        

In [31]:
# Convert GAME_DATE to datetime
df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE'])

# Extract Home/Away and Opponent
df['HOME'] = df['MATCHUP'].apply(lambda x: 'Home' if '@' not in x else 'Away')
df['OPPONENT'] = df['MATCHUP'].apply(lambda x: x.split()[-1])

#  General Stats

In [44]:
print("Total players:", df['PLAYER_NAME'].nunique())
print("Total games:", df['Game_ID'].nunique())
print("Average points per game:", df['PTS'].mean())
print("Average rebounds per game:", df['REB'].mean())
print("Average assists per game:", df['AST'].mean())

Total players: 448
Total games: 1230
Average points per game: 11.341144299537232
Average rebounds per game: 4.275010517458982
Average assists per game: 2.6312578880942366


# Top Players

In [33]:

top_pts = df.groupby('PLAYER_NAME')['PTS'].mean().sort_values(ascending=False).head(10)
print("\nTop 10 players by PPG:")
print(top_pts)

top_games = df['PLAYER_NAME'].value_counts().head(10)
print("\nTop 10 players by games played:")
print(top_games)


Top 10 players by PPG:
PLAYER_NAME
Joel Embiid                34.692308
Luka Dončić                33.857143
Giannis Antetokounmpo      30.438356
Shai Gilgeous-Alexander    30.053333
Jalen Brunson              28.727273
Kevin Durant               27.093333
Devin Booker               27.073529
Jayson Tatum               26.851351
Donovan Mitchell           26.600000
De'Aaron Fox               26.567568
Name: PTS, dtype: float64

Top 10 players by games played:
PLAYER_NAME
Buddy Hield          84
Harrison Barnes      82
Mikal Bridges        82
Cason Wallace        82
Obi Toppin           82
Jonas Valančiūnas    82
Domantas Sabonis     82
Georges Niang        82
Austin Reaves        82
Bobby Portis         82
Name: count, dtype: int64


# Exploratory Data Analysis

In [34]:
# --- Create Histogram with Plotly ---
fig = px.histogram(
    df,
    x='PTS',
    nbins=30,
    title='Distribution of Points per Game',
    labels={'PTS': 'Points', 'count': 'Frequency'}, # Use labels to rename axes
    marginal='rug', # Adds a rug plot to show individual data points
    opacity=0.75
)

# Update layout for a cleaner look
fig.update_layout(
    yaxis_title='Frequency',
    bargap=0.1
)

fig.show()

### The Distribution of Points is Right-Skewed

This histogram reveals a strong **right-skewed distribution** for points scored per game.

* **High Frequency of Low Scores:** The tallest bars are on the left, indicating that lower point totals (roughly 0-15 points) are the most common outcomes for a player in a game. This reflects the reality that many players have limited scoring roles.
* **Long Tail of High Scores:** The long "tail" extending to the right shows that very high-scoring games (40+) are rare but do occur. These performances are the "outliers" typically produced by elite scorers.

**Implication:** This pattern tells us that while the model must be accurate in predicting common lower-end scores, the real challenge will be forecasting those rare, high-scoring breakout games.

In [35]:
# --- Create Histogram with Plotly ---
fig = px.histogram(
    df,
    x='REB',
    nbins=30,
    title='Distribution of Rebounds per Game',
    labels={'REB': 'Rebounds'},
    marginal='rug',
    color_discrete_sequence=['green'], # Sets the bar color to green
    opacity=0.75
)

# Update layout for a cleaner look
fig.update_layout(
    yaxis_title='Frequency',
    bargap=0.1
)

fig.show()

In [36]:
# --- Create Histogram with Plotly ---
fig = px.histogram(
    df,
    x='AST',
    nbins=30,
    title='Distribution of Assists per Game',
    labels={'AST': 'Assists'},
    marginal='rug',
    color_discrete_sequence=['blue'], # Sets the bar color to blue
    opacity=0.75
)

# Update layout for a cleaner look
fig.update_layout(
    yaxis_title='Frequency',
    bargap=0.1
)

fig.show()

In [40]:
# --- Data Preparation ---
# Convert GAME_DATE to datetime
df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE'])

# Find the top player by average points, just as in your original code
top_player = df.groupby('PLAYER_NAME')['PTS'].mean().idxmax()

# Filter the DataFrame for only the top player and sort by date
player_df = df[df['PLAYER_NAME'] == top_player].sort_values('GAME_DATE')


# --- Create Line Plot with Plotly ---
fig = px.line(
    player_df,
    x='GAME_DATE',
    y='PTS',
    title=f'{top_player} Points Over the Season',
    labels={'GAME_DATE': 'Game Date', 'PTS': 'Points'},
    markers=True # Adds the circular markers to each data point
)

# Update layout for better readability of x-axis labels
fig.update_layout(xaxis_tickangle=-45)

fig.show()

In [41]:
# --- Data Preparation ---
# Define the numeric columns for the correlation matrix
numeric_cols = ['MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA',
                'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS']

# Calculate the correlation matrix
corr_matrix = df[numeric_cols].corr()

# --- Create Heatmap with Plotly ---
fig = px.imshow(
    corr_matrix,
    text_auto=True,  # Automatically writes the correlation values on the heatmap
    aspect="auto",   # Adjusts aspect ratio to fit the figure
    color_continuous_scale='RdBu_r', # A color scale similar to 'coolwarm'
    title='Correlation Heatmap of Stats'
)

# Update layout for a larger size
fig.update_layout(
    width=1000,
    height=800
)

fig.show()

### Finding: Opportunity Drives Scoring

This correlation heatmap provides the clearest insight from our EDA: a player's scoring is heavily driven by their opportunity on the court.

* **Strongest Positive Correlations:** The stats with the highest positive correlation (values closest to +1.0) to **Points (PTS)** are all related to a player's offensive role and aggression:
    * **FGM (Field Goals Made):** A correlation of **0.97**, as made shots are the direct source of points.
    * **FGA (Field Goals Attempted):** A correlation of **0.89**, proving that shot volume is a powerful predictor of scoring output.
    * **MIN (Minutes Played):** A correlation of **0.74**, indicating that more time on the court leads to more scoring opportunities.

**Implication:** This was a crucial finding. It confirmed that our predictive model must prioritize features related to a player's recent playing time and shot attempts to be accurate. This insight directly guids our feature engineering strategy.

## Feature Engineering - Turning Insights into Features

Our Exploratory Data Analysis (EDA) provided two critical insights that directly guided our feature engineering strategy:

1.  **Performance is Volatile:** The time-series plot showed that a player's performance fluctuates significantly, meaning their **recent form** is a key predictive factor.
2.  **Opportunity Drives Scoring:** The correlation heatmap revealed a strong link between points and 'opportunity' stats like **Minutes Played (MIN)** and **Field Goal Attempts (FGA)**.

Therefore, our strategy was to create new features that explicitly quantify a player's **recent opportunity** and add important **game context**.

### The Engineered Features
To execute this, we created two types of features:

* **Rolling Averages:** We engineer **5-game rolling averages** for key opportunity stats (e.g., `FGA_roll_5`, `MIN_roll_5`). These features capture a player's recent role and offensive involvement, directly addressing the findings from our EDA.
* **Game Context Features:** We added a **`Days_Rest`** feature to account for player fatigue and a **`HOME`** feature for game location, adding important situational context to each game record.

By creating these features, we can transform the raw dataset into an enhanced one, where each row now contains a rich profile of the player's performance leading up to that game, preparing it for the modeling phase.

In [38]:
# --- Data Preparation ---
# Ensure GAME_DATE is a datetime object
df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE'])

# Sort data chronologically for each player. This is crucial for rolling averages.
df = df.sort_values(['PLAYER_NAME', 'GAME_DATE'])


# --- Feature Engineering: Rolling Averages ---
# Define the columns you want to create rolling averages for
rolling_cols = ['PTS', 'MIN', 'FGA', 'FTA', 'AST', 'REB', 'FG3A', 'TOV']
new_cols = [f'{col}_roll_5' for col in rolling_cols]

# Group by player and calculate the rolling average over the last 5 games
# .shift(1) is used to ensure we only use data from PAST games to prevent data leakage.
df[new_cols] = df.groupby('PLAYER_NAME')[rolling_cols].rolling(window=5, min_periods=1).mean().shift(1).reset_index(level=0, drop=True)


# --- Feature Engineering: Game Context ---
# Create a binary 'HOME' feature (1 for home, 0 for away)
df['HOME'] = df['MATCHUP'].apply(lambda x: 1 if 'vs.' in x else 0)

# Calculate days of rest since the last game
df['Days_Rest'] = df.groupby('PLAYER_NAME')['GAME_DATE'].diff().dt.days

# Fill NaN values for Days_Rest (occurs for the first game of the season) with a neutral value like 7
df['Days_Rest'].fillna(7, inplace=True)


# --- Display Results ---
# Select a few key columns to see the new features
display_cols = ['PLAYER_NAME', 'GAME_DATE', 'PTS', 'PTS_roll_5', 'MIN_roll_5', 'FGA_roll_5', 'Days_Rest']
print("DataFrame with new rolling average and context features:")
df[display_cols].head(10)


DataFrame with new rolling average and context features:



A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





Unnamed: 0,PLAYER_NAME,GAME_DATE,PTS,PTS_roll_5,MIN_roll_5,FGA_roll_5,Days_Rest
12291,A.J. Lawson,2023-12-01,1,,,,7.0
12290,A.J. Lawson,2023-12-02,12,1.0,4.0,1.0,1.0
12289,A.J. Lawson,2023-12-06,4,6.5,11.5,5.5,4.0
12288,A.J. Lawson,2023-12-08,2,5.666667,10.0,4.333333,2.0
12287,A.J. Lawson,2023-12-11,4,4.75,9.25,4.0,3.0
12286,A.J. Lawson,2023-12-12,0,4.6,10.2,4.6,1.0
12285,A.J. Lawson,2023-12-14,2,4.4,9.4,4.4,2.0
12284,A.J. Lawson,2023-12-16,0,2.4,6.0,2.6,2.0
12283,A.J. Lawson,2023-12-18,6,1.6,4.8,2.2,2.0
12282,A.J. Lawson,2023-12-20,0,2.4,6.4,2.4,2.0
