# First Model: Predicting Player Points

### Initial approach using tuned Random Forest Regressor with L5 games

In [12]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error

In [13]:
df = pd.read_csv('../../data/raw/nba_player_gamelogs_2024_25.csv')
df.head() 

Unnamed: 0,SEASON_ID,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,...,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,FANTASY_PTS,VIDEO_AVAILABLE
0,22024,201143,Al Horford,1610612738,BOS,Boston Celtics,22400061,2024-10-22,BOS vs. NYK,W,...,3,5,1,1,0,2,11,19,28.1,1
1,22024,201950,Jrue Holiday,1610612738,BOS,Boston Celtics,22400061,2024-10-22,BOS vs. NYK,W,...,4,4,1,0,0,2,18,23,31.8,1
2,22024,2544,LeBron James,1610612747,LAL,Los Angeles Lakers,22400062,2024-10-22,LAL vs. MIN,W,...,5,4,0,2,2,3,16,-6,32.0,1
3,22024,1630559,Austin Reaves,1610612747,LAL,Los Angeles Lakers,22400062,2024-10-22,LAL vs. MIN,W,...,9,4,1,1,0,4,12,12,34.8,1
4,22024,201144,Mike Conley,1610612750,MIN,Minnesota Timberwolves,22400062,2024-10-22,MIN @ LAL,L,...,4,2,1,0,3,1,5,-22,12.8,1


In [15]:
df.columns

Index(['SEASON_ID', 'PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'TEAM_ABBREVIATION',
       'TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'FGM',
       'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT',
       'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
       'PLUS_MINUS', 'FANTASY_PTS', 'VIDEO_AVAILABLE'],
      dtype='object')

In [16]:
df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE'])
df = df.sort_values(['PLAYER_ID', 'GAME_DATE'])

In [17]:
#Get Rolling 5-game averages
stat_cols = ['PTS', 'MIN', 'FGA', 'FG3A', 'FTA', 'AST', 'REB']
for col in stat_cols:
    # shift(1) ensures we don't include the 'current' game in our features
    df[f'{col}_roll5'] = df.groupby('PLAYER_ID')[col].transform(lambda x: x.shift(1).rolling(5).mean())

# Drop players without enough history
df_clean = df.dropna(subset=[f'{col}_roll5' for col in stat_cols])

In [18]:
# Train/Test Split (Test on March 1st, 2025)
test_date = pd.to_datetime('2025-03-01')
test_players = df_clean[df_clean['GAME_DATE'] == test_date]['PLAYER_ID'].unique()

train_df = df_clean[(df_clean['GAME_DATE'] < test_date) & (df_clean['PLAYER_ID'].isin(test_players))]
test_df = df_clean[df_clean['GAME_DATE'] == test_date]

features = [f'{col}_roll5' for col in stat_cols]
X_train, y_train = train_df[features], train_df['PTS']
X_test, y_test = test_df[features], test_df['PTS']

In [19]:
# Model Tuning
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5]
}
rf = RandomForestRegressor(random_state=42)
rf_random = RandomizedSearchCV(rf, param_grid, n_iter=10, cv=3, random_state=42)
rf_random.fit(X_train, y_train)

# 5. Predict and Evaluate
predictions = rf_random.best_estimator_.predict(X_test)
print(f"MAE: {mean_absolute_error(y_test, predictions)}")

MAE: 5.010283563739507
