## Imports and DB connection

In [13]:
import os
from dotenv import load_dotenv
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
import joblib
from sqlalchemy import create_engine
import psycopg2

In [14]:
# load env variables
load_dotenv()
DATABASE_URL = os.getenv("ML_DATABASE_URL")

# connect to postgres db with sqlalchemy engine
engine = create_engine(DATABASE_URL)

# get player stats
query = """
SELECT pg.player_id, pg.game_id, pg.game_date, pg.matchup, p.team_abbreviation,
       pg.minutes, pg.points, pg.assists, pg.rebounds, pg.steals, pg.blocks, pg.turnovers
FROM player_game_stats pg
JOIN players p ON pg.player_id = p.id
"""
df_raw = pd.read_sql(query, engine)
df_raw.head(2)


Unnamed: 0,player_id,game_id,game_date,matchup,team_abbreviation,minutes,points,assists,rebounds,steals,blocks,turnovers
0,1641713,22500103,2025-10-25,MEM vs. IND,MEM,2.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1641713,22500094,2025-10-24,MEM vs. MIA,MEM,16.0,5.0,0.0,2.0,1.0,0.0,0.0


## Feature engineering
1. Adding rolling average of basic stats 

In [15]:
# copy for feature engineering
df_features = df_raw.copy()

# adding indicator for home/away based on @/vs symbols
df_features["is_home"] = df_features["matchup"].apply(
    lambda x: 1 if "@" not in x else 0
)

# sort by player and game date
df_features = df_features.sort_values(["player_id", "game_date"])

# calcultae rolling averages for the base stats in last 5 games
df_features["avg_points_last5"] = df_features.groupby("player_id")["points"].transform(
    lambda x: x.rolling(5, min_periods=1).mean().shift(1)
)
df_features["avg_assists_last5"] = df_features.groupby("player_id")[
    "assists"
].transform(lambda x: x.rolling(5, min_periods=1).mean().shift(1))
df_features["avg_rebounds_last5"] = df_features.groupby("player_id")[
    "rebounds"
].transform(lambda x: x.rolling(5, min_periods=1).mean().shift(1))


In [16]:
df_features.head(5)

Unnamed: 0,player_id,game_id,game_date,matchup,team_abbreviation,minutes,points,assists,rebounds,steals,blocks,turnovers,is_home,avg_points_last5,avg_assists_last5,avg_rebounds_last5
152,2544,22500059,2025-11-25,LAL vs. LAC,LAL,32.0,25.0,6.0,6.0,1.0,1.0,3.0,1,,,
151,2544,22500078,2025-11-28,LAL vs. DAL,LAL,34.0,13.0,7.0,5.0,1.0,0.0,2.0,1,25.0,6.0,6.0
168,2544,22500362,2025-12-07,LAL @ PHI,LAL,34.0,29.0,6.0,7.0,1.0,1.0,2.0,0,19.0,6.5,5.5
167,2544,22501204,2025-12-10,LAL vs. SAS,LAL,36.0,19.0,8.0,15.0,0.0,3.0,3.0,1,22.333333,6.333333,6.0
166,2544,22501228,2025-12-14,LAL @ PHX,LAL,36.0,26.0,4.0,3.0,2.0,2.0,8.0,0,21.5,6.75,8.25


2. Rolling average of teamamte influence (taking only teammate assists for now).

In [17]:
# average assists of other teammates in last 5 games
teammate_avg_assists = []
for idx, row in df_features.iterrows():
    team = row["team_abbreviation"]
    game_date = row["game_date"]
    player_id = row["player_id"]
    # get teammates last 5 games excluding current player
    teammates = df_features[
        (df_features["team_abbreviation"] == team)
        & (df_features["player_id"] != player_id)
        & (df_features["game_date"] < game_date)
    ]
    last5 = teammates.groupby("player_id").tail(5)
    avg_assist = last5["assists"].mean() if not last5.empty else 0
    teammate_avg_assists.append(avg_assist)
df_features["teammate_avg_assists_last5"] = teammate_avg_assists

df_features.head(6)

Unnamed: 0,player_id,game_id,game_date,matchup,team_abbreviation,minutes,points,assists,rebounds,steals,blocks,turnovers,is_home,avg_points_last5,avg_assists_last5,avg_rebounds_last5,teammate_avg_assists_last5
152,2544,22500059,2025-11-25,LAL vs. LAC,LAL,32.0,25.0,6.0,6.0,1.0,1.0,3.0,1,,,,1.761905
151,2544,22500078,2025-11-28,LAL vs. DAL,LAL,34.0,13.0,7.0,5.0,1.0,0.0,2.0,1,25.0,6.0,6.0,1.880952
168,2544,22500362,2025-12-07,LAL @ PHI,LAL,34.0,29.0,6.0,7.0,1.0,1.0,2.0,0,19.0,6.5,5.5,1.904762
167,2544,22501204,2025-12-10,LAL vs. SAS,LAL,36.0,19.0,8.0,15.0,0.0,3.0,3.0,1,22.333333,6.333333,6.0,1.904762
166,2544,22501228,2025-12-14,LAL @ PHX,LAL,36.0,26.0,4.0,3.0,2.0,2.0,8.0,0,21.5,6.75,8.25,1.857143
165,2544,22500379,2025-12-18,LAL @ UTA,LAL,33.0,28.0,10.0,7.0,1.0,0.0,2.0,0,22.4,6.2,7.2,1.857143


3. Opponent strength metrics. 

* Taking points allowed, blocks, steals, turnovers data for now 

In [18]:
# opponent strength (points allowed, blocks, steals, turnovers)
opponent_avg_points_allowed = []
opponent_avg_blocks = []
opponent_avg_steals = []
opponent_avg_turnovers = []

for idx, row in df_features.iterrows():
    matchup = row["matchup"]
    player_team = row["team_abbreviation"]
    if " vs. " in matchup:
        opponent_team = matchup.split(" vs. ")[1]
    else:
        opponent_team = matchup.split(" @ ")[1]

    opp_games = df_features[
        (df_features["team_abbreviation"] == opponent_team)
        & (df_features["game_date"] < row["game_date"])
    ]
    last5 = opp_games.tail(5)

    opponent_avg_points_allowed.append(last5["points"].mean() if not last5.empty else 0)
    opponent_avg_blocks.append(last5["blocks"].mean() if not last5.empty else 0)
    opponent_avg_steals.append(last5["steals"].mean() if not last5.empty else 0)
    opponent_avg_turnovers.append(last5["turnovers"].mean() if not last5.empty else 0)

df_features["opponent_avg_points_allowed_last5"] = opponent_avg_points_allowed
df_features["opponent_avg_blocks_last5"] = opponent_avg_blocks
df_features["opponent_avg_steals_last5"] = opponent_avg_steals
df_features["opponent_avg_turnovers_last5"] = opponent_avg_turnovers

df_features.head(6)

Unnamed: 0,player_id,game_id,game_date,matchup,team_abbreviation,minutes,points,assists,rebounds,steals,...,turnovers,is_home,avg_points_last5,avg_assists_last5,avg_rebounds_last5,teammate_avg_assists_last5,opponent_avg_points_allowed_last5,opponent_avg_blocks_last5,opponent_avg_steals_last5,opponent_avg_turnovers_last5
152,2544,22500059,2025-11-25,LAL vs. LAC,LAL,32.0,25.0,6.0,6.0,1.0,...,3.0,1,,,,1.761905,4.2,0.2,0.4,0.8
151,2544,22500078,2025-11-28,LAL vs. DAL,LAL,34.0,13.0,7.0,5.0,1.0,...,2.0,1,25.0,6.0,6.0,1.880952,11.0,0.2,1.0,1.4
168,2544,22500362,2025-12-07,LAL @ PHI,LAL,34.0,29.0,6.0,7.0,1.0,...,2.0,0,19.0,6.5,5.5,1.904762,14.4,0.4,1.2,1.6
167,2544,22501204,2025-12-10,LAL vs. SAS,LAL,36.0,19.0,8.0,15.0,0.0,...,3.0,1,22.333333,6.333333,6.0,1.904762,1.6,0.0,0.4,0.4
166,2544,22501228,2025-12-14,LAL @ PHX,LAL,36.0,26.0,4.0,3.0,2.0,...,8.0,0,21.5,6.75,8.25,1.857143,1.6,0.0,0.0,0.0
165,2544,22500379,2025-12-18,LAL @ UTA,LAL,33.0,28.0,10.0,7.0,1.0,...,2.0,0,22.4,6.2,7.2,1.857143,14.0,0.2,1.2,1.2


In [19]:
# store the rolling average of minutes as well to avoid data leakage during prediction
# can pass the rolling avg during prediction as real minutes arent known yet
df_features["avg_minutes_last5"] = df_features.groupby("player_id")[
    "minutes"
].transform(lambda x: x.rolling(5, min_periods=1).mean().shift(1))

# drop first row for each player with NaN
df_features = df_features.dropna()
df_features.head(6)

Unnamed: 0,player_id,game_id,game_date,matchup,team_abbreviation,minutes,points,assists,rebounds,steals,...,is_home,avg_points_last5,avg_assists_last5,avg_rebounds_last5,teammate_avg_assists_last5,opponent_avg_points_allowed_last5,opponent_avg_blocks_last5,opponent_avg_steals_last5,opponent_avg_turnovers_last5,avg_minutes_last5
151,2544,22500078,2025-11-28,LAL vs. DAL,LAL,34.0,13.0,7.0,5.0,1.0,...,1,25.0,6.0,6.0,1.880952,11.0,0.2,1.0,1.4,32.0
168,2544,22500362,2025-12-07,LAL @ PHI,LAL,34.0,29.0,6.0,7.0,1.0,...,0,19.0,6.5,5.5,1.904762,14.4,0.4,1.2,1.6,33.0
167,2544,22501204,2025-12-10,LAL vs. SAS,LAL,36.0,19.0,8.0,15.0,0.0,...,1,22.333333,6.333333,6.0,1.904762,1.6,0.0,0.4,0.4,33.333333
166,2544,22501228,2025-12-14,LAL @ PHX,LAL,36.0,26.0,4.0,3.0,2.0,...,0,21.5,6.75,8.25,1.857143,1.6,0.0,0.0,0.0,34.0
165,2544,22500379,2025-12-18,LAL @ UTA,LAL,33.0,28.0,10.0,7.0,1.0,...,0,22.4,6.2,7.2,1.857143,14.0,0.2,1.2,1.2,34.4
164,2544,22500395,2025-12-20,LAL @ LAC,LAL,38.0,36.0,3.0,4.0,2.0,...,0,23.0,7.0,7.4,1.880952,4.0,0.2,0.4,0.8,34.6


## Train the model

In [20]:
# features to feed model
features = [
    "avg_minutes_last5",
    "is_home",
    "avg_points_last5",
    "avg_assists_last5",
    "avg_rebounds_last5",
    "teammate_avg_assists_last5",
    "opponent_avg_points_allowed_last5",
    "opponent_avg_blocks_last5",
    "opponent_avg_steals_last5",
    "opponent_avg_turnovers_last5",
]
X = df_features[features]
y = df_features["points"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [21]:
# train xgboost model
model = XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=5, random_state=12)
model.fit(X_train, y_train)


0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


## Save the model to joblib

In [23]:
# save the trained model
joblib.dump(model, "../models/xgb_points_model.pkl")


['../models/xgb_points_model.pkl']