# NBA Match Predictor V5 (HistGradientBoosting)

This notebook implements the final 'Matchup Merge' model with 61% accuracy.

### Improvements over V2 (Ridge):
1. **Matchup Merge**: Joins each team's 10-game rolling stats with those of its next opponent, so the model sees Team OFF vs. Opponent OFF/DEF stats side by side.
2. **HistGradientBoosting**: Non-linear model handles complex interactions.
3. **Dynamic Selection**: Picks the top 50 features automatically, ranked by gradient-boosting feature importance.

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import joblib
import os

# Input CSV and output artifact directory, both relative to the working directory.
DATA_PATH = "../data/nba_games_raw.csv"
MODEL_DIR = "../models/hist_gbm_v5"
# exist_ok=True makes creation idempotent and avoids the check-then-create
# race of testing os.path.exists() first.
os.makedirs(MODEL_DIR, exist_ok=True)

In [None]:
# 1. Load & Clean Data
# Read the raw game log and order the rows by the "date" column.
df = pd.read_csv(DATA_PATH).sort_values("date")

# Discard bookkeeping columns, but only those actually present in the file.
unwanted = [name for name in ("mp", "mp.1", "item") if name in df.columns]
df = df.drop(columns=unwanted)

# Keep only fully populated columns: any column holding at least one null
# is removed entirely (later steps call dropna() on whole rows).
null_counts = df.isnull().sum()
nulls = null_counts[null_counts > 0]
valid_cols = df.columns[~df.columns.isin(nulls.index)]
df = df[valid_cols].copy()

print(f"Cleaned Data: {df.shape}")

In [None]:
# 2. Add Targets & Next Game Info
def add_target(team_df):
    """Return a copy of one team's games with next-game columns appended.

    Each new column is the corresponding source column shifted up by one
    row, so every row describes the game that follows it in ``team_df``'s
    row order. The last row has no following game and gets NaN/None values.

    Args:
        team_df: Rows for a single team containing the columns ``won``,
            ``home``, ``team_opp`` and ``date``.

    Returns:
        A new DataFrame with ``target``, ``home_next``, ``team_opp_next``
        and ``date_next`` added. The input frame is left untouched.
    """
    # assign() builds a new frame instead of writing into the argument;
    # mutating the group handed over by GroupBy.apply is not supported by pandas.
    return team_df.assign(
        target=team_df["won"].shift(-1),
        home_next=team_df["home"].shift(-1),
        team_opp_next=team_df["team_opp"].shift(-1),
        date_next=team_df["date"].shift(-1),
    )

# Build the next-game columns one team at a time. Iterating over the groups
# (instead of GroupBy.apply) guarantees every per-team frame still carries
# the "team" column, independent of how the installed pandas version treats
# grouping columns inside apply(). Each group is copied so add_target never
# writes into data shared with df.
per_team = [add_target(games.copy()) for _, games in df.groupby("team")]
# Put the rows back in df's original order, as apply() did implicitly.
df = pd.concat(per_team).reindex(df.index)

# Drop rows where next game info is NaN (each team's final game).
df = df.dropna(subset=["target", "home_next", "team_opp_next", "date_next"])
df["target"] = df["target"].astype(int)
print(f"Data with Targets: {df.shape}")

In [None]:
# 3. Compute Rolling Averages (Last 10 Games)
# Numeric columns to average; identifier, label and next-game columns are
# filtered out so they are never turned into rolling features.
_non_feature_cols = {
    "season", "target", "won", "team", "team_opp",
    "date", "home_next", "team_opp_next", "date_next",
}
numeric_cols = [
    col
    for col in df.select_dtypes(include=[np.number]).columns
    if col not in _non_feature_cols
]

def find_team_averages(team, *, window=10, cols=None):
    """Return rolling means of selected columns for one team's games.

    Args:
        team: DataFrame holding a single team's rows, in the order the
            rolling window should run over.
        window: Number of consecutive rows averaged per output row.
            Defaults to 10.
        cols: Column labels to average. When omitted, the module-level
            ``numeric_cols`` list is used.

    Returns:
        A DataFrame indexed like ``team`` containing only the averaged
        columns. The first ``window - 1`` rows are NaN because the window
        is not yet full.
    """
    if cols is None:
        # Resolved at call time so the notebook's current list is used.
        cols = numeric_cols
    return team[cols].rolling(window).mean()

# Rolling means are computed within each team so windows never span teams.
team_groups = df.groupby(["team"], group_keys=False)
df_rolling = team_groups.apply(find_team_averages).add_suffix("_10")
roll_cols = list(df_rolling.columns)

# Attach the rolling features next to the raw rows (aligned on the index),
# then discard rows with any missing value, e.g. incomplete windows.
df = pd.concat([df, df_rolling], axis=1).dropna()
print(f"Data with Rolling Stats: {df.shape}")

In [None]:
# 4. The Matchup Merge (Team vs Opponent)
# Each left row (a team before its next game) is paired with the right row of
# the team it will face, i.e. the row whose "team" equals the left row's
# "team_opp_next" and whose "date_next" is the same.
stats_cols = roll_cols + ["team", "date_next"]
right_df = df[stats_cols].copy()

# The right-hand key is renamed to "team_opp_next" so both frames join on
# identically named keys. Joining on left "team_opp_next" / right "team"
# made pandas suffix the right "team" column to "team_opp", which collides
# with the "team_opp" column already present on the left.
# The left "team" column is renamed to "team_team" to keep the column name
# the previous suffix-based merge produced.
combined = pd.merge(
    df.rename(columns={"team": "team_team"}),
    right_df.rename(columns={"team": "team_opp_next"}),
    on=["team_opp_next", "date_next"],
    suffixes=("_team", "_opp"),
)
combined = combined.dropna()
print(f"Final Merged Data Shape: {combined.shape}")

In [None]:
# 5. Feature Selection (Top 50)
# Candidate features: every rolling-average column plus the next-game home flag.
predictors = [col for col in combined.columns if col == "home_next" or "_10" in col]

# Positional split: the first 85% of rows train, the last 15% test.
split_idx = int(len(combined) * 0.85)
train_df, test_df = combined.iloc[:split_idx], combined.iloc[split_idx:]

X_train, y_train = train_df[predictors], train_df["target"]
X_test, y_test = test_df[predictors], test_df["target"]

# Scaling parameters come from the training rows only and are then applied
# unchanged to the test rows.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Gradient Boosting for Feature Importance
print("Selecting Best Features...")
sel = GradientBoostingClassifier(n_estimators=50, max_depth=3, random_state=42)
sel.fit(X_train_scaled, y_train)

# Rank features from most to least important and keep the first top_k.
importances = sel.feature_importances_
top_k = 50
ranked_indices = np.argsort(importances)[::-1]
top_indices = ranked_indices[:top_k]
selected_predictors = [predictors[idx] for idx in top_indices]

print(f"Selected {len(selected_predictors)} Predictors.")
print(selected_predictors[:10])

In [None]:
# 6. Final Training: HistGradientBoosting
# A fresh scaler is fitted on the selected training columns only, so the
# saved scaler matches exactly the feature set the model consumes.
scaler_final = MinMaxScaler().fit(train_df[selected_predictors])
X_train_final = scaler_final.transform(train_df[selected_predictors])
X_test_final = scaler_final.transform(test_df[selected_predictors])

model = HistGradientBoostingClassifier(
    max_iter=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
)
model.fit(X_train_final, y_train)

# Score on the held-out rows (the last 15% of the merged frame).
preds = model.predict(X_test_final)
acc = accuracy_score(y_test, preds)
print(f"Validation Accuracy: {acc:.4f}")

In [None]:
# 7. Save Artifacts
# The model, its scaler and the ordered feature list are persisted together;
# all three are needed to reproduce predictions.
artifacts = {
    "model_v5.pkl": model,
    "scaler_v5.pkl": scaler_final,
    "predictors_v5.pkl": selected_predictors,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, f"{MODEL_DIR}/{filename}")
print("Model saved!")