# 04 — Final Model Training & Winner Prediction

This notebook trains and evaluates the **final NBA team points prediction model**  
using the cleaned dataset (`data/processed/team_games_clean.csv`).

It:

1. Loads processed game-level team data.
2. Performs a chronological 80/20 train–test split.
3. Trains a **Linear Regression** model to predict team points (`pts`).
4. Compares performance to a **team-average baseline** using MAE and RMSE.
5. Converts score predictions into **game winner predictions**.
6. Compares winner accuracy to a **home-team baseline**.

The goal is to provide a minimal, reproducible pipeline for the final report.


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

import joblib

# Paths relative to this notebook (living in notebooks/)
CLEAN_PATH = Path("../data/processed/team_games_clean.csv")

MODEL_DIR = Path("../models")
MODEL_DIR.mkdir(exist_ok=True)
MODEL_PATH = MODEL_DIR / "final_linear_regression.joblib"

PRED_PATH = Path("../data/processed/test_predictions.csv")

print("Cleaned data path:", CLEAN_PATH)


Cleaned data path: ../data/processed/team_games_clean.csv


In [None]:
df = pd.read_csv(CLEAN_PATH, parse_dates=["game_date"])
print("Loaded:", CLEAN_PATH, "| shape:", df.shape)
print("Columns:", df.columns.tolist())

# Optional: peek at the first few rows
df.head()
sorted(df["team_abbreviation"].unique())


Loaded: ../data/processed/team_games_clean.csv | shape: (2460, 35)
Columns: ['season_id', 'team_id', 'team_abbreviation', 'team_name', 'game_id', 'game_date', 'matchup', 'wl', 'pts', 'fg_pct', 'fga', 'fgm', 'fg3m', 'fg3a', 'fg3_pct', 'ftm', 'fta', 'ft_pct', 'oreb', 'dreb', 'reb', 'ast', 'stl', 'blk', 'tov', 'pf', 'plus_minus', 'home', 'rest_days', 'opponent_team_id', 'opponent_team_abbreviation', 'opponent_pts', 'opponent_fg_pct', 'opponent_reb', 'opponent_tov']


['ATL',
 'BKN',
 'BOS',
 'CHA',
 'CHI',
 'CLE',
 'DAL',
 'DEN',
 'DET',
 'GSW',
 'HOU',
 'IND',
 'LAC',
 'LAL',
 'MEM',
 'MIA',
 'MIL',
 'MIN',
 'NOP',
 'NYK',
 'OKC',
 'ORL',
 'PHI',
 'PHX',
 'POR',
 'SAC',
 'SAS',
 'TOR',
 'UTA',
 'WAS']

In [None]:
TARGET = "pts"

CANDIDATE_FEATURES = [
    "home", "rest_days",
    "fg_pct", "fga", "fgm",
    "reb", "ast", "tov",
    "opponent_pts", "opponent_fg_pct", "opponent_reb", "opponent_tov",
]

# Chronological sort
df = df.sort_values("game_date").reset_index(drop=True)

split_idx = int(len(df) * 0.8)
train_df = df.iloc[:split_idx].copy()
test_df = df.iloc[split_idx:].copy()

print(f"Train: {train_df.shape} | Test: {test_df.shape}")
print(f"Train date range: {train_df['game_date'].min()} → {train_df['game_date'].max()}")
print(f"Test date range:  {test_df['game_date'].min()} → {test_df['game_date'].max()}")


Train: (1968, 35) | Test: (492, 35)
Train date range: 2023-10-24 00:00:00 → 2024-03-14 00:00:00
Test date range:  2024-03-14 00:00:00 → 2024-04-14 00:00:00


In [None]:
print("Missing values in candidate features (train):")
print(train_df[CANDIDATE_FEATURES].isna().sum())
print()

# Drop rows with missing values in features (same strategy as 03)
X_train = train_df[CANDIDATE_FEATURES].dropna()
y_train = train_df.loc[X_train.index, TARGET]

X_test = test_df[CANDIDATE_FEATURES].dropna()
y_test = test_df.loc[X_test.index, TARGET]

print(f"Training samples: {len(X_train)}")
print(f"Test samples:     {len(X_test)}")
print(f"Features used:    {CANDIDATE_FEATURES}")


Missing values in candidate features (train):
home               0
rest_days          0
fg_pct             0
fga                0
fgm                0
reb                0
ast                0
tov                0
opponent_pts       0
opponent_fg_pct    0
opponent_reb       0
opponent_tov       0
dtype: int64

Training samples: 1968
Test samples:     492
Features used:    ['home', 'rest_days', 'fg_pct', 'fga', 'fgm', 'reb', 'ast', 'tov', 'opponent_pts', 'opponent_fg_pct', 'opponent_reb', 'opponent_tov']


In [None]:
# Baseline: team-season-average points, computed on training data only
team_avg = train_df.groupby("team_abbreviation")[TARGET].mean().to_dict()

test_with_baseline = test_df.loc[y_test.index].copy()
test_with_baseline["baseline_pred"] = test_with_baseline["team_abbreviation"].map(team_avg)

baseline_mae = mean_absolute_error(test_with_baseline[TARGET], test_with_baseline["baseline_pred"])
baseline_rmse = np.sqrt(mean_squared_error(test_with_baseline[TARGET], test_with_baseline["baseline_pred"]))

print("=== BASELINE MODEL (Team Season Average) ===")
print(f"MAE:  {baseline_mae:.2f}")
print(f"RMSE: {baseline_rmse:.2f}")


=== BASELINE MODEL (Team Season Average) ===
MAE:  10.15
RMSE: 12.54


In [None]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

lr_mae = mean_absolute_error(y_test, y_pred)
lr_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("=== FINAL MODEL: Linear Regression ===")
print(f"MAE:  {lr_mae:.2f}")
print(f"RMSE: {lr_rmse:.2f}")
print()
print(f"Improvement in MAE vs baseline:  {baseline_mae - lr_mae:.2f} points")
print(f"Improvement in RMSE vs baseline: {baseline_rmse - lr_rmse:.2f} points")


=== FINAL MODEL: Linear Regression ===
MAE:  4.08
RMSE: 5.14

Improvement in MAE vs baseline:  6.07 points
Improvement in RMSE vs baseline: 7.39 points


In [None]:
# Attach predictions back to a clean test subset
test_pred_df = test_df.loc[y_test.index].copy()
test_pred_df["predicted_pts"] = y_pred

required_cols = ["game_id", "team_abbreviation", "home", TARGET, "predicted_pts"]
missing = [c for c in required_cols if c not in test_pred_df.columns]
if missing:
    print("WARNING: Missing columns needed for winner prediction:", missing)
else:
    display(test_pred_df[required_cols].head())


Unnamed: 0,game_id,team_abbreviation,home,pts,predicted_pts
1968,22300959,DAL,0,119,117.440002
1969,22300958,PHI,0,105,106.163445
1970,22300958,MIL,1,114,111.710105
1971,22300960,POR,1,93,94.198519
1972,22300957,HOU,1,135,134.481287


In [None]:
if not missing:
    # Actual winners: team with highest ACTUAL pts in each game
    actual_winners = (
        test_pred_df
        .groupby("game_id")
        .apply(lambda g: g.loc[g[TARGET].idxmax(), "team_abbreviation"])
        .rename("actual_winner")
    )

    # Predicted winners: team with highest PREDICTED pts in each game
    predicted_winners = (
        test_pred_df
        .groupby("game_id")
        .apply(lambda g: g.loc[g["predicted_pts"].idxmax(), "team_abbreviation"])
        .rename("predicted_winner")
    )

    winner_df = pd.concat([actual_winners, predicted_winners], axis=1)
    winner_accuracy = (winner_df["actual_winner"] == winner_df["predicted_winner"]).mean()

    print("=== WINNER PREDICTION (Score-based) ===")
    print(f"Accuracy: {winner_accuracy:.3f}")


=== WINNER PREDICTION (Score-based) ===
Accuracy: 0.818


  .apply(lambda g: g.loc[g[TARGET].idxmax(), "team_abbreviation"])
  .apply(lambda g: g.loc[g["predicted_pts"].idxmax(), "team_abbreviation"])


In [None]:
if not missing:
    # Baseline winner: always pick home team
    baseline_home_winner = (
        test_pred_df
        .groupby("game_id")
        .apply(lambda g: g.loc[g["home"].idxmax(), "team_abbreviation"])
        .rename("baseline_winner")
    )

    baseline_accuracy = (baseline_home_winner == actual_winners).mean()

    print("=== BASELINE WINNER PREDICTION (Home Team) ===")
    print(f"Accuracy: {baseline_accuracy:.3f}")

    print("\nComparison:")
    print(f"Score-based model accuracy: {winner_accuracy:.3f}")
    print(f"Home-team baseline accuracy: {baseline_accuracy:.3f}")


=== BASELINE WINNER PREDICTION (Home Team) ===
Accuracy: 0.522

Comparison:
Score-based model accuracy: 0.818
Home-team baseline accuracy: 0.522


  .apply(lambda g: g.loc[g["home"].idxmax(), "team_abbreviation"])


In [None]:
# Save test predictions
if not missing:
    out_cols = ["game_id", "team_abbreviation", "home", TARGET, "predicted_pts"]
    preds_out = test_pred_df[out_cols].copy()
    PRED_PATH.parent.mkdir(parents=True, exist_ok=True)
    preds_out.to_csv(PRED_PATH, index=False)
    print("Saved test predictions to:", PRED_PATH)

# Save trained model
joblib.dump(lr, MODEL_PATH)
print("Saved trained Linear Regression model to:", MODEL_PATH)


Saved test predictions to: ../data/processed/test_predictions.csv
Saved trained Linear Regression model to: ../models/final_linear_regression.joblib


In [None]:
print("=" * 60)
print("FINAL MODEL SUMMARY")
print("=" * 60)
print(f"Target: {TARGET}")
print(f"Train samples: {len(X_train)}")
print(f"Test samples:  {len(X_test)}")
print()
print("Regression performance (test set):")
print(f"- Baseline MAE / RMSE: {baseline_mae:.2f} / {baseline_rmse:.2f}")
print(f"- Linear  MAE / RMSE: {lr_mae:.2f} / {lr_rmse:.2f}")
print()
if not missing:
    print("Winner prediction:")
    print(f"- Score-based model accuracy: {winner_accuracy:.3f}")
    print(f"- Home-team baseline accuracy: {baseline_accuracy:.3f}")
print("=" * 60)


FINAL MODEL SUMMARY
Target: pts
Train samples: 1968
Test samples:  492

Regression performance (test set):
- Baseline MAE / RMSE: 10.15 / 12.54
- Linear  MAE / RMSE: 4.08 / 5.14

Winner prediction:
- Score-based model accuracy: 0.818
- Home-team baseline accuracy: 0.522
