# 03 — Baseline Regression (Predict Team Points)

**Target:** `pts`  
**Split:** chronological (first 80% train → last 20% test)  
**Metrics:** MAE, RMSE vs. baseline (season average)

**Plan:**
1. Select features (e.g., `home`, `rest_days`, `fg_pct`, `reb`, `tov`, `opponent_*`)
2. Chronological split
3. Baseline (each team's average points over the training split), then Linear Regression and Ridge
4. Evaluate & plot Predicted vs Actual


In [1]:
# Setup & data load
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Input data and figure output locations, relative to the notebook's directory.
CLEAN_PATH = Path("../data/processed/team_games_clean.csv")
IMG_DIR = Path("../img")
IMG_DIR.mkdir(parents=True, exist_ok=True)  # figures are saved here by later cells

df = pd.read_csv(CLEAN_PATH, parse_dates=["game_date"])
print("Loaded:", CLEAN_PATH, "| shape:", df.shape)

# Target and features
TARGET = "pts"
# NOTE(review): several of these look like same-game box-score stats
# (e.g. fgm / fg_pct / opponent_pts). If they are only known after the game
# has been played, the models below explain points rather than forecast
# them -- confirm against the data dictionary.
CANDIDATE_FEATURES = [
    "home", "rest_days",
    "fg_pct", "fga", "fgm",
    "reb", "ast", "tov",
    "opponent_pts", "opponent_fg_pct", "opponent_reb", "opponent_tov",
]

# Chronological 80/20 split: sort by date, then cut at the 80% row.
df = df.sort_values("game_date").reset_index(drop=True)
split_idx = int(len(df) * 0.8)
# .copy() gives each split its own data, so columns added to a split later
# (e.g. baseline predictions) are not written into a slice of `df`.
train_df = df.iloc[:split_idx].copy()
test_df = df.iloc[split_idx:].copy()

print(f"Train: {train_df.shape} | Test: {test_df.shape}")
print(f"Train date range: {train_df['game_date'].min()} to {train_df['game_date'].max()}")
print(f"Test date range: {test_df['game_date'].min()} to {test_df['game_date'].max()}")

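> **Note:** the saved output of this cell was `ModuleNotFoundError: No module named 'sklearn'`, so the notebook has not been executed. Install scikit-learn (`pip install scikit-learn`) and re-run all cells before relying on any result below.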

## 1) Baseline Model (Team Season Average)

In [None]:
# Baseline: predict every game with the team's mean points over the training
# split only, so no test information enters the baseline.
team_avg = train_df.groupby('team_abbreviation')[TARGET].mean().to_dict()

# A team that appears only in the test split has no entry in team_avg and would
# map to NaN, which makes the metric functions raise. Fall back to the overall
# training mean for those rows.
overall_avg = train_df[TARGET].mean()

# assign() returns a new frame instead of writing a column into what may be a
# slice of `df`; `test_df` still ends up with a 'baseline_pred' column.
test_df = test_df.assign(
    baseline_pred=test_df['team_abbreviation'].map(team_avg).fillna(overall_avg)
)

# Evaluate baseline
# NOTE: this is scored on every test row, while the regression cells score only
# rows with complete features; the two sample sets differ if features are missing.
baseline_mae = mean_absolute_error(test_df[TARGET], test_df['baseline_pred'])
baseline_rmse = np.sqrt(mean_squared_error(test_df[TARGET], test_df['baseline_pred']))

print("=== BASELINE MODEL (Team Season Average) ===")
print(f"MAE: {baseline_mae:.2f}")
print(f"RMSE: {baseline_rmse:.2f}")
print()
print("Sample predictions:")
print(test_df[['team_abbreviation', TARGET, 'baseline_pred']].head(10))

## 2) Prepare Features for ML Models

In [None]:
# Report how many values are missing per candidate feature in the training split.
print("Missing values in candidate features:")
print(train_df[CANDIDATE_FEATURES].isna().sum())
print()

# Drop rows that lack any feature OR the target. Dropping on the features alone
# would let a row with a missing target through to fit() and the metrics, which
# reject NaN.
model_columns = CANDIDATE_FEATURES + [TARGET]
train_complete = train_df.dropna(subset=model_columns)
test_complete = test_df.dropna(subset=model_columns)

# Original row indices are preserved, so X/y stay aligned with train_df/test_df.
X_train = train_complete[CANDIDATE_FEATURES]
y_train = train_complete[TARGET]

X_test = test_complete[CANDIDATE_FEATURES]
y_test = test_complete[TARGET]

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"\nFeatures used: {CANDIDATE_FEATURES}")

## 3) Linear Regression Model

In [None]:
# Fit ordinary least squares on the training split (fit() returns the estimator).
lr = LinearRegression().fit(X_train, y_train)

# Score the held-out test split.
y_pred_lr = lr.predict(X_test)
lr_mae = mean_absolute_error(y_test, y_pred_lr)
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_rmse = np.sqrt(lr_mse)

# Positive gain means the model's MAE is lower than the baseline's.
lr_mae_gain = baseline_mae - lr_mae

print("=== LINEAR REGRESSION ===")
print(f"MAE: {lr_mae:.2f}")
print(f"RMSE: {lr_rmse:.2f}")
print(f"Improvement over baseline MAE: {lr_mae_gain:.2f} points")
print()

# Coefficients ordered by absolute size, largest first.
# NOTE: the features are not scaled here, so magnitudes depend on each
# feature's units and are only a rough guide to importance.
coefficient_table = pd.DataFrame(
    {'feature': CANDIDATE_FEATURES, 'coefficient': lr.coef_}
)
feature_importance = coefficient_table.sort_values(
    'coefficient', key=np.abs, ascending=False
)

print("Top feature coefficients:")
print(feature_importance.head(10))

## 4) Ridge Regression Model

In [None]:
# Fit an L2-regularised linear model (alpha=1.0) on the training split.
ridge = Ridge(alpha=1.0).fit(X_train, y_train)

# Score the held-out test split.
y_pred_ridge = ridge.predict(X_test)
ridge_mae = mean_absolute_error(y_test, y_pred_ridge)
ridge_mse = mean_squared_error(y_test, y_pred_ridge)
ridge_rmse = np.sqrt(ridge_mse)

# Positive gain means the model's MAE is lower than the baseline's.
ridge_mae_gain = baseline_mae - ridge_mae

print("=== RIDGE REGRESSION (alpha=1.0) ===")
print(f"MAE: {ridge_mae:.2f}")
print(f"RMSE: {ridge_rmse:.2f}")
print(f"Improvement over baseline MAE: {ridge_mae_gain:.2f} points")

## 5) Model Comparison

In [None]:
# Collect the unrounded metrics first; rounding is for display only.
raw_results = pd.DataFrame({
    'Model': ['Baseline (Team Avg)', 'Linear Regression', 'Ridge Regression'],
    'MAE': [baseline_mae, lr_mae, ridge_mae],
    'RMSE': [baseline_rmse, lr_rmse, ridge_rmse],
})
results = raw_results.round(2)

print("=== MODEL COMPARISON ===")
print(results)
print()

# Pick the best model from the unrounded MAE: after rounding to two decimals,
# near-ties collapse and idxmin() would return whichever row comes first.
best_model_idx = raw_results['MAE'].idxmin()
print(f"Best model: {results.loc[best_model_idx, 'Model']}")
print(f"Best MAE: {results.loc[best_model_idx, 'MAE']:.2f}")

# Visualize comparison: one bar chart per metric, side by side.
bar_colors = ['gray', 'skyblue', 'lightcoral']
metric_panels = [
    ('MAE', 'Mean Absolute Error (MAE)'),
    ('RMSE', 'Root Mean Squared Error (RMSE)'),
]

plt.figure(figsize=(10, 5))
for panel_pos, (metric, panel_title) in enumerate(metric_panels, start=1):
    plt.subplot(1, 2, panel_pos)
    plt.bar(results['Model'], results[metric], color=bar_colors)
    plt.title(panel_title)
    plt.ylabel('Points')
    plt.xticks(rotation=15, ha='right')
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(IMG_DIR / 'model_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

## 6) Predicted vs Actual Visualization

In [None]:
# Predicted-vs-actual scatter for each regression model, side by side.
# The dashed red diagonal marks perfect predictions (predicted == actual).
diagonal = [y_test.min(), y_test.max()]

# (title label, predictions, MAE, RMSE, extra scatter kwargs) per panel.
scatter_panels = [
    ('Linear Regression', y_pred_lr, lr_mae, lr_rmse, {}),
    ('Ridge Regression', y_pred_ridge, ridge_mae, ridge_rmse, {'color': 'coral'}),
]

plt.figure(figsize=(12, 5))
for panel_pos, (label, preds, mae, rmse, scatter_kw) in enumerate(scatter_panels, start=1):
    plt.subplot(1, 2, panel_pos)
    plt.scatter(y_test, preds, alpha=0.5, s=20, **scatter_kw)
    plt.plot(diagonal, diagonal, 'r--', lw=2)
    plt.xlabel('Actual Points')
    plt.ylabel('Predicted Points')
    plt.title(f'{label}\nMAE: {mae:.2f}, RMSE: {rmse:.2f}')
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(IMG_DIR / 'predicted_vs_actual.png', dpi=150, bbox_inches='tight')
plt.show()

## 7) Error Analysis

In [None]:
# Residuals of the Linear Regression model: positive means the model
# under-predicted (actual > predicted), negative means it over-predicted.
errors = y_test - y_pred_lr

# Test rows that were actually scored, with prediction and residual attached.
test_df_clean = test_df.loc[y_test.index].assign(
    prediction=y_pred_lr,
    error=errors,
)

# Mean residual per team, sorted from most over-predicted to most under-predicted.
team_errors = (
    test_df_clean
    .groupby('team_abbreviation')['error']
    .mean()
    .sort_values()
)
bar_positions = range(len(team_errors))

plt.figure(figsize=(12, 5))

# Left panel: histogram of residuals with a zero reference line.
plt.subplot(1, 2, 1)
plt.hist(errors, bins=30, edgecolor='black', alpha=0.7)
plt.xlabel('Prediction Error (Actual - Predicted)')
plt.ylabel('Frequency')
plt.title('Distribution of Prediction Errors')
plt.axvline(x=0, color='red', linestyle='--', linewidth=2)
plt.grid(True, alpha=0.3)

# Right panel: average residual by team.
plt.subplot(1, 2, 2)
plt.barh(bar_positions, team_errors.values)
plt.yticks(bar_positions, team_errors.index, fontsize=8)
plt.xlabel('Average Prediction Error')
plt.title('Average Error by Team')
plt.axvline(x=0, color='red', linestyle='--', linewidth=1)
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(IMG_DIR / 'error_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

# The most negative residual is the largest over-estimate; the most positive
# is the largest under-estimate.
error_stats = [
    ("Mean error", errors.mean()),
    ("Std error", errors.std()),
    ("Max overestimation", errors.min()),
    ("Max underestimation", errors.max()),
]
print("Error statistics:")
for stat_label, stat_value in error_stats:
    print(f"{stat_label}: {stat_value:.2f}")

## 8) Summary and Conclusions

In [None]:
# Final report: data sizes, the metrics table, and findings derived from the
# computed numbers rather than asserted up front.
lr_gain = baseline_mae - lr_mae
ridge_gain = baseline_mae - ridge_mae

# State only what the metrics support: a model "outperforms" the baseline
# when its MAE is strictly lower.
models_beating_baseline = [
    model_name
    for model_name, gain in (("Linear Regression", lr_gain), ("Ridge Regression", ridge_gain))
    if gain > 0
]
if len(models_beating_baseline) == 2:
    headline_finding = "Both ML models outperform the baseline by reducing MAE"
elif models_beating_baseline:
    headline_finding = f"Only {models_beating_baseline[0]} outperforms the baseline on MAE"
else:
    headline_finding = "Neither ML model outperforms the baseline on MAE"

print("=" * 60)
print("REGRESSION MODELING SUMMARY")
print("=" * 60)
print()
print("Target: Predict team points (pts)")
print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"Features used: {len(CANDIDATE_FEATURES)}")
print()
print("Model Performance (Test Set):")
print("-" * 60)
print(results.to_string(index=False))
print()
print("Key Findings:")
print(f"1. {headline_finding}")
print(f"2. Linear Regression MAE: {lr_mae:.2f} (improvement: {lr_gain:.2f} points)")
print(f"3. Ridge Regression MAE: {ridge_mae:.2f} (improvement: {ridge_gain:.2f} points)")
print()
print("Top predictive features:")
print(feature_importance.head(5).to_string(index=False))
print()
print("Next Steps:")
print("- Could explore Random Forest or Gradient Boosting models")
print("- Feature engineering: rolling averages, opponent strength metrics")
print("- Use predictions to derive game winners (classification)")
print("=" * 60)