# 02 — Model Training & Validation

This notebook trains an interpretable win-total model and evaluates performance using out-of-sample cross-validation.

**Model**: Ridge Regression (regularized linear regression)

Ridge regression helps stabilize estimates when features are correlated (e.g., ERA, WHIP, K-BB%).


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [None]:
# Load and prep historical data
# NOTE(review): absolute, environment-specific path — consider moving it to a
# configurable DATA_DIR so the notebook runs on other machines.
train = pd.read_csv(r"/mnt/data/fangraphs-leaderboards(21).csv")

# Divisor used to convert counting stats into per-season averages.
# Presumably the export aggregates three seasons — TODO confirm against the data pull.
N_SEASONS = 3.0

# Target: wins per season.
train["wins_ps"] = train["wins"] / N_SEASONS

# Counting stats are rescaled to a per-season basis; rate stats (ERA, WHIP,
# OBP, SLG, K-BB%) are not in this list and are left untouched.
totals_cols = ['G','PA','HR','R','RBI','SB','BsR','Off','Def','WAR','1B','2B','3B']
present_totals = [c for c in totals_cols if c in train.columns]  # skip absent columns
train = train.assign(**{c: train[c] / N_SEASONS for c in present_totals})

# Normalise the source headers to the lowercase names used by the model.
rename_map = {
    "WAR": "war",
    "ERA": "era",
    "WHIP": "whip",
    "K-BB%": "k-bb",
    "OBP": "obp",
    "R": "runs",
    "SLG": "slg"
}
train = train.rename(columns=rename_map)

features = ["war","era","whip","k-bb","obp","runs","slg"]

# Fail early with a readable message instead of an opaque indexing error.
missing_features = [c for c in features if c not in train.columns]
if missing_features:
    raise KeyError(f"Feature columns missing after rename: {missing_features}")

X = train[features]
y = train["wins_ps"]

# Median-impute -> standardise -> ridge, with alpha chosen by RidgeCV over a
# log-spaced grid from 1e-3 to 1e3. Scaling sits inside the pipeline so it is
# re-fitted on whatever data the pipeline is fitted on.
model = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("ridge", RidgeCV(alphas=np.logspace(-3, 3, 100)))
])

model


In [None]:
# 5-fold cross-validated predictions
# Shuffled folds with a fixed seed keep the reported metrics reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
# One out-of-fold prediction per row of X.
y_pred = cross_val_predict(model, X, y, cv=cv)

mae = mean_absolute_error(y, y_pred)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE gives the same RMSE on every version.
rmse = np.sqrt(mean_squared_error(y, y_pred))
r2 = r2_score(y, y_pred)

{"MAE": mae, "RMSE": rmse, "R2": r2}


In [None]:
# Residual diagnostics: out-of-fold error per row (actual minus predicted)
resid = y - y_pred

# Residuals against predictions, with a zero reference line to expose bias
fig, ax = plt.subplots(figsize=(7, 4))
ax.scatter(y_pred, resid)
ax.axhline(0, color="black", linewidth=0.8)
ax.set_title("Residuals vs Predicted Wins (CV)")
ax.set_xlabel("Predicted wins (per season)")
ax.set_ylabel("Residual (actual - predicted)")
fig.tight_layout()
plt.show()

# Summary statistics of the residuals, shown as the cell's output
resid.describe()


In [None]:
# Fit final model on all training data (for projections)
# Fits the pipeline on the full X / y (no folds held out). As the last
# expression in the cell, the value returned by fit() is what gets displayed.
model.fit(X, y)
