# 03 — 2026 Projections + Market Comparison

This notebook:
1. Fits the final model on historical data
2. Applies it to 2026 team-level inputs (aggregated from player projections)
3. Compares outputs to sportsbook win totals (O/U)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [None]:
# Fit the final ridge pipeline on the historical leaderboard export.
train = pd.read_csv(r"/mnt/data/fangraphs-leaderboards(21).csv")

# Target: wins divided by 3.
# NOTE(review): the divisor suggests the export aggregates three seasons and
# this converts to a per-season figure -- confirm against the data source.
train["wins_ps"] = train["wins"].div(3.0)

# Counting stats receive the same /3 scaling; columns absent from the export
# are skipped rather than raising.
totals_cols = ['G','PA','HR','R','RBI','SB','BsR','Off','Def','WAR','1B','2B','3B']
for c in (name for name in totals_cols if name in train.columns):
    train[c] = train[c].div(3.0)

# Map the export's column headers onto the lowercase names used as model features.
rename_map = {
    "WAR": "war",
    "ERA": "era",
    "WHIP": "whip",
    "K-BB%": "k-bb",
    "OBP": "obp",
    "R": "runs",
    "SLG": "slg"
}
train = train.rename(columns=rename_map)

# Design matrix and target.
features = ["war","era","whip","k-bb","obp","runs","slg"]
X = train[features]
y = train["wins_ps"]

# Median-impute -> standardise -> ridge, with alpha chosen by RidgeCV from
# 100 log-spaced candidates between 1e-3 and 1e3.
pipeline_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("ridge", RidgeCV(alphas=np.logspace(-3, 3, 100))),
]
model = Pipeline(pipeline_steps)
model.fit(X, y)


In [None]:
# Score the 2026 team-level inputs with the fitted pipeline.
# The workbook must already carry the lowercase feature columns listed in `features`.
proj = pd.read_excel(r"/mnt/data/input.xlsx")
proj = proj.assign(proj_wins=model.predict(proj[features]))

# Ten highest projected win totals.
proj.sort_values(by="proj_wins", ascending=False).iloc[:10]


In [None]:
# Sportsbook 2026 win totals (O/U)
# Replace these values if you want to benchmark against a different book or date.

sportsbook = pd.DataFrame({
    "team": ["LAD","NYY","PHI","TOR","SEA","CHC","ATL","BOS","MIL","DET","NYM","SDP","BAL","HOU","CIN","TEX","SFG","KCR","CLE","ARI","TBR","MIN","CHW","PIT","STL","MIA","WSN","ATH","COL","LAA"],
    "sportsbook_ou": [99.5,93.5,92.5,91.5,90.5,88.5,87.5,87.5,87.5,86.5,86.5,85.5,84.5,84.5,83.5,83.5,82.5,81.5,80.5,78.5,78.5,75.5,74.5,71.5,71.5,70.5,69.5,68.5,54.5,62.5]
})

# Join on a normalised key (trimmed, upper-cased) so that a casing or whitespace
# mismatch between the two sources cannot silently drop a team from the inner
# join. The `team` column kept in `df` is the one from `proj`, unchanged.
proj_keyed = proj.assign(_team_key=proj["team"].astype(str).str.strip().str.upper())
book_keyed = (
    sportsbook
    .assign(_team_key=sportsbook["team"].str.strip().str.upper())
    .drop(columns="team")
)

# validate="many_to_one" raises if a team appears twice in the sportsbook table
# (e.g. after a bad manual edit of the lists above).
df = (
    proj_keyed
    .merge(book_keyed, on="_team_key", how="inner", validate="many_to_one")
    .drop(columns="_team_key")
)

# Surface anything the inner join discarded, since it would otherwise vanish
# from the correlations, the plot and the exported table without notice.
unmatched_proj = sorted(set(proj_keyed["_team_key"]) - set(book_keyed["_team_key"]))
unmatched_book = sorted(set(book_keyed["_team_key"]) - set(proj_keyed["_team_key"]))
if unmatched_proj or unmatched_book:
    print(
        "WARNING: teams dropped by merge -- "
        f"in projections only: {unmatched_proj}; in sportsbook only: {unmatched_book}"
    )

# Agreement between model and market: linear (Pearson) and rank (Spearman).
pearson = df["proj_wins"].corr(df["sportsbook_ou"], method="pearson")
spearman = df["proj_wins"].corr(df["sportsbook_ou"], method="spearman")

{"pearson_r": pearson, "spearman_rho": spearman}


In [None]:
# Scatterplot: model projection (y) against sportsbook line (x), one point per team.
fig, ax = plt.subplots(figsize=(7,7))
ax.scatter(df["sportsbook_ou"], df["proj_wins"])

# y = x reference line spanning the combined range of both series;
# points above it are teams the model rates higher than the market.
mn = min(df["sportsbook_ou"].min(), df["proj_wins"].min())
mx = max(df["sportsbook_ou"].max(), df["proj_wins"].max())
ax.plot([mn, mx], [mn, mx])

ax.set_xlabel("Sportsbook Win Total (O/U)")
ax.set_ylabel("Model Projected Wins")
ax.set_title("Model Projected Wins vs Sportsbook Win Totals")
fig.tight_layout()
plt.show()


In [None]:
# Difference table: positive values mean the model projects more wins than the market.
df["diff_model_minus_market"] = df["proj_wins"].sub(df["sportsbook_ou"])

# Rank by the size of the disagreement regardless of sign, largest first.
report_cols = ["team","proj_wins","sportsbook_ou","diff_model_minus_market"]
out = df[report_cols].sort_values(
    by="diff_model_minus_market",
    key=pd.Series.abs,
    ascending=False,
)
out


In [None]:
# Write the comparison table to a CSV in the working directory (row index omitted)
# and echo the path as the cell output.
out_path = "model_vs_sportsbook.csv"
out.to_csv(path_or_buf=out_path, index=False)
out_path
