imports

In [1]:
import os
os.environ['ODDS_API_KEY'] = '1dc699f4866af6cf86156c64bf5b511b'

In [2]:
# Imports
!pip -q install xgboost==2.0.3 scikit-learn pandas numpy matplotlib
import json, zipfile, io, pandas as pd, numpy as np
from datetime import datetime, timezone
from dateutil.parser import isoparse
from datetime import datetime
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, brier_score_loss
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression as PlattLR
from xgboost import XGBClassifier

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.1/297.1 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h

# Models


Run models and save lr_cal, rf_cal, xgb_cal

In [3]:
#  Adjust if betting lines are used to train
USE_BETTING_LINES = True

#  Load dataset
HISTORICAL_CSV = "/content/nba_games_with_odds_2010on.csv"
games = pd.read_csv(HISTORICAL_CSV, parse_dates=["date"])
games = games.sort_values("date").reset_index(drop=True)
games["home_win"] = (games["home_points"] > games["away_points"]).astype(int)

#  Split data
cutoff_date = games["date"].quantile(0.8)
train_games = games[games["date"] < cutoff_date].copy()
valid_games = games[games["date"] >= cutoff_date].copy()
print(f"Train games: {len(train_games)}, Valid games: {len(valid_games)}, cutoff: {cutoff_date.date()}")

#  Rolling features
def make_team_frame(df):
    home = df[["date","home_team","home_points","away_points"]].rename(
        columns={"home_team":"team","home_points":"points_for","away_points":"points_against"}
    ); home["is_home"]=1
    away = df[["date","away_team","away_points","home_points"]].rename(
        columns={"away_team":"team","away_points":"points_for","home_points":"points_against"}
    ); away["is_home"]=0
    team_df = pd.concat([home,away]).sort_values(["team","date"]).reset_index(drop=True)
    team_df["margin"] = team_df["points_for"] - team_df["points_against"]
    return team_df

def add_rolls(team_df, windows=[3,5]):
    grp = team_df.groupby("team", group_keys=False)
    team_df["games_played"] = grp.cumcount()
    for col in ["points_for","points_against","margin"]:
        team_df[f"{col}_exp"] = grp[col].transform(lambda s: s.shift(1).expanding().mean())
    for w in windows:
        team_df[f"pf_{w}g"] = grp["points_for"].transform(lambda s: s.shift(1).rolling(w, min_periods=1).mean())
        team_df[f"pa_{w}g"] = grp["points_against"].transform(lambda s: s.shift(1).rolling(w, min_periods=1).mean())
        team_df[f"margin_{w}g"] = grp["margin"].transform(lambda s: s.shift(1).rolling(w, min_periods=1).mean())
    team_df["rest_days"] = grp["date"].diff().dt.days
    team_df["b2b"] = (team_df["rest_days"]==1).astype(float)
    return team_df

#  Adds rest/travel
def build_leakproof_features(train_games, valid_games, windows=[3,5]):
    all_games = pd.concat([train_games, valid_games]).sort_values("date").reset_index(drop=True)
    all_games["home_win"] = (all_games["home_points"] > all_games["away_points"]).astype(int)

    team_df = make_team_frame(all_games)
    team_df = add_rolls(team_df, windows=windows)

    H = team_df[team_df["is_home"]==1].add_prefix("H_")
    A = team_df[team_df["is_home"]==0].add_prefix("A_")

    full = (all_games
            .merge(H, left_on=["date","home_team"], right_on=["H_date","H_team"], how="left")
            .merge(A, left_on=["date","away_team"], right_on=["A_date","A_team"], how="left"))

    full["rest_diff"] = full["H_rest_days"] - full["A_rest_days"]
    full["b2b_diff"]  = full["H_b2b"] - full["A_b2b"]
    full["home_on_b2b"] = full["H_b2b"]
    full["away_on_b2b"] = full["A_b2b"]

    num_cols = full.select_dtypes(include=[np.number]).columns.tolist()
    y = full["home_win"].values
    drop_target = ["home_points","away_points","home_win"]
    features = [c for c in num_cols if c not in drop_target]

    valid_start = valid_games["date"].min()
    X_train = full.loc[full["date"] < valid_start, features].replace([np.inf,-np.inf], np.nan)
    y_train = y[full["date"] < valid_start]
    X_valid = full.loc[full["date"] >= valid_start, features].replace([np.inf,-np.inf], np.nan)
    y_valid = y[full["date"] >= valid_start]

    return full, X_train, y_train, X_valid, y_valid

full_base, X_train, y_train, X_valid, y_valid = build_leakproof_features(train_games, valid_games, windows=[3,5])

#  Drop all-NaN columns
nan_cols = X_train.columns[X_train.isna().all()].tolist()
if nan_cols:
    print(f"Dropping {len(nan_cols)} all-NaN columns:", nan_cols[:8], "...")
    X_train = X_train.drop(columns=nan_cols)
    X_valid = X_valid.drop(columns=[c for c in nan_cols if c in X_valid.columns])

# Add or drop bet lines
line_cols = ["spread_close","home_moneyline","away_moneyline"]
if not USE_BETTING_LINES:
    X_train = X_train.drop(columns=[c for c in line_cols if c in X_train.columns], errors="ignore")
    X_valid = X_valid.drop(columns=[c for c in line_cols if c in X_valid.columns], errors="ignore")
    print("Betting lines dropped for evaluation.")
else:
    kept = [c for c in line_cols if c in X_train.columns]
    print("Betting lines kept for production:", kept)

#  Drop direct outcome columns
leak_like = [c for c in X_train.columns if any(k in c.lower() for k in ["margin","points_for","points_against"])]
if leak_like:
    print(f"Dropping {len(leak_like)} direct outcome columns:", leak_like[:10], "...")
    X_train = X_train.drop(columns=leak_like, errors="ignore")
    X_valid = X_valid.drop(columns=[c for c in leak_like if c in X_valid.columns], errors="ignore")
else:
    print("No direct outcome columns found.")

#  Auto-detect any remaining perfect-correlation leaks
for name, X, y in [("train", X_train, y_train), ("valid", X_valid, y_valid)]:
    corrs = pd.concat([pd.Series(y, name="home_win"), X], axis=1)\
               .corr(numeric_only=True)["home_win"].abs().sort_values(ascending=False)
    perfect = [c for c in corrs.index if c != "home_win" and corrs[c] >= 0.9999]
    if perfect:
        print(f"Perfect-correlation leak columns in {name} set:", perfect)
        X_train = X_train.drop(columns=perfect, errors="ignore")
        X_valid = X_valid.drop(columns=perfect, errors="ignore")

Train games: 12691, Valid games: 3175, cutoff: 2020-09-07
Betting lines kept for production: ['spread_close', 'home_moneyline', 'away_moneyline']
Dropping 16 direct outcome columns: ['H_points_for', 'H_points_against', 'H_margin', 'H_points_for_exp', 'H_points_against_exp', 'H_margin_exp', 'H_margin_3g', 'H_margin_5g', 'A_points_for', 'A_points_against'] ...


In [4]:
#  Models
lr_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
    ("clf", LogisticRegression(max_iter=2000, n_jobs=-1))
])
rf_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("clf", RandomForestClassifier(
        n_estimators=600, max_depth=None, min_samples_leaf=2,
        random_state=42, n_jobs=-1))
])
xgb_base = XGBClassifier(
    n_estimators=600, learning_rate=0.05, max_depth=6,
    subsample=0.9, colsample_bytree=0.9,
    reg_lambda=1.0, random_state=42, eval_metric="logloss",
    tree_method="hist", use_label_encoder=False
)

#  Calibrate
tscv = TimeSeriesSplit(n_splits=5)
def fit_and_calibrate(estimator, name):
    cal = CalibratedClassifierCV(estimator, method="sigmoid", cv=tscv)
    cal.fit(X_train, y_train)
    print(f"{name} fitted & calibrated.")
    return cal

def fit_xgb_with_platt(model, X, y, cv):
    oof = np.zeros(len(X), dtype=float)
    for tr_idx, va_idx in cv.split(X):
        Xm, Xv, ym, yv = X.iloc[tr_idx], X.iloc[va_idx], y[tr_idx], y[va_idx]
        m = clone(model); m.fit(Xm, ym)
        oof[va_idx] = m.predict_proba(Xv)[:,1]
    platt = PlattLR(max_iter=1000); platt.fit(oof.reshape(-1,1), y)
    model.fit(X, y)
    class CalibratedXGB:
        def __init__(self, base, platt): self.base, self.platt = base, platt
        def predict_proba(self, X):
            p = self.base.predict_proba(X)[:,1]
            pc = self.platt.predict_proba(p.reshape(-1,1))[:,1]
            return np.column_stack([1-pc, pc])
    print("XGBoost fitted & Platt-calibrated.")
    return CalibratedXGB(model, platt)

lr_cal  = fit_and_calibrate(lr_pipe, "Logistic Regression")
rf_cal  = fit_and_calibrate(rf_pipe, "Random Forest")
xgb_cal = fit_xgb_with_platt(xgb_base, X_train, y_train, tscv)

#  Evaluation
def evaluate(model, X, y):
    p = model.predict_proba(X)[:,1]
    pred = (p >= 0.5).astype(int)
    return {
        "accuracy": accuracy_score(y, pred),
        "log_loss": log_loss(y, p, labels=[0,1]),
        "brier": brier_score_loss(y, p),
        "roc_auc": roc_auc_score(y, p),
    }, p

models = [("LR", lr_cal), ("RF", rf_cal), ("XGB", xgb_cal)]
preds = {}
for name, model in models:
    metrics, p = evaluate(model, X_valid, y_valid)
    preds[name] = p
    print(f"\n{name} metrics:")
    for k,v in metrics.items():
        print(f"  {k:>8}: {v:.4f}")

#  Soft-vote ensemble
p_ens = np.mean(list(preds.values()), axis=0)
pred_ens = (p_ens >= 0.5).astype(int)
conf = np.where(pred_ens == 1, p_ens, 1 - p_ens)
ens_metrics = {
    "accuracy": accuracy_score(y_valid, pred_ens),
    "log_loss": log_loss(y_valid, p_ens, labels=[0,1]),
    "brier": brier_score_loss(y_valid, p_ens),
    "roc_auc": roc_auc_score(y_valid, p_ens),
}
print("\nEnsemble metrics:")
for k,v in ens_metrics.items():
    print(f"  {k:>8}: {v:.4f}")

Logistic Regression fitted & calibrated.
Random Forest fitted & calibrated.
XGBoost fitted & Platt-calibrated.

LR metrics:
  accuracy: 0.6595
  log_loss: 0.6284
     brier: 0.2194
   roc_auc: 0.7004

RF metrics:
  accuracy: 0.6419
  log_loss: 0.6378
     brier: 0.2234
   roc_auc: 0.6826

XGB metrics:
  accuracy: 0.5868
  log_loss: 0.6610
     brier: 0.2344
   roc_auc: 0.6668

Ensemble metrics:
  accuracy: 0.6532
  log_loss: 0.6313
     brier: 0.2203
   roc_auc: 0.6997


# Prediction

In [5]:
# === Predictions using new season data (rolling features) +  betting lines for today ===
import pandas as pd, numpy as np

SEASON_CSV   = "/content/2025_season_data.csv"

FIXTURES_CSV = "/content/todays_games.csv"

USE_BETTING_LINES = True

FALLBACK_HISTORICAL_CSV = "/content/nba_games_with_odds_2010on.csv"

#  HELPERS
def _pick(cols, candidates):
    for c in candidates:
        if c in cols: return c
    return None

try:
    make_team_frame
    add_rolls
except NameError:
    def make_team_frame(df):
        home = df[["date","home_team","home_points","away_points"]].rename(
            columns={"home_team":"team","home_points":"points_for","away_points":"points_against"}
        ); home["is_home"]=1
        away = df[["date","away_team","away_points","home_points"]].rename(
            columns={"away_team":"team","away_points":"points_for","home_points":"points_against"}
        ); away["is_home"]=0
        team_df = pd.concat([home,away]).sort_values(["team","date"]).reset_index(drop=True)
        team_df["margin"] = team_df["points_for"] - team_df["points_against"]
        return team_df

    def add_rolls(team_df, windows=[3,5]):
        grp = team_df.groupby("team", group_keys=False)
        team_df["games_played"] = grp.cumcount()
        for col in ["points_for","points_against","margin"]:
            team_df[f"{col}_exp"] = grp[col].transform(lambda s: s.shift(1).expanding().mean())
        for w in windows:
            team_df[f"pf_{w}g"] = grp["points_for"].transform(lambda s: s.shift(1).rolling(w, min_periods=1).mean())
            team_df[f"pa_{w}g"] = grp["points_against"].transform(lambda s: s.shift(1).rolling(w, min_periods=1).mean())
            team_df[f"margin_{w}g"] = grp["margin"].transform(lambda s: s.shift(1).rolling(w, min_periods=1).mean())
        team_df["rest_days"] = grp["date"].diff().dt.days
        team_df["b2b"] = (team_df["rest_days"]==1).astype(float)
        return team_df

#  Load historical data
hist = pd.read_csv(FALLBACK_HISTORICAL_CSV, parse_dates=["date"])
# ensure cols exist
assert all(c in hist.columns for c in ["date","home_team","away_team","home_points","away_points"]), \
    f"Historical CSV missing required columns: {hist.columns.tolist()}"

season_raw = pd.read_csv(SEASON_CSV)
#  column mapping for season file
cols = [c.lower() for c in season_raw.columns]
date_col = _pick(season_raw.columns, ["date","game_date"])
home_col = _pick(season_raw.columns, ["home_team","home"])
away_col = _pick(season_raw.columns, ["away_team","away"])
hp_col   = _pick(season_raw.columns, ["home_points","home_pts","home_score"])
ap_col   = _pick(season_raw.columns, ["away_points","away_pts","away_score"])

assert date_col and home_col and away_col and hp_col and ap_col, \
    f"Season CSV must include date/home_team/away_team/home_points/away_points. Found: {season_raw.columns.tolist()}"

season = season_raw.rename(columns={
    date_col:"date", home_col:"home_team", away_col:"away_team",
    hp_col:"home_points", ap_col:"away_points"
})
season["date"] = pd.to_datetime(season["date"])
season = season.sort_values("date").reset_index(drop=True)

# merge: append season results to hist, drop duplicates
hist = pd.concat([hist, season], ignore_index=True)
hist = hist.sort_values("date").drop_duplicates(subset=["date","home_team","away_team"], keep="last").reset_index(drop=True)

team_df_all = make_team_frame(hist)
team_df_all = add_rolls(team_df_all, windows=[3,5])
H_all = team_df_all[team_df_all["is_home"]==1].add_prefix("H_")
A_all = team_df_all[team_df_all["is_home"]==0].add_prefix("A_")

#  Load fixtures
fixtures = pd.read_csv(FIXTURES_CSV)
date_col = _pick(fixtures.columns, ["date","game_date"])
home_col = _pick(fixtures.columns, ["home_team","home"])
away_col = _pick(fixtures.columns, ["away_team","away"])
assert date_col and home_col and away_col, \
    f"Fixtures CSV must include date/home_team/away_team. Found: {fixtures.columns.tolist()}"

fixtures = fixtures.rename(columns={date_col:"date", home_col:"home_team", away_col:"away_team"})
fixtures["date"] = pd.to_datetime(fixtures["date"])
fixtures = fixtures.sort_values("date").reset_index(drop=True)

maybe_lines = {}
for c in ["spread_close","home_moneyline","away_moneyline"]:
    if c in fixtures.columns:
        maybe_lines[c] = c

# Build features
records = []
missing_hist = set()

for _, row in fixtures.iterrows():
    d, h, a = row["date"], str(row["home_team"]), str(row["away_team"])
    H_snap = H_all[(H_all["H_team"]==h) & (H_all["H_date"] < d)].sort_values("H_date").tail(1)
    A_snap = A_all[(A_all["A_team"]==a) & (A_all["A_date"] < d)].sort_values("A_date").tail(1)

    if H_snap.empty: missing_hist.add(h)
    if A_snap.empty: missing_hist.add(a)

    if H_snap.empty:
        H_snap = pd.DataFrame([{c: np.nan for c in H_all.columns}]); H_snap["H_team"]=h; H_snap["H_date"]=pd.NaT
    if A_snap.empty:
        A_snap = pd.DataFrame([{c: np.nan for c in A_all.columns}]); A_snap["A_team"]=a; A_snap["A_date"]=pd.NaT

    base = pd.DataFrame([
        {
            "date": d, "home_team": h, "away_team": a,
            **{k: row[k] for k in maybe_lines}
        }
    ])

    merged = (base
              .merge(H_snap, how="left", left_on=["home_team"], right_on=["H_team"])
              .merge(A_snap, how="left", left_on=["away_team"], right_on=["A_team"]))

    # derived rest/travel features
    merged["rest_diff"]   = merged["H_rest_days"] - merged["A_rest_days"]
    merged["b2b_diff"]    = merged["H_b2b"] - merged["A_b2b"]
    merged["home_on_b2b"] = merged["H_b2b"]
    merged["away_on_b2b"] = merged["A_b2b"]
    records.append(merged)

pred_base = pd.concat(records, ignore_index=True)

if missing_hist:
    print(f"Note: no recent history for {len(missing_hist)} team(s): {sorted(missing_hist)}; features will be imputed.")

assert 'X_train' in globals(), "Expected X_train from training cell for feature alignment."
feat_cols = X_train.columns.tolist()
X_pred = pred_base.copy()

# drop outcome-like fields if present
drop_like = [c for c in X_pred.columns if any(k in c.lower() for k in ["margin","points_for","points_against","home_points","away_points","home_win"])]
if drop_like: X_pred = X_pred.drop(columns=drop_like, errors="ignore")

# include/exclude betting lines to match training usage
if not USE_BETTING_LINES:
    for c in ["spread_close","home_moneyline","away_moneyline"]:
        if c in X_pred.columns:
            X_pred = X_pred.drop(columns=c)

# add any missing training columns as NaN
for col in feat_cols:
    if col not in X_pred.columns:
        X_pred[col] = np.nan
X_pred = X_pred[feat_cols].replace([np.inf,-np.inf], np.nan)


In [6]:
# Predict using fitted models
assert all(v in globals() for v in ["lr_cal","rf_cal","xgb_cal"]), \
    "Run the training cell first to create lr_cal, rf_cal, xgb_cal."

p_lr  = lr_cal.predict_proba(X_pred)[:,1]
p_rf  = rf_cal.predict_proba(X_pred)[:,1]
p_xgb = xgb_cal.predict_proba(X_pred)[:,1]
p_ens = np.mean([p_lr, p_rf, p_xgb], axis=0)

pred_home = (p_ens >= 0.5).astype(int)
confidence = np.where(pred_home==1, p_ens, 1 - p_ens)

#  Output
out = pred_base[["date","home_team","away_team"]].copy()

# model probabilities
out["prob_home_LR"]  = p_lr
out["prob_home_RF"]  = p_rf
out["prob_home_XGB"] = p_xgb
out["prob_home_ens"] = p_ens

# predicted winner + confidence
out["predicted_winner"] = np.where(pred_home==1, out["home_team"], out["away_team"])
out["confidence"] = confidence

# add moneyline for predicted winner
if "home_moneyline" in pred_base.columns and "away_moneyline" in pred_base.columns:
    out["winner_moneyline"] = np.where(
        pred_home==1,
        pred_base["home_moneyline"],
        pred_base["away_moneyline"]
    )
else:
    out["winner_moneyline"] = np.nan  # fill if odds not provided


pd.options.display.float_format = "{:.3f}".format
out = out.sort_values(["date","home_team"]).reset_index(drop=True)

print("\nPredictions:")
print(out[["date","home_team","away_team",
           "prob_home_LR", "prob_home_RF", "prob_home_XGB",
           "predicted_winner","confidence","winner_moneyline",
           "prob_home_ens"]])

SAVE_PATH = "/content/todays_predictions.csv"
out.to_csv(SAVE_PATH, index=False)
print(f"\nSaved predictions (with moneylines) to {SAVE_PATH}")


Predictions:
        date               home_team              away_team  prob_home_LR  \
0 2025-11-17     Cleveland Cavaliers        Milwaukee Bucks         0.610   
1 2025-11-17          Denver Nuggets          Chicago Bulls         0.853   
2 2025-11-17         Detroit Pistons         Indiana Pacers         0.734   
3 2025-11-17              Miami Heat        New York Knicks         0.501   
4 2025-11-17  Minnesota Timberwolves       Dallas Mavericks         0.835   
5 2025-11-17    New Orleans Pelicans  Oklahoma City Thunder         0.102   
6 2025-11-17      Philadelphia 76ers   Los Angeles Clippers         0.720   
7 2025-11-17         Toronto Raptors      Charlotte Hornets         0.778   

   prob_home_RF  prob_home_XGB        predicted_winner  confidence  \
0         0.585          0.675     Cleveland Cavaliers       0.623   
1         0.686          0.687          Denver Nuggets       0.742   
2         0.626          0.658         Detroit Pistons       0.673   
3         0.