# NBA Predictions — Optimized Notebook

In [None]:

# === 1) Setup & Install (minimal) ===
# Quiet (-q) install of the third-party packages this notebook imports;
# xgboost is pinned to 2.0.3, the other packages are left unpinned.
!pip -q install nba_api pytz xgboost==2.0.3 scikit-learn pandas numpy joblib


In [None]:

# === 2) Configuration ===
import os

# Historical merged games CSV (2010+). Required columns:
#   date, home_team, away_team, home_points, away_points
HIST_CSV = "/content/nba_games_with_odds_2010on.csv"  # change if needed

# Team-only (no sportsbook lines) artifacts: feature-list CSV plus the
# LR / RF / XGB model files loaded in section 6.
TEAM_FEATS_CSV = "/content/team_only_feature_list.csv"
TEAM_LR_PATH   = "/content/team_only_lr_cal.joblib"
TEAM_RF_PATH   = "/content/team_only_rf_cal.joblib"
TEAM_XGB_PATH  = "/content/team_only_xgb_cal.joblib"

# With-lines (market-aware) artifacts: same layout as the team-only set.
LINES_FEATS_CSV = "/content/with_lines_orflag_feature_list.csv"
LINES_LR_PATH   = "/content/with_lines_orflag_lr_cal.joblib"
LINES_RF_PATH   = "/content/with_lines_orflag_rf_cal.joblib"
LINES_XGB_PATH  = "/content/with_lines_orflag_xgb_cal.joblib"

# Odds API key (optional; only needed for live odds). An empty string
# disables the odds download in section 5.
# Set once: os.environ['ODDS_API_KEY'] = 'YOUR_KEY'
ODDS_API_KEY = os.getenv("ODDS_API_KEY", "").strip()

# Output files
OUT_FIXTURES_CSV = "/content/todays_games.csv"
OUT_TEAM_CSV     = "/content/preds_team_only.csv"
OUT_LINES_CSV    = "/content/preds_with_lines.csv"
OUT_HYBRID_CSV   = "/content/preds_hybrid.csv"

# Hybrid blend weight (0..1). final_prob = (1-α)*team + α*with_lines
HYBRID_ALPHA   = 0.60
# Home-win probability cutoff applied to the team-only ensemble in section 7.
TEAM_THRESHOLD = 0.50

# Cache path for precomputed team snapshots (speeds up daily runs)
CACHE_TEAM_PARQUET = "/content/cache_team_all.parquet"


In [None]:

# === 3) Imports & Helpers ===
import pandas as pd, numpy as np, joblib, pytz, requests
from datetime import datetime, timezone, timedelta
from sklearn.base import BaseEstimator, ClassifierMixin
from nba_api.live.nba.endpoints import scoreboard as live_scoreboard

# Render floats with three decimals whenever pandas displays a frame.
pd.options.display.float_format = "{:.3f}".format

# ---- Odds/prob helpers ----
def implied_prob(ml):
    """Convert American moneyline odds to raw implied win probability.

    Negative odds (favourites) map to ``|ml| / (|ml| + 100)``; all other
    odds map to ``100 / (ml + 100)``. The bookmaker margin is not removed.

    Args:
        ml: Scalar, sequence, or pandas Series of moneylines. Values that
            cannot be parsed as numbers are coerced to NaN.

    Returns:
        numpy.ndarray of probabilities with the same shape as the input
        (0-d for a scalar); NaN wherever the input was missing.
    """
    odds = np.asarray(pd.to_numeric(ml, errors="coerce"), dtype=float)
    # np.where evaluates both branches eagerly. Working on |odds| keeps every
    # denominator >= 100, so ml == -100 no longer divides by zero in the
    # branch that is not selected.
    stake = np.abs(odds)
    return np.where(odds < 0, stake / (stake + 100.0), 100.0 / (stake + 100.0))

def moneyline_to_decimal(ml):
    """Convert American moneyline odds to decimal odds.

    Negative odds become ``1 + 100/|ml|``; all other odds become
    ``1 + ml/100``. Non-numeric input is coerced to NaN first.
    """
    odds = pd.to_numeric(ml, errors="coerce")
    favourite_decimal = 1 + 100 / (-odds)
    underdog_decimal = 1 + odds / 100
    return np.where(odds < 0, favourite_decimal, underdog_decimal)

# ---- Team name mapping ----
# Maps every accepted spelling (full franchise name or nickname) to the
# canonical full franchise name used throughout the notebook.
TEAM_ALIASES = {
    "Atlanta Hawks":"Atlanta Hawks","Boston Celtics":"Boston Celtics","Brooklyn Nets":"Brooklyn Nets",
    "Charlotte Hornets":"Charlotte Hornets","Chicago Bulls":"Chicago Bulls","Cleveland Cavaliers":"Cleveland Cavaliers",
    "Dallas Mavericks":"Dallas Mavericks","Denver Nuggets":"Denver Nuggets","Detroit Pistons":"Detroit Pistons",
    "Golden State Warriors":"Golden State Warriors","Houston Rockets":"Houston Rockets","Indiana Pacers":"Indiana Pacers",
    "Los Angeles Clippers":"Los Angeles Clippers","Los Angeles Lakers":"Los Angeles Lakers","Memphis Grizzlies":"Memphis Grizzlies",
    "Miami Heat":"Miami Heat","Milwaukee Bucks":"Milwaukee Bucks","Minnesota Timberwolves":"Minnesota Timberwolves",
    "New Orleans Pelicans":"New Orleans Pelicans","New York Knicks":"New York Knicks","Oklahoma City Thunder":"Oklahoma City Thunder",
    "Orlando Magic":"Orlando Magic","Philadelphia 76ers":"Philadelphia 76ers","Phoenix Suns":"Phoenix Suns",
    "Portland Trail Blazers":"Portland Trail Blazers","Sacramento Kings":"Sacramento Kings","San Antonio Spurs":"San Antonio Spurs",
    "Toronto Raptors":"Toronto Raptors","Utah Jazz":"Utah Jazz","Washington Wizards":"Washington Wizards",
    # Nicknames -> Full
    "Celtics":"Boston Celtics","Nets":"Brooklyn Nets","Knicks":"New York Knicks","76ers":"Philadelphia 76ers","Sixers":"Philadelphia 76ers",
    "Raptors":"Toronto Raptors","Bulls":"Chicago Bulls","Cavaliers":"Cleveland Cavaliers","Cavs":"Cleveland Cavaliers",
    "Pistons":"Detroit Pistons","Pacers":"Indiana Pacers","Bucks":"Milwaukee Bucks","Hawks":"Atlanta Hawks","Hornets":"Charlotte Hornets",
    "Heat":"Miami Heat","Magic":"Orlando Magic","Wizards":"Washington Wizards",
    "Nuggets":"Denver Nuggets","Timberwolves":"Minnesota Timberwolves","Wolves":"Minnesota Timberwolves",
    "Jazz":"Utah Jazz","Thunder":"Oklahoma City Thunder","Trail Blazers":"Portland Trail Blazers","Blazers":"Portland Trail Blazers",
    "Warriors":"Golden State Warriors","Clippers":"Los Angeles Clippers","Lakers":"Los Angeles Lakers",
    "Suns":"Phoenix Suns","Kings":"Sacramento Kings","Mavericks":"Dallas Mavericks","Mavs":"Dallas Mavericks",
    "Grizzlies":"Memphis Grizzlies","Spurs":"San Antonio Spurs","Pelicans":"New Orleans Pelicans","Pels":"New Orleans Pelicans",
    "Rockets":"Houston Rockets",
}

# Case-insensitive view of TEAM_ALIASES, plus the "LA ..." short forms.
# str.title() cannot serve as the case-insensitive fallback because it turns
# "76ers" into "76Ers", which is not a key of the mapping.
_TEAM_ALIASES_FOLDED = {alias.casefold(): full for alias, full in TEAM_ALIASES.items()}
_TEAM_ALIASES_FOLDED.update({
    "la clippers": "Los Angeles Clippers",
    "la lakers": "Los Angeles Lakers",
})


def normalize_team(name: str) -> str:
    """Return the canonical full franchise name for a team label.

    Surrounding whitespace is ignored and matching is case-insensitive.
    Labels that match no known alias are returned stripped but otherwise
    unchanged; non-string values (e.g. None/NaN) are returned as-is.
    """
    if not isinstance(name, str):
        return name
    n = name.strip()
    # Rewrite the "LA ..." short forms so the nickname entries match.
    n = n.replace("LA Clippers", "Clippers").replace("LA Lakers", "Lakers")
    exact = TEAM_ALIASES.get(n)
    if exact is not None:
        return exact
    return _TEAM_ALIASES_FOLDED.get(n.casefold(), n)

# ---- Snapshot builders ----
def make_team_frame(df):
    """Reshape a one-row-per-game frame into one row per team per game.

    Each game contributes a home row (is_home=1) and an away row (is_home=0)
    with columns date, team, points_for, points_against, is_home and margin
    (points_for - points_against). Rows are ordered by team, then date.
    """
    home_names = {"home_team": "team", "home_points": "points_for", "away_points": "points_against"}
    away_names = {"away_team": "team", "away_points": "points_for", "home_points": "points_against"}

    home_rows = (
        df[["date", "home_team", "home_points", "away_points"]]
        .rename(columns=home_names)
        .assign(is_home=1)
    )
    away_rows = (
        df[["date", "away_team", "away_points", "home_points"]]
        .rename(columns=away_names)
        .assign(is_home=0)
    )

    stacked = pd.concat([home_rows, away_rows])
    stacked = stacked.sort_values(["team", "date"]).reset_index(drop=True)
    stacked["margin"] = stacked["points_for"] - stacked["points_against"]
    return stacked

def add_rolls(team_df, windows=(3, 5)):
    """Add leakage-free rolling features to a per-team game log, in place.

    Every average is computed on values shifted by one game, so a row only
    sees results from that team's earlier rows.

    Args:
        team_df: Frame with columns team, date (datetime), points_for,
            points_against and margin. Rows must already be ordered by date
            within each team (as make_team_frame returns them), because the
            shift/diff operations follow row order.
        windows: Rolling window lengths in games. The default is an immutable
            tuple so it cannot be shared and mutated between calls; any
            iterable of ints is accepted.

    Returns:
        The same ``team_df`` object, with added columns games_played,
        <col>_exp (expanding means), pf_/pa_/margin_<w>g (rolling means),
        rest_days and b2b (1.0 when rest_days == 1, else 0.0).
    """
    grp = team_df.groupby("team", group_keys=False)
    team_df["games_played"] = grp.cumcount()

    for col in ("points_for", "points_against", "margin"):
        team_df[f"{col}_exp"] = grp[col].transform(lambda s: s.shift(1).expanding().mean())

    rolling_sources = {"pf": "points_for", "pa": "points_against", "margin": "margin"}
    for w in windows:
        for prefix, col in rolling_sources.items():
            team_df[f"{prefix}_{w}g"] = grp[col].transform(
                lambda s, w=w: s.shift(1).rolling(w, min_periods=1).mean()
            )

    # Days since the team's previous row; NaN for a team's first game.
    team_df["rest_days"] = grp["date"].diff().dt.days
    team_df["b2b"] = (team_df["rest_days"] == 1).astype(float)
    return team_df

def latest_snapshot_before(snap_df, key_col, team, date_col, game_date):
    """Return the most recent row for ``team`` dated strictly before ``game_date``.

    The result is a one-row DataFrame, or an empty one when the team has no
    earlier row in ``snap_df``.
    """
    is_team = snap_df[key_col] == team
    is_earlier = snap_df[date_col] < game_date
    candidates = snap_df[is_team & is_earlier]
    return candidates.sort_values(date_col).tail(1)

# ---- Pickle-safe calibrated XGB wrapper (for loading saved models) ----
class CalibratedXGB(BaseEstimator, ClassifierMixin):
    """Classifier wrapper that passes a base model's score through a calibrator.

    ``base`` supplies the raw positive-class probability and ``platt`` maps
    that single score to a calibrated probability. The class and attribute
    names must stay as they are so previously saved models unpickle.
    """

    _estimator_type = "classifier"

    def __init__(self, base=None, platt=None):
        self.base = base
        self.platt = platt

    def fit(self, X, y):
        """No-op: the wrapped estimators are expected to be fitted already."""
        return self

    def predict_proba(self, X):
        """Return an (n_samples, 2) array of [P(class 0), P(class 1)]."""
        raw_positive = self.base.predict_proba(X)[:, 1]
        # The calibrator takes the raw score as a single-feature column.
        calibrated = self.platt.predict_proba(raw_positive.reshape(-1, 1))[:, 1]
        return np.column_stack([1 - calibrated, calibrated])

    def predict(self, X):
        """Return 1 where the calibrated probability is at least 0.5, else 0."""
        return (self.predict_proba(X)[:, 1] >= 0.5).astype(int)


In [None]:

# === 4) Load history & build/cache team snapshots (fast with parquet cache) ===
import pandas as pd, numpy as np

hist = pd.read_csv(HIST_CSV, parse_dates=["date"])
hist = hist.sort_values("date").reset_index(drop=True)


def _cache_is_fresh(cache_path, source_path):
    """Return True when cache_path exists and is not older than source_path."""
    try:
        return os.path.getmtime(cache_path) >= os.path.getmtime(source_path)
    except OSError:
        # Either file is missing or unreadable: treat the cache as unusable.
        return False


# Reuse the parquet cache only while it is at least as new as the history
# CSV; otherwise newly appended games would never reach the snapshots.
team_all = None
if _cache_is_fresh(CACHE_TEAM_PARQUET, HIST_CSV):
    try:
        team_all = pd.read_parquet(CACHE_TEAM_PARQUET)
        # Real validation: the date column must exist and be datetime-like,
        # since later cells compare it against fixture dates.
        team_all["date"] = pd.to_datetime(team_all["date"])
        print("Loaded team snapshots from cache:", CACHE_TEAM_PARQUET)
    except Exception as e:
        # Best-effort cache: any read/parse problem falls back to a rebuild.
        team_all = None
        print("Cache read failed, rebuilding snapshots:", e)
elif os.path.exists(CACHE_TEAM_PARQUET):
    print("Cache is older than the history CSV, rebuilding snapshots:", CACHE_TEAM_PARQUET)

if team_all is None:
    team_all = add_rolls(make_team_frame(hist), windows=[3,5])
    try:
        team_all.to_parquet(CACHE_TEAM_PARQUET, index=False)
        print("Cached snapshots to:", CACHE_TEAM_PARQUET)
    except Exception as e:
        # A missing parquet engine or unwritable path must not stop the run.
        print("Parquet cache save skipped:", e)

# Split into home-game and away-game rows, with H_/A_ column prefixes.
H_all = team_all[team_all["is_home"]==1].add_prefix("H_")
A_all = team_all[team_all["is_home"]==0].add_prefix("A_")
print("Snapshots ready. H rows:", len(H_all), "A rows:", len(A_all))


In [None]:

# === 5) Fetch today's games + current lines (robust) ===
import pandas as pd, numpy as np, requests, pytz
from nba_api.live.nba.endpoints import scoreboard as live_scoreboard
from datetime import datetime, timezone, timedelta

# Odds API regions, queried in this order until one returns events.
REGIONS_TRY  = ["us", "us2", "uk", "eu", "au"]
# Bookmaker keys in order of preference; pick_book falls back to any other
# book that offers the requested market.
BOOK_PREF    = ["fanduel", "draftkings", "betmgm", "caesars", "pointsbetus", "barstool"]
# Value sent as the "markets" query parameter (moneyline and spreads).
MARKETS      = "h2h,spreads"

def iso_sec(dtobj):
    """Format a datetime as a UTC ISO-8601 string with second precision and 'Z' suffix."""
    utc_moment = dtobj.astimezone(timezone.utc)
    return f"{utc_moment:%Y-%m-%dT%H:%M:%SZ}"

def today_et_date():
    """Return the current calendar date in the US/Eastern time zone."""
    eastern = pytz.timezone("US/Eastern")
    now_eastern = datetime.now(eastern)
    return now_eastern.date()

def odds_api(url, params):
    """GET ``url`` with ``params`` and log the Odds API quota headers.

    Args:
        url: Endpoint URL.
        params: Query parameters; may contain the secret ``apiKey``.

    Returns:
        The ``requests`` response (any HTTP status), or None when the request
        itself failed (network error, timeout, ...).
    """
    import re  # local import: only needed for redaction on the error path

    try:
        r = requests.get(url, params=params, timeout=15)
        rem = r.headers.get("x-requests-remaining")
        used = r.headers.get("x-requests-used")
        print(f"[odds-api] HTTP {r.status_code} | used={used} remaining={rem}")
        return r
    except Exception as e:
        # Deliberately broad: odds are optional, so any failure degrades to
        # "no odds". Exception text from requests can embed the full request
        # URL, including the apiKey query parameter, so scrub it before it is
        # printed into notebook output.
        msg = str(e)
        secret = str((params or {}).get("apiKey") or "")
        if secret:
            msg = msg.replace(secret, "***")
        msg = re.sub(r"(apiKey=)[^&\s'\")]+", r"\1***", msg)
        print("[odds-api] Error:", msg)
        return None

def pick_book(books, market):
    """Choose a bookmaker's entry for ``market`` from an event's bookmakers.

    Books listed in BOOK_PREF are tried first, in that order; otherwise the
    first book (in the given order) that offers the market wins.

    Returns:
        (bookmaker_key, market_dict), or (None, None) when no book has it.
    """
    available = books or []

    def _find_market(book):
        # First entry of this book whose key equals the requested market.
        offered = book.get("markets", []) or []
        return next((entry for entry in offered if entry.get("key") == market), None)

    indexed = {book.get("key"): book for book in available}
    for preferred in BOOK_PREF:
        book = indexed.get(preferred)
        if book:
            hit = _find_market(book)
            if hit is not None:
                return preferred, hit

    for book in available:
        hit = _find_market(book)
        if hit is not None:
            return book.get("key"), hit

    return None, None

def extract_moneylines(game_json):
    """Return ({team: moneyline price}, bookmaker_key) for one event.

    Team names are passed through normalize_team. Returns (None, None) when
    no bookmaker offers the "h2h" market.
    """
    book_key, h2h_market = pick_book(game_json.get("bookmakers"), "h2h")
    if not h2h_market:
        return None, None

    prices = {}
    for outcome in h2h_market.get("outcomes", []) or []:
        if "name" in outcome:
            prices[normalize_team(outcome.get("name"))] = outcome.get("price")
    return prices, book_key

def extract_spreads(game_json):
    """Return ({team: spread point}, bookmaker_key) for one event.

    Team names are passed through normalize_team. Returns (None, None) when
    no bookmaker offers the "spreads" market.
    """
    book_key, spread_market = pick_book(game_json.get("bookmakers"), "spreads")
    if not spread_market:
        return None, None

    points = {}
    for outcome in spread_market.get("outcomes", []) or []:
        if "name" in outcome:
            points[normalize_team(outcome.get("name"))] = outcome.get("point")
    return points, book_key

# 5a) Today's schedule
today = today_et_date()
print("Today (ET):", today)
sb = live_scoreboard.ScoreBoard()
games = sb.games.get_dict()

# One fixture row per scoreboard game, stamped with today's ET date.
# Prefer the "teamName" field and fall back to "name" when it is empty.
sched_rows = [
    {
        "date": pd.to_datetime(today),
        "home_team": normalize_team(g["homeTeam"].get("teamName") or g["homeTeam"].get("name")),
        "away_team": normalize_team(g["awayTeam"].get("teamName") or g["awayTeam"].get("name")),
    }
    for g in games
]

sched = pd.DataFrame(sched_rows).drop_duplicates()
if sched.empty:
    print("No NBA games found today via nba_api.")
    # Keep the expected columns so the later sort/save steps still work.
    sched = pd.DataFrame(columns=["date","home_team","away_team"])

# 5b) Odds API (optional)
# Fills moneyline_map / spread_map, keyed by (home, away), from the first
# region in REGIONS_TRY that returns any events.
moneyline_map, spread_map = {}, {}
if ODDS_API_KEY:
    base = "https://api.the-odds-api.com/v4/sports/basketball_nba/odds"
    # Window: from six hours ago to two days ahead.
    t0 = datetime.now(timezone.utc) - timedelta(hours=6)
    t1 = datetime.now(timezone.utc) + timedelta(days=2)
    t0s, t1s = iso_sec(t0), iso_sec(t1)

    got_any = False
    for region in REGIONS_TRY:
        print(f"\n[odds-api] Query region='{region}' window {t0s} -> {t1s}")
        params = {
            "apiKey": ODDS_API_KEY,
            "regions": region,
            "markets": MARKETS,
            "oddsFormat": "american",
            "dateFormat": "iso",
            "commenceTimeFrom": t0s,
            "commenceTimeTo":   t1s,
        }
        resp = odds_api(base, params)
        if resp is not None and resp.status_code == 422:
            print("[odds-api] 422 invalid commenceTime*. Retrying WITHOUT window...")
            params2 = {k: v for k, v in params.items() if not k.startswith("commenceTime")}
            resp = odds_api(base, params2)

        # odds_api returns None on a transport failure; that can happen on the
        # retry as well, so check after both attempts before touching resp.
        if resp is None:
            continue

        if resp.status_code != 200:
            print(f"[odds-api] HTTP {resp.status_code}: {resp.text[:300]}")
            continue

        try:
            data = resp.json()
        except ValueError as e:
            # A 200 with a non-JSON body should not abort the whole cell.
            print(f"[odds-api] Could not decode JSON for region '{region}': {e}")
            continue

        if not (isinstance(data, list) and len(data) > 0):
            print(f"[odds-api] 200 OK but no events returned for region '{region}'.")
            continue

        print(f"[odds-api] Found {len(data)} events in region '{region}'.")
        got_any = True
        for game in data:
            home = normalize_team(game.get("home_team"))
            away = normalize_team(game.get("away_team"))
            ml_dict, _ = extract_moneylines(game)
            if ml_dict: moneyline_map[(home, away)] = ml_dict
            sp_dict, _ = extract_spreads(game)
            if sp_dict: spread_map[(home, away)] = sp_dict
        break
else:
    print("No ODDS_API_KEY set; continuing without odds.")

# 5c) Merge schedule + odds and save fixtures CSV
def lookup_moneylines(h, a):
    """Return (home_moneyline, away_moneyline) for a matchup.

    The odds map is checked under (home, away) and then (away, home); any
    price that is unavailable comes back as NaN.
    """
    prices = moneyline_map.get((h, a))
    if not prices:
        prices = moneyline_map.get((a, h))
    if prices:
        return prices.get(h, np.nan), prices.get(a, np.nan)
    return np.nan, np.nan

def lookup_spread(h, a):
    """Return the home team's spread point for a matchup, or NaN if unknown.

    The spread map is checked under (home, away) and then (away, home).
    """
    points = spread_map.get((h, a))
    if not points:
        points = spread_map.get((a, h))
    if points:
        return points.get(h, np.nan)
    return np.nan

# Attach odds to each scheduled game (NaN where none were found).
if not sched.empty:
    matchups = list(zip(sched["home_team"], sched["away_team"]))
    ml_pairs = [lookup_moneylines(h, a) for h, a in matchups]
    sched["home_moneyline"] = [pair[0] for pair in ml_pairs]
    sched["away_moneyline"] = [pair[1] for pair in ml_pairs]
    sched["spread_close"] = [lookup_spread(h, a) for h, a in matchups]

# Force the market columns to numeric; unparseable values become NaN.
for c in ("home_moneyline", "away_moneyline", "spread_close"):
    if c in sched.columns:
        sched[c] = pd.to_numeric(sched[c], errors="coerce")

sched = sched.sort_values(["date","home_team"]).reset_index(drop=True)
sched.to_csv(OUT_FIXTURES_CSV, index=False)
print(f"\nSaved {len(sched)} games to {OUT_FIXTURES_CSV}")
display(sched.head(30))


In [None]:

# === 6) Load models & feature lists ===
from sklearn.base import BaseEstimator, ClassifierMixin

def load_models_safe(paths):
    """Load every model in ``paths`` with joblib, all-or-nothing.

    Args:
        paths: Iterable of file paths to joblib artifacts.

    Returns:
        A list of loaded objects in the same order as ``paths``. If any load
        fails, the error is printed and a list of None with the same length
        as ``paths`` is returned, so callers can always unpack it.
    """
    paths = list(paths)
    try:
        # NOTE: joblib.load unpickles arbitrary objects; only point this at
        # trusted, locally produced model files.
        return [joblib.load(p) for p in paths]
    except Exception as e:
        print("Model load error:", e)
        return [None] * len(paths)

def load_featlist_safe(path):
    """Read a one-column feature-name CSV and return the names as a list.

    Names are stripped of whitespace; the placeholder entries "", "0" and
    "Unnamed: 0" are dropped. Returns None (after printing the error) when
    the file cannot be read.
    """
    placeholders = {"", "0", "Unnamed: 0"}
    try:
        first_column = pd.read_csv(path, header=None).iloc[:, 0]
        names = first_column.astype(str).str.strip().tolist()
        return [name for name in names if name not in placeholders]
    except Exception as e:
        print("Feature list load error:", e)
        return None

# Team-only ensemble: three models plus their feature list.
team_lr, team_rf, team_xgb = load_models_safe([TEAM_LR_PATH, TEAM_RF_PATH, TEAM_XGB_PATH])
team_feats = load_featlist_safe(TEAM_FEATS_CSV)
_team_loaded = all(model is not None for model in (team_lr, team_rf, team_xgb))
_team_feat_count = len(team_feats) if team_feats else 0
print(f"Team-only models loaded: {_team_loaded}, features: {_team_feat_count}")

# With-lines ensemble: same layout, market-aware features.
lines_lr, lines_rf, lines_xgb = load_models_safe([LINES_LR_PATH, LINES_RF_PATH, LINES_XGB_PATH])
lines_feats = load_featlist_safe(LINES_FEATS_CSV)
_lines_loaded = all(model is not None for model in (lines_lr, lines_rf, lines_xgb))
_lines_feat_count = len(lines_feats) if lines_feats else 0
print(f"With-lines models loaded: {_lines_loaded}, features: {_lines_feat_count}")


In [None]:

# === 7) Final Predictions (Team-only + With-lines + Hybrid) ===
import numpy as np, pandas as pd

# Build per-game base with full H_/A_ snapshots
fx = pd.read_csv(OUT_FIXTURES_CSV, parse_dates=["date"])
if fx.empty:
    # Without this guard pd.concat below fails with the opaque
    # "No objects to concatenate" on days with no games.
    raise ValueError(f"No fixtures found in {OUT_FIXTURES_CSV}; nothing to predict.")


def _snapshot_or_blank(snap_all, prefix, team, game_date):
    """Return (one-row snapshot, found) for ``team`` before ``game_date``.

    When the team has no earlier row, the snapshot is an all-NaN placeholder
    carrying only the team name, and ``found`` is False.
    """
    snap = latest_snapshot_before(snap_all, f"{prefix}team", team, f"{prefix}date", game_date)
    if not snap.empty:
        return snap, True
    blank = pd.DataFrame([{c: np.nan for c in snap_all.columns}])
    blank[f"{prefix}team"] = team
    blank[f"{prefix}date"] = pd.NaT
    return blank, False


# NOTE(review): H_all holds only home-game rows and A_all only away-game rows,
# and the rolling columns are computed on shifted values, so each snapshot is
# the team's feature state going into its most recent home (or away) game.
# Confirm this matches how the training features were built.
market_cols = [c for c in ["home_moneyline","away_moneyline","spread_close"] if c in fx.columns]
rows, missing_hist = [], set()

for _, r in fx.iterrows():
    d, h, a = r["date"], str(r["home_team"]), str(r["away_team"])
    H_snap, h_found = _snapshot_or_blank(H_all, "H_", h, d)
    A_snap, a_found = _snapshot_or_blank(A_all, "A_", a, d)
    if not h_found:
        missing_hist.add(h)
    if not a_found:
        missing_hist.add(a)

    base = pd.DataFrame([{"date": d, "home_team": h, "away_team": a}])
    merged = (base
              .merge(H_snap, how="left", left_on="home_team", right_on="H_team")
              .merge(A_snap, how="left", left_on="away_team", right_on="A_team"))
    merged["rest_diff"]   = merged["H_rest_days"] - merged["A_rest_days"]
    merged["b2b_diff"]    = merged["H_b2b"] - merged["A_b2b"]
    merged["home_on_b2b"] = merged["H_b2b"]
    merged["away_on_b2b"] = merged["A_b2b"]
    # pass market fields for reporting and with-lines model
    for c in market_cols:
        merged[c] = r.get(c, np.nan)
    rows.append(merged)

pred_base = pd.concat(rows, ignore_index=True)
if missing_hist:
    print(f"Note: missing recent history for {len(missing_hist)} team(s): {sorted(missing_hist)} (features will be imputed).")

# Precompute market probs for reporting
# market_prob_home is the home implied probability divided by the sum of both
# sides' implied probabilities, so the two sides add up to 1.
if {"home_moneyline", "away_moneyline"}.issubset(pred_base.columns):
    pred_base["home_imp_raw"] = implied_prob(pred_base["home_moneyline"])
    pred_base["away_imp_raw"] = implied_prob(pred_base["away_moneyline"])
    pred_base["market_prob_home"] = pred_base["home_imp_raw"] / (
        pred_base["home_imp_raw"] + pred_base["away_imp_raw"]
    )
else:
    pred_base["market_prob_home"] = np.nan

# ---- TEAM-ONLY predictions ----
# Average the three team-only models' home-win probabilities and pick the
# home side when the average reaches TEAM_THRESHOLD.
team_out = None
if team_feats and all(m is not None for m in [team_lr,team_rf,team_xgb]):
    # Exactly the trained feature columns, in order; absent ones become NaN.
    Xt = pred_base.reindex(columns=team_feats).replace([np.inf,-np.inf], np.nan)

    p_ens = np.mean(
        [model.predict_proba(Xt)[:,1] for model in (team_lr, team_rf, team_xgb)],
        axis=0,
    )
    home_picked = p_ens >= TEAM_THRESHOLD

    team_out = pred_base[["date","home_team","away_team"]].copy()
    team_out["prob_home_ens_team"] = p_ens
    team_out["predicted_winner_team"] = np.where(home_picked, team_out["home_team"], team_out["away_team"])
    # Confidence is the probability assigned to the predicted side.
    team_out["confidence_team"] = np.where(home_picked, p_ens, 1 - p_ens)
    if "home_moneyline" in pred_base.columns:
        team_out["winner_moneyline_team"] = np.where(
            home_picked, pred_base["home_moneyline"], pred_base["away_moneyline"]
        )
    team_out["market_prob_home"] = pred_base["market_prob_home"].values
    team_out = team_out.sort_values(["date","home_team"]).reset_index(drop=True)
    team_out.to_csv(OUT_TEAM_CSV, index=False)
    print(f"\nSaved TEAM-ONLY predictions -> {OUT_TEAM_CSV}")
    display(team_out)

# ---- WITH-LINES predictions ----
# Same ensemble scheme as the team-only section, using the market-aware
# feature list and a fixed 0.5 cutoff.
lines_out = None
if lines_feats and all(m is not None for m in [lines_lr,lines_rf,lines_xgb]):
    # Exactly the trained feature columns, in order; absent ones become NaN.
    Xl = pred_base.reindex(columns=lines_feats).replace([np.inf,-np.inf], np.nan)

    p_ensl = np.mean(
        [model.predict_proba(Xl)[:,1] for model in (lines_lr, lines_rf, lines_xgb)],
        axis=0,
    )
    home_picked_l = p_ensl >= 0.5

    lines_out = pred_base[["date","home_team","away_team"]].copy()
    lines_out["prob_home_ens_lines"] = p_ensl
    lines_out["predicted_winner_lines"] = np.where(home_picked_l, lines_out["home_team"], lines_out["away_team"])
    # Confidence is the probability assigned to the predicted side.
    lines_out["confidence_lines"] = np.where(home_picked_l, p_ensl, 1 - p_ensl)
    for c in ("home_moneyline", "away_moneyline"):
        if c in pred_base.columns:
            lines_out[c] = pred_base[c].values
    lines_out["winner_moneyline_lines"] = np.where(
        home_picked_l, lines_out["home_moneyline"], lines_out["away_moneyline"]
    )
    lines_out["market_prob_home"] = pred_base["market_prob_home"].values
    lines_out = lines_out.sort_values(["date","home_team"]).reset_index(drop=True)
    lines_out.to_csv(OUT_LINES_CSV, index=False)
    print(f"\nSaved WITH-LINES predictions -> {OUT_LINES_CSV}")
    display(lines_out)

# ---- HYBRID blend ----
# Blend the two ensembles with weight HYBRID_ALPHA on the with-lines model.
# Probabilities are copied positionally from team_out / lines_out into
# pred_base row order.
if team_out is None and lines_out is None:
    print("\nNo models available to predict.")
elif lines_out is None:
    print("\nHybrid skipped (with-lines models unavailable).")
elif team_out is None:
    print("\nHybrid skipped (team-only models unavailable).")
else:
    hybrid = pred_base[["date","home_team","away_team"]].copy()
    hybrid["prob_home_team"]  = team_out["prob_home_ens_team"].values
    hybrid["prob_home_lines"] = lines_out["prob_home_ens_lines"].values
    hybrid["prob_home_hybrid"] = (1 - HYBRID_ALPHA)*hybrid["prob_home_team"] + HYBRID_ALPHA*hybrid["prob_home_lines"]

    home_picked_h = hybrid["prob_home_hybrid"] >= 0.5
    hybrid["predicted_winner_hybrid"] = np.where(home_picked_h, pred_base["home_team"], pred_base["away_team"])
    # Confidence is the blended probability assigned to the predicted side.
    hybrid["confidence_hybrid"] = np.where(
        home_picked_h, hybrid["prob_home_hybrid"], 1 - hybrid["prob_home_hybrid"]
    )
    for c in ("home_moneyline", "away_moneyline"):
        if c in pred_base.columns:
            hybrid[c] = pred_base[c].values
    hybrid["winner_moneyline_hybrid"] = np.where(
        home_picked_h, hybrid["home_moneyline"], hybrid["away_moneyline"]
    )
    hybrid = hybrid.sort_values(["date","home_team"]).reset_index(drop=True)
    hybrid.to_csv(OUT_HYBRID_CSV, index=False)
    print(f"\nSaved HYBRID predictions -> {OUT_HYBRID_CSV}")
    display(hybrid)
