# Kelly sizing, bankroll simulation, and historical ROI backtest

This notebook extends your NFL modeling workflow with stake sizing and backtesting.  It adds:
- Kelly Criterion position sizing from model probabilities and American odds.  
- Bankroll simulation with configurable fractional Kelly and edge thresholds.  
- Historical ROI backtest hooks that join predictions to a table of closing moneylines.  
- Simple plots for bankroll growth and edge distribution.  

Formatting conventions used throughout this notebook: two spaces after periods, and hyphens instead of em dashes.  


## 1. Setup

In [None]:
# If needed:
# %pip install -U pandas numpy matplotlib
import os, math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Paths - adjust to your project layout
PRED_DIR = "./data/weekly_predictions"   # where the previous notebook writes weekly CSVs
HIST_LINES_CSV = "./data/lines_historical.csv"  # user-provided historical closing lines
# Make sure the base data directory exists (no error if it already does).
# Only "./data" is created here - the weekly prediction sub-directory is not.
os.makedirs("./data", exist_ok=True)


## 2. Helpers - odds, implied probabilities, and Kelly sizing

In [None]:
def moneyline_to_prob(ml):
    """Convert an American moneyline to its raw implied probability.

    The result still contains the bookmaker margin - no vig removal is done here.

    ml: American odds, e.g. -110 or +150.  Anything accepted by float().
    Returns the implied probability as a float, or NaN when ml is None, NaN,
    or 0.  Zero is not a valid American price; it used to fall into the
    negative-odds branch and yield a probability of 0, which shows up
    downstream as a large fake edge.
    """
    if ml is None or pd.isna(ml):
        return np.nan
    ml = float(ml)
    if ml == 0:
        # Treat a zero line as missing data rather than a 0% implied chance.
        return np.nan
    if ml > 0:
        # Underdog: risk 100 to win ml.
        return 100.0 / (ml + 100.0)
    # Favorite: risk -ml to win 100.
    return -ml / (-ml + 100.0)

def american_to_decimal(ml):
    """Convert American odds to decimal odds (total payout per unit staked).

    Positive lines pay ml per 100 risked; non-positive lines pay 100 per
    abs(ml) risked.  The input is coerced with float() first.
    """
    odds = float(ml)
    profit_per_unit = odds / 100.0 if odds > 0 else 100.0 / abs(odds)
    return 1.0 + profit_per_unit

def kelly_fraction(p, ml):
    """Full Kelly stake for a two-outcome bet priced in American odds.

    p is the model probability of the outcome being backed.
    ml is the American moneyline offered on that outcome.
    Returns the fraction of bankroll to wager, or NaN if either input is
    missing.  A negative result means the bet has no edge - do not bet.
    """
    inputs_present = not (pd.isna(p) or pd.isna(ml))
    if inputs_present:
        win_multiple = american_to_decimal(ml) - 1.0  # profit per unit staked
        expected_gain = win_multiple * p - (1.0 - p)
        return expected_gain / win_multiple
    return np.nan

def fractional_kelly(p, ml, fraction=0.5):
    """Scale the full Kelly stake by `fraction` and floor it at zero.

    Returns NaN when the underlying Kelly fraction is NaN.
    """
    full_stake = kelly_fraction(p, ml)
    if pd.isna(full_stake):
        return np.nan
    scaled_stake = fraction * full_stake
    return max(0.0, scaled_stake)


## 3. Load weekly predictions and assemble a season-long table

In [None]:
def load_all_weekly_predictions(pred_dir=PRED_DIR):
    """Read every weekly prediction CSV in `pred_dir` into one DataFrame.

    Only files named predictions_*.csv are picked up.  They are read in
    sorted path order and stacked with a fresh integer index.
    Raises FileNotFoundError when no matching file exists.
    """
    csv_paths = sorted(
        os.path.join(pred_dir, name)
        for name in os.listdir(pred_dir)
        if name.startswith("predictions_") and name.endswith(".csv")
    )
    if len(csv_paths) == 0:
        raise FileNotFoundError("No prediction CSVs found.  Run the modeling notebook to generate weekly predictions.")
    weekly_tables = []
    for csv_path in csv_paths:
        weekly_tables.append(pd.read_csv(csv_path))
    return pd.concat(weekly_tables, ignore_index=True)

# Build the season-long prediction table from every weekly CSV in PRED_DIR.
preds = load_all_weekly_predictions(PRED_DIR)
print(f"Loaded {len(preds)} prediction rows from {PRED_DIR}.")
# Last expression of the cell - shows the first rows as a quick sanity check.
preds.head()


## 4. Load historical closing lines and join to predictions

In [None]:
# Expect schema:
# season,week,home_team,away_team,ml_home,ml_away,closing_flag
# closing_flag == 1 if row is the closing line.  Multiple books may exist - keep one row per matchup.
def load_hist_lines(path=HIST_LINES_CSV):
    """Load historical moneylines and reduce them to one row per matchup.

    path: CSV with at least season, week, home_team, away_team, ml_home, ml_away.
    An optional closing_flag column marks the closing line with 1.

    When closing_flag is present, a row flagged as closing is preferred for
    each matchup.  Matchups without any flagged row keep their first row, as
    do all matchups when the column is absent.  Rows come back in file order.

    Raises FileNotFoundError if the file does not exist and ValueError if a
    required column is missing.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"Historical lines CSV not found at {path}.  Use the provided template to create one.")
    df = pd.read_csv(path)
    required = {"season","week","home_team","away_team","ml_home","ml_away"}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Missing columns in historical lines: {missing}")

    matchup_keys = ["season","week","home_team","away_team"]
    if "closing_flag" in df.columns:
        # Stable sort on "is not closing" moves closing rows to the front while
        # preserving file order inside each group, so keep-first dedup picks
        # the closing line whenever one exists.  NaN flags count as not closing.
        closing_first = (df["closing_flag"] != 1).to_numpy().argsort(kind="stable")
        df = df.iloc[closing_first].drop_duplicates(subset=matchup_keys).sort_index()
    else:
        # No flag available - fall back to the first row per matchup.
        df = df.drop_duplicates(subset=matchup_keys)
    return df

hist = load_hist_lines(HIST_LINES_CSV)
# Inner join: predictions without a historical line (and lines without a
# prediction) are dropped, so the row count below can be smaller than len(preds).
joined = preds.merge(hist, on=["season","week","home_team","away_team"], how="inner")
print(f"Joined rows: {len(joined)}")
# Last expression of the cell - preview of the joined table.
joined.head()


## 5. Compute implied probabilities and edges vs model

In [None]:
# Raw implied probabilities for both sides of the moneyline (vig still included)
joined["impl_home"] = joined["ml_home"].apply(moneyline_to_prob)
joined["impl_away"] = joined["ml_away"].apply(moneyline_to_prob)

# Normalize two-way to remove vig if both sides exist.
# If either side is NaN the sum is NaN, so both normalized values are NaN too.
s = joined["impl_home"] + joined["impl_away"]
joined["impl_home_norm"] = joined["impl_home"] / s
joined["impl_away_norm"] = joined["impl_away"] / s

# Choose a model - default Gradient Boosting from your earlier notebook
# (requires a proba_home_gb column in the prediction CSVs)
joined["p_model_home"] = joined["proba_home_gb"]
# Edge = model probability minus the vig-free market probability for the home side
joined["edge_home"] = joined["p_model_home"] - joined["impl_home_norm"]

# Last expression of the cell - preview of the columns used by the simulation.
joined[["season","week","home_team","away_team","p_model_home","impl_home_norm","edge_home"]].head()


## 6. Bankroll simulation with fractional Kelly

In [None]:
def simulate_bankroll(df, kelly_fraction_use=0.5, min_edge=0.02, max_fraction=0.02, starting_bankroll=10000.0, bet_on="home"):
    """Simulate bankroll over time using fractional Kelly on the home side.

    df must include: p_model_home, ml_home, edge_home, home_win, season, week,
    home_team, away_team.  Games are processed in (season, week, home_team,
    away_team) order and each stake is sized off the bankroll at that moment.

    kelly_fraction_use: multiplier applied to the full Kelly fraction.
    min_edge: minimum edge_home required to place a bet.
    max_fraction: hard cap on the share of bankroll staked on one game.
    starting_bankroll: bankroll before the first game.
    bet_on: only "home" is implemented.  Any other value raises ValueError
        instead of silently simulating home bets.

    A game is skipped (stake 0, pnl 0) when the probability, line, edge or
    result is missing, when the edge is below min_edge, or when no valid
    Kelly fraction can be computed.  Games without a recorded home_win are
    skipped rather than settled as losses.
    NOTE(review): any home_win value other than 1 is settled as a loss - confirm
    how ties are encoded upstream.

    Returns (out, curve): a sorted copy of df with stake, pnl and
    bankroll_after columns, and the bankroll curve as a list that starts with
    starting_bankroll and has one further entry per row.
    """
    if bet_on != "home":
        raise ValueError(f"Unsupported bet_on value {bet_on!r}.  Only 'home' is implemented.")

    out = df.copy().sort_values(["season","week","home_team","away_team"]).reset_index(drop=True)
    bankroll = starting_bankroll
    curve = [bankroll]
    stakes = []
    pnls = []

    # The bankroll compounds bet to bet, so this has to stay a sequential loop.
    # Iterating the four needed columns avoids building a Series per row.
    rows = zip(out["p_model_home"], out["ml_home"], out["edge_home"], out["home_win"])
    for p, ml, edge, result in rows:
        stake = 0.0
        ret = 0.0

        has_inputs = not (pd.isna(p) or pd.isna(ml) or pd.isna(edge) or pd.isna(result))
        if has_inputs and edge >= min_edge:
            f_kelly = fractional_kelly(p, ml, fraction=kelly_fraction_use)
            if not pd.isna(f_kelly):
                f_kelly = min(max(f_kelly, 0.0), max_fraction)  # cap bet size
                stake = bankroll * f_kelly
                # Resolve PnL
                if result == 1:
                    ret = stake * (american_to_decimal(ml) - 1.0)
                else:
                    ret = -stake

        bankroll += ret
        stakes.append(stake)
        pnls.append(ret)
        curve.append(bankroll)

    out["stake"] = stakes
    out["pnl"] = pnls
    out["bankroll_after"] = curve[1:]
    return out, curve

# Run the simulation with half Kelly, a 2% minimum edge and a 2% per-bet cap.
sim, curve = simulate_bankroll(joined, kelly_fraction_use=0.5, min_edge=0.02, max_fraction=0.02, starting_bankroll=10000.0)
print(f"Final bankroll: {curve[-1]:.2f}")
# Last expression of the cell - preview of the per-game results.
sim.head()


## 7. Plots - bankroll curve and edge histogram

In [None]:
# Bankroll curve
# curve holds the starting bankroll followed by one point per simulated row,
# so the x axis also counts games where no bet was placed.
plt.figure()
plt.plot(curve)
plt.title("Bankroll over time - fractional Kelly")
plt.xlabel("Bet index")
plt.ylabel("Bankroll")
plt.tight_layout()
plt.show()

# Edge histogram
# Covers every row with a computable edge, not only the games that were bet.
plt.figure()
valid_edges = sim["edge_home"].dropna()
plt.hist(valid_edges, bins=30)
plt.title("Edge distribution - model probability minus implied")
plt.xlabel("Edge")
plt.ylabel("Count")
plt.tight_layout()
plt.show()


## 8. Backtest metrics

In [None]:
def backtest_metrics(sim):
    """Summarize a simulation table with stake, pnl, home_win and bankroll_after columns.

    Returns a dict with:
    - total_bets: number of rows with a positive stake.
    - roi: total pnl divided by total amount staked, NaN if nothing was staked.
    - hit_rate: share of placed bets where home_win == 1, NaN if no bets.
    - final_bankroll: last bankroll_after value, NaN for an empty table.
    - max_drawdown: largest peak-to-trough drop in currency units, NaN for an
      empty table.  The bankroll before the first row counts as the first peak.
    """
    placed = sim["stake"] > 0
    total_bets = int(placed.sum())
    total_staked = sim["stake"].sum()
    roi = sim["pnl"].sum() / total_staked if total_staked > 0 else np.nan
    hit_rate = (sim.loc[placed, "home_win"] == 1).mean() if total_bets > 0 else np.nan

    if len(sim):
        after = sim["bankroll_after"].reset_index(drop=True)
        # bankroll_after only records post-game values.  Recover the opening
        # bankroll from the first row so a loss on the very first bet is
        # measured against it instead of being invisible.
        opening = after.iloc[0] - sim["pnl"].iloc[0]
        equity = pd.concat([pd.Series([opening]), after], ignore_index=True)
        max_dd = (equity.cummax() - equity).max()
        final_bankroll = float(after.iloc[-1])
    else:
        max_dd = np.nan
        final_bankroll = np.nan

    return {
        "total_bets": total_bets,
        "roi": roi,
        "hit_rate": hit_rate,
        "final_bankroll": final_bankroll,
        "max_drawdown": float(max_dd) if not pd.isna(max_dd) else np.nan,
    }

# Summarize the simulation; the bare name on the last line displays the dict in the notebook.
metrics = backtest_metrics(sim)
metrics


## 9. Save enriched backtest results

In [None]:
# Persist the per-game simulation table (including stake, pnl and bankroll_after)
# without the DataFrame index.  An existing file at this path is overwritten.
out_path = "./data/backtest_results.csv"
sim.to_csv(out_path, index=False)
print(f"Saved backtest results to {out_path}")
