# NFL Modeling with Betting Lines: nflfastR + RBSDM + The Odds API

This notebook extends the end-to-end pipeline to:  
- Export weekly predictions to CSV.  
- Pull live betting lines from **The Odds API** for `americanfootball_nfl`, compute implied probabilities, and compare to model outputs.  

Style note: this notebook uses two spaces after periods and hyphens instead of em dashes.  

**Prereqs:** `pip install nfl_data_py pandas numpy requests scikit-learn matplotlib` and an Odds API key in the `ODDS_API_KEY` environment variable.  


## 1. Setup and installs

In [None]:
# If running locally, uncomment as needed.
# %pip install -U pandas numpy scikit-learn matplotlib requests nfl_data_py


## 2. Imports and configuration

In [None]:
import os, io, json, time, math, requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, brier_score_loss, accuracy_score
from sklearn.inspection import permutation_importance

from nfl_data_py import import_pbp_data, import_schedules

# Notebook display defaults: allow wide frames to print in full and draw
# 8x6 figures with a grid.
pd.options.display.max_columns = 120
pd.options.display.width = 160
plt.rcParams.update({"figure.figsize": (8, 6), "axes.grid": True})


## 3. Parameters

In [None]:
# Seasons to load (2019 through 2024 inclusive) and the slate to predict.
SEASONS = [*range(2019, 2025)]
PREDICT_SEASON, PREDICT_WEEK = 2024, 10

# Location of the RBSDM team stats CSV.
RBSDM_URL = "https://rbsdm.com/stats/stats.csv"

# Output directory for prediction CSVs; created up front so later writes succeed.
PRED_DIR = "./data/weekly_predictions"
os.makedirs(PRED_DIR, exist_ok=True)

# The Odds API settings.  The key comes from the environment
# (set in your shell: export ODDS_API_KEY=...) and is None when unset.
ODDS_API_KEY = os.getenv("ODDS_API_KEY")
ODDS_SPORT = "americanfootball_nfl"  # The Odds API sport key
ODDS_REGION = "us"                   # regions: us, uk, eu, au
ODDS_MARKETS = "h2h,spreads,totals"  # comma-separated markets to pull
ODDS_ODDSFORMAT = "american"         # american, decimal


## 4. Load data and engineer features

In [None]:
# Pull play-by-play for every season and discard rows without a play_type.
pbp = import_pbp_data(SEASONS)
pbp = pbp[pbp["play_type"].notna()].copy()

# Schedule with a binary label: 1 when `result` is positive, otherwise 0
# (a missing `result` also yields 0 because NaN > 0 is False).
sched = import_schedules(SEASONS)
sched = sched.assign(home_win=sched["result"].gt(0).astype(int))

# Regular-season games only, restricted to the columns used downstream.
sched_small = sched.loc[
    sched["game_type"] == "REG",
    [
        "game_id", "season", "week", "game_type", "gameday",
        "home_team", "away_team", "home_score", "away_score", "home_win",
    ],
].reset_index(drop=True)


def _team_week_means(plays, team_col, prefix):
    """Summarize plays per (season, week, team_col) into `prefix`-named columns.

    The grouping column is renamed to `team` so offense and defense frames
    share the same key.
    """
    spec = {
        f"{prefix}_plays": ("play_id", "count"),
        f"{prefix}_epa_mean": ("epa", "mean"),
        f"{prefix}_success": ("success", "mean"),
        f"{prefix}_yards_gained": ("yards_gained", "mean"),
        f"{prefix}_pass_rate": ("pass", "mean"),
        f"{prefix}_rush_rate": ("rush", "mean"),
    }
    by_team_week = plays.groupby(["season", "week", team_col], as_index=False)
    return by_team_week.agg(**spec).rename(columns={team_col: "team"})


# Offense aggregates (grouped by the team in possession).
off_agg = _team_week_means(pbp, "posteam", "off")

# Defense aggregates (grouped by the defending team).
def_agg = _team_week_means(pbp, "defteam", "def")

# One row per team-week; the outer join keeps weeks present on only one side.
team_week = off_agg.merge(def_agg, on=["season", "week", "team"], how="outer")

def add_group_rolls(df, group_col="team"):
    """Add prior-row lag and rolling-mean features for every stat column.

    For each column other than the keys (`group_col`, `season`, `week`,
    `season_week_index`) three columns are appended:

    - `<col>_lag1`:  the value from the group's previous row.
    - `<col>_roll3`: mean of the group's previous 3 rows.
    - `<col>_roll5`: mean of the group's previous 5 rows.

    All three are built from values shifted by one row within the group, so a
    row never sees its own value.  Rolling means are NaN until a full window
    of prior rows exists for that group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `season`, `week` and `group_col`.
    group_col : str
        Column identifying the entity whose history is rolled.

    Returns
    -------
    pandas.DataFrame
        A sorted copy of `df` (by `group_col`, then time) with a
        `season_week_index` column and the feature columns added.
    """
    df = df.copy()

    # A fixed stride of 18 makes weeks above 18 (e.g. postseason rows) collide
    # with the first weeks of the following season.  Widen the stride when the
    # data contains such weeks; for weeks <= 18 the index is unchanged.
    max_week = df["week"].max()
    stride = 18 if pd.isna(max_week) else max(18, int(max_week))
    df["season_week_index"] = (df["season"] - df["season"].min()) * stride + df["week"]
    df = df.sort_values([group_col, "season_week_index"])

    key_cols = {group_col, "season", "week", "season_week_index"}
    stat_cols = [c for c in df.columns if c not in key_cols]
    for col in stat_cols:
        # Shift once per column and reuse it for the lag and both windows.
        prior = df.groupby(group_col)[col].shift(1)
        df[f"{col}_lag1"] = prior
        # Roll inside each group explicitly so a window can never span two groups.
        prior_by_group = prior.groupby(df[group_col])
        for window in (3, 5):
            df[f"{col}_roll{window}"] = prior_by_group.transform(
                lambda s, w=window: s.rolling(w).mean()
            )
    return df

# Keep the keys plus the `_lag1` / `_roll3` / `_roll5` feature columns only.
team_week = add_group_rolls(team_week, group_col="team")
lag_cols = [
    name for name in team_week.columns
    if name.endswith("_lag1") or name.endswith("_roll3") or name.endswith("_roll5")
]
team_week_lagged = team_week.loc[:, ["season", "week", "team", *lag_cols]].copy()


## 5. RBSDM join

In [None]:
import io
import requests

# Download the RBSDM stats CSV; a non-2xx response aborts the cell.
r = requests.get(RBSDM_URL, timeout=30)
r.raise_for_status()
rbsdm = pd.read_csv(io.StringIO(r.text))

# Columns of interest.  Header names are matched case-insensitively first; if
# that does not surface a literal "team" column, fall back to the exact names.
_rbsdm_wanted = [
    "team", "season", "off_epa", "def_epa", "off_success", "def_success",
    "off_pass_epa", "off_rush_epa", "def_pass_epa", "def_rush_epa",
]
_rbsdm_wanted_set = set(_rbsdm_wanted)
possible_cols = [c for c in rbsdm.columns if c.lower() in _rbsdm_wanted_set]
if "team" not in possible_cols:
    possible_cols = list(_rbsdm_wanted)

# Keep whichever of those columns actually exist, and expose the team column
# under the join key name (a no-op when there is no "team" column).
# NOTE(review): this assumes the RBSDM `team` column holds full team names
# matching the values of abbr_to_full below - confirm against the CSV.
rbsdm_small = rbsdm.loc[:, [c for c in possible_cols if c in rbsdm.columns]].copy()
rbsdm_small = rbsdm_small.rename(columns={"team": "team_full"})

# nflfastR-style abbreviation -> full team name.
abbr_to_full = {
    "ARI": "Arizona Cardinals",
    "ATL": "Atlanta Falcons",
    "BAL": "Baltimore Ravens",
    "BUF": "Buffalo Bills",
    "CAR": "Carolina Panthers",
    "CHI": "Chicago Bears",
    "CIN": "Cincinnati Bengals",
    "CLE": "Cleveland Browns",
    "DAL": "Dallas Cowboys",
    "DEN": "Denver Broncos",
    "DET": "Detroit Lions",
    "GB": "Green Bay Packers",
    "HOU": "Houston Texans",
    "IND": "Indianapolis Colts",
    "JAX": "Jacksonville Jaguars",
    "KC": "Kansas City Chiefs",
    "LV": "Las Vegas Raiders",
    "LAC": "Los Angeles Chargers",
    "LAR": "Los Angeles Rams",
    "MIA": "Miami Dolphins",
    "MIN": "Minnesota Vikings",
    "NE": "New England Patriots",
    "NO": "New Orleans Saints",
    "NYG": "New York Giants",
    "NYJ": "New York Jets",
    "PHI": "Philadelphia Eagles",
    "PIT": "Pittsburgh Steelers",
    "SEA": "Seattle Seahawks",
    "SF": "San Francisco 49ers",
    "TB": "Tampa Bay Buccaneers",
    "TEN": "Tennessee Titans",
    "WAS": "Washington Commanders",
}
teams_map = pd.DataFrame(list(abbr_to_full.items()), columns=["team", "team_full"])

# Attach full names to each team-week, then the season-level RBSDM columns.
team_week_enriched = (
    team_week_lagged
    .merge(teams_map, on="team", how="left")
    .merge(rbsdm_small, on=["team_full", "season"], how="left")
)


## 6. Game-level assembly and model training

In [None]:
def prep_game_table(sched_small, team_week_enriched):
    """Build the game-level modelling table from schedule and team-week features.

    Each game row receives the home team's and the away team's feature columns
    (prefixed `home_` / `away_`) for that season and week.

    Parameters
    ----------
    sched_small : pandas.DataFrame
        One row per game with `season`, `week`, `home_team`, `away_team`,
        `home_score`, `away_score` and `home_win`.
    team_week_enriched : pandas.DataFrame
        One row per team-week with `season`, `week`, `team` and feature columns.

    Returns
    -------
    tuple
        `(games, X, y, keep_cols)` where `games` holds the retained game rows,
        `X` the numeric features, `y` the home-win label and `keep_cols` the
        feature names.  All three frames share the same 0..n-1 index, so
        boolean masks built from `games` can be applied to `X` and `y`.
    """
    # Post-game outcomes also carry the home_/away_ prefix; using them as
    # features would leak the label into the model.
    outcome_cols = {"home_score", "away_score", "home_win"}

    def join_side(prefix):
        """Return the team-week features renamed for one side of the matchup."""
        team_key = f"{prefix}_team"
        side = team_week_enriched.rename(columns={"team": team_key})
        # Join keys (and the descriptive team_full) stay unprefixed.
        passthrough = {"season", "week", team_key, "team_full"}
        return side.rename(
            columns={c: f"{prefix}_{c}" for c in side.columns if c not in passthrough}
        )

    games = sched_small
    for prefix in ("home", "away"):
        games = games.merge(
            join_side(prefix), on=["season", "week", f"{prefix}_team"], how="left"
        )

    games["y_home_win"] = games["home_win"].astype(int)

    feat_cols = [
        c for c in games.columns
        if c.startswith(("home_", "away_")) and c not in outcome_cols
    ]
    X = games[feat_cols].select_dtypes(include=[np.number])
    y = games["y_home_win"]

    # Keep features populated in at least 80% of games, then keep only games
    # where every remaining feature is present.
    min_non_na = int(0.8 * len(X))
    keep_cols = [c for c in X.columns if X[c].notna().sum() >= min_non_na]
    complete = X[keep_cols].notna().all(axis=1).to_numpy()

    # Reset all three indexes together so they stay aligned for the caller.
    return (
        games[complete].reset_index(drop=True),
        X.loc[complete, keep_cols].reset_index(drop=True),
        y[complete].reset_index(drop=True),
        keep_cols,
    )

games_valid, X, y, keep_cols = prep_game_table(sched_small, team_week_enriched)

# Hold out the most recent season for evaluation; train on all earlier ones.
last_season = max(SEASONS)

# Use positional NumPy masks.  games_valid carries a fresh 0..n-1 index, so a
# boolean Series built from it would be aligned by label against X and y and
# can raise or pick the wrong rows whenever their indexes differ.
train_mask = (games_valid["season"] < last_season).to_numpy()
test_mask = (games_valid["season"] == last_season).to_numpy()

X_train, y_train = X[train_mask], y[train_mask]
X_test,  y_test  = X[test_mask],  y[test_mask]

# Standardize for the linear model; the scaler is fitted on training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

logit = LogisticRegression(max_iter=250, solver="lbfgs")
logit.fit(X_train_scaled, y_train)
logit_proba = logit.predict_proba(X_test_scaled)[:,1]  # P(home win)

# The gradient boosting model is fitted on the unscaled features.
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)
gb_proba = gb.predict_proba(X_test)[:,1]  # P(home win)

def eval_metrics(y_true, y_pred_proba, threshold=0.5, label="model"):
    """Score predicted home-win probabilities against the observed labels.

    Returns a dict with the given `label`, the ROC AUC and Brier score of the
    probabilities, and the accuracy of the hard calls obtained by comparing
    the probabilities with `threshold`.  The accuracy key is always
    "Accuracy@0.5", whatever `threshold` is passed.
    """
    from sklearn.metrics import accuracy_score, brier_score_loss, roc_auc_score

    return {
        "label": label,
        "AUC": roc_auc_score(y_true, y_pred_proba),
        "Brier": brier_score_loss(y_true, y_pred_proba),
        "Accuracy@0.5": accuracy_score(
            y_true, (y_pred_proba >= threshold).astype(int)
        ),
    }

# One row of hold-out metrics per model.
results = pd.DataFrame([
    eval_metrics(y_test, proba, label=model_name)
    for model_name, proba in (
        ("Logistic Regression", logit_proba),
        ("Gradient Boosting", gb_proba),
    )
])
results


## 7. Weekly predictions and CSV export

In [None]:
def predict_week(games_valid, X_cols, scaler, logit, gb, season, week, out_dir=PRED_DIR):
    """Score one week's games with both models and write the result to CSV.

    Parameters
    ----------
    games_valid : pandas.DataFrame
        Game table containing the schedule columns and the feature columns.
    X_cols : list of str
        Feature columns, in the order the models were fitted on.
    scaler, logit, gb
        Fitted scaler and classifiers.  `logit` receives scaled features and
        `gb` the unscaled ones.
    season, week
        Slate to score.
    out_dir : str
        Directory for the CSV; created if it does not exist.

    Returns
    -------
    pandas.DataFrame
        One row per game with home-win probabilities and 0/1 home picks.  When
        no games match, an empty frame with the same columns is returned and
        nothing is written.
    """
    cols = ["season","week","gameday","home_team","away_team","home_win",
            "proba_home_logit","proba_home_gb","pick_logit_home","pick_gb_home"]

    g = games_valid[(games_valid["season"] == season) & (games_valid["week"] == week)].copy()
    if g.empty:
        print("No games found for that season and week.")
        # Keep the expected columns so downstream merges still find their keys.
        return pd.DataFrame(columns=cols)

    X_week = g[X_cols].copy()
    X_week_scaled = scaler.transform(X_week)

    g["proba_home_logit"] = logit.predict_proba(X_week_scaled)[:,1]
    g["proba_home_gb"]    = gb.predict_proba(X_week)[:,1]
    # Pick the home side when its probability is at least 0.5.
    g["pick_logit_home"]  = (g["proba_home_logit"] >= 0.5).astype(int)
    g["pick_gb_home"]     = (g["proba_home_gb"] >= 0.5).astype(int)

    out = g[cols].sort_values(["week","home_team"]).reset_index(drop=True)

    # Save CSV.  A caller-supplied out_dir may not exist yet.
    os.makedirs(out_dir, exist_ok=True)
    fname = f"{out_dir}/predictions_{season}_wk{week}.csv"
    out.to_csv(fname, index=False)
    print(f"Saved predictions to {fname}")
    return out

# Score the configured slate and preview the first rows.
preds_this_week = predict_week(
    games_valid, keep_cols, scaler, logit, gb,
    season=PREDICT_SEASON, week=PREDICT_WEEK, out_dir=PRED_DIR,
)
preds_this_week.head()


## 8. Odds helper functions

In [None]:
def moneyline_to_prob(ml):
    """Convert an American moneyline to its implied probability.

    No vigorish is removed here.  A positive line +X maps to 100 / (X + 100);
    any other line -X maps to X / (X + 100).  None or NaN yields NaN.
    """
    if ml is None or pd.isna(ml):
        return np.nan
    odds = float(ml)
    if odds > 0:
        return 100.0 / (odds + 100.0)
    # Favorite (or zero) line: the amount risked is the negated quote.
    risk = -odds
    return risk / (risk + 100.0)

def normalize_two_way(p_home, p_away):
    """Rescale a two-way pair of implied probabilities to sum to 1.0.

    This removes the vig when both sides are given.  The inputs are returned
    unchanged when either one is NaN or when their sum is not positive.
    """
    unchanged = (p_home, p_away)
    if np.isnan(p_home) or np.isnan(p_away):
        return unchanged
    total = p_home + p_away
    if total <= 0:
        return unchanged
    return p_home / total, p_away / total

# Full team name (as it appears in The Odds API `home_team` / `away_team`
# fields) -> nflfastR abbreviation.  Names missing here map to NaN downstream.
ODDS_TEAM_NAME_TO_ABBR = {
    "Arizona Cardinals": "ARI",
    "Atlanta Falcons": "ATL",
    "Baltimore Ravens": "BAL",
    "Buffalo Bills": "BUF",
    "Carolina Panthers": "CAR",
    "Chicago Bears": "CHI",
    "Cincinnati Bengals": "CIN",
    "Cleveland Browns": "CLE",
    "Dallas Cowboys": "DAL",
    "Denver Broncos": "DEN",
    "Detroit Lions": "DET",
    "Green Bay Packers": "GB",
    "Houston Texans": "HOU",
    "Indianapolis Colts": "IND",
    "Jacksonville Jaguars": "JAX",
    "Kansas City Chiefs": "KC",
    "Las Vegas Raiders": "LV",
    "Los Angeles Chargers": "LAC",
    "Los Angeles Rams": "LAR",
    "Miami Dolphins": "MIA",
    "Minnesota Vikings": "MIN",
    "New England Patriots": "NE",
    "New Orleans Saints": "NO",
    "New York Giants": "NYG",
    "New York Jets": "NYJ",
    "Philadelphia Eagles": "PHI",
    "Pittsburgh Steelers": "PIT",
    "Seattle Seahawks": "SEA",
    "San Francisco 49ers": "SF",
    "Tampa Bay Buccaneers": "TB",
    "Tennessee Titans": "TEN",
    "Washington Commanders": "WAS",
}


## 9. Pull current The Odds API lines for upcoming games

In [None]:
def fetch_odds_oddsapi(api_key, sport=ODDS_SPORT, regions=ODDS_REGION, markets=ODDS_MARKETS, odds_format=ODDS_ODDSFORMAT):
    """Request odds for `sport` from The Odds API v4 and return the decoded JSON.

    Raises RuntimeError when `api_key` is empty; a non-2xx response is raised
    by `raise_for_status()`.  The request times out after 30 seconds.
    """
    if not api_key:
        raise RuntimeError("ODDS_API_KEY not set.  Set env var ODDS_API_KEY before running.")
    response = requests.get(
        f"https://api.the-odds-api.com/v4/sports/{sport}/odds/",
        params=dict(
            regions=regions,
            markets=markets,
            oddsFormat=odds_format,
            apiKey=api_key,
        ),
        timeout=30,
    )
    response.raise_for_status()
    return response.json()

def odds_to_frame(data):
    """Flatten The Odds API response into one DataFrame row per game.

    Per game:
    - `ml_home` / `ml_away`: the highest h2h price seen for that side across
      all bookmakers.
    - `spread_home` / `spread_away`: the spread point from the last bookmaker
      that lists that side.
    - `total_points`: the point of the first outcome in the last non-empty
      totals market.
    Anything not found stays NaN.
    """
    records = []
    for event in data:
        home_name = event.get("home_team")
        away_name = event.get("away_team")
        record = {
            "commence_time": event.get("commence_time"),
            "home_name": home_name,
            "away_name": away_name,
            "ml_home": np.nan,
            "ml_away": np.nan,
            "spread_home": np.nan,
            "spread_away": np.nan,
            "total_points": np.nan,
        }

        # Walk every market of every bookmaker in response order.
        all_markets = (
            market
            for book in event.get("bookmakers", [])
            for market in book.get("markets", [])
        )
        for market in all_markets:
            kind = market.get("key")
            outcomes = market.get("outcomes", [])

            if kind == "totals":
                # Just capture a representative total.
                if outcomes:
                    record["total_points"] = outcomes[0].get("point", record["total_points"])
                continue
            if kind != "h2h" and kind != "spreads":
                continue

            for outcome in outcomes:
                team = outcome.get("name")
                if team == home_name:
                    side = "home"
                elif team == away_name:
                    side = "away"
                else:
                    continue
                if kind == "h2h":
                    # Moneyline: keep the best (largest) price seen so far.
                    field = f"ml_{side}"
                    record[field] = np.nanmax([record[field], outcome.get("price")])
                else:
                    # Spread: later bookmakers overwrite earlier ones.
                    record[f"spread_{side}"] = outcome.get("point")

        records.append(record)
    return pd.DataFrame(records)

# Example fetch.  Comment out if you prefer to skip during dev.
if not ODDS_API_KEY:
    print("ODDS_API_KEY not found in environment.  Set it to enable live odds fetching.")
else:
    raw = fetch_odds_oddsapi(ODDS_API_KEY)
    odds_df = odds_to_frame(raw)
    # Translate provider team names to abbreviations; names that are not in
    # the mapping become NaN.
    odds_df = odds_df.assign(
        home_team=odds_df["home_name"].map(ODDS_TEAM_NAME_TO_ABBR),
        away_team=odds_df["away_name"].map(ODDS_TEAM_NAME_TO_ABBR),
    )
    print(f"Pulled {len(odds_df)} games from The Odds API.")
    display(odds_df.head())


## 10. Join odds to predictions and compute edges

In [None]:
def join_odds_to_preds(preds_df, odds_df, season, week):
    """Attach market lines to model predictions, compute edges, and save a CSV.

    Parameters
    ----------
    preds_df : pandas.DataFrame
        Predictions with `home_team`, `away_team`, `proba_home_logit` and
        `proba_home_gb`.
    odds_df : pandas.DataFrame
        Odds with `home_team`, `away_team`, moneyline, spread and total columns.
    season, week
        Used only to name the output file.

    Returns
    -------
    pandas.DataFrame
        `preds_df` left-joined with the odds, plus raw and normalized implied
        probabilities and `edge_*_home` = model probability minus normalized
        market probability.  Games without odds keep NaN in those columns.
    """
    odds_cols = ["home_team","away_team","ml_home","ml_away","spread_home","spread_away","total_points"]

    # Merge on home_team and away_team.  If multiple rows per matchup in odds, take the first.
    o = odds_df.drop_duplicates(subset=["home_team","away_team"])
    out = preds_df.merge(o[odds_cols], on=["home_team","away_team"], how="left")

    # Implied probabilities from moneylines
    out["impl_home"] = out["ml_home"].apply(moneyline_to_prob)
    out["impl_away"] = out["ml_away"].apply(moneyline_to_prob)

    # Normalize pair by pair.  Building the columns from a list keeps this
    # working for an empty frame, where a row-wise apply cannot be unpacked
    # into two columns.
    pairs = [
        normalize_two_way(p_home, p_away)
        for p_home, p_away in zip(out["impl_home"], out["impl_away"])
    ]
    out["impl_home_norm"] = pd.Series([p[0] for p in pairs], index=out.index, dtype=float)
    out["impl_away_norm"] = pd.Series([p[1] for p in pairs], index=out.index, dtype=float)

    # Edge vs model probabilities
    out["edge_logit_home"] = out["proba_home_logit"] - out["impl_home_norm"]
    out["edge_gb_home"]    = out["proba_home_gb"]    - out["impl_home_norm"]

    # Save
    fname = f"{PRED_DIR}/predictions_with_odds_{season}_wk{week}.csv"
    out.to_csv(fname, index=False)
    print(f"Saved predictions with odds to {fname}")
    return out

if not ODDS_API_KEY:
    print("Skipping join since ODDS_API_KEY not set.")
else:
    # Reuse predictions from the earlier cell when present; otherwise (or when
    # that frame is empty) score the configured slate again.
    preds = globals().get("preds_this_week")
    if preds is None or preds.empty:
        preds = predict_week(games_valid, keep_cols, scaler, logit, gb, PREDICT_SEASON, PREDICT_WEEK, PRED_DIR)
    joined = join_odds_to_preds(preds, odds_df, PREDICT_SEASON, PREDICT_WEEK)
    # Show the ten largest gradient-boosting edges on the home side.
    display(joined.sort_values("edge_gb_home", ascending=False).head(10))


## 11. Quick visualization: model vs implied

In [None]:
if not (ODDS_API_KEY and 'joined' in globals()):
    print("No joined data to plot.  Ensure ODDS_API_KEY is set and odds were fetched.")
else:
    # Scatter of the normalized market probability (x) against the gradient
    # boosting probability (y) for the home side of each game.
    plt.figure()
    plt.scatter(
        x=joined["impl_home_norm"],
        y=joined["proba_home_gb"],
        alpha=0.7,
    )
    plt.xlabel("Implied probability home (normalized)")
    plt.ylabel("Model probability home - Gradient Boosting")
    plt.title("Market implied vs model probability - home side")
    plt.tight_layout()
    plt.show()


## 12. Notes

- The Odds API has per-minute rate limits and daily quotas.  Cache your results locally for repeat runs.  
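- Moneyline implied probabilities are normalized here so the two sides sum to 1, which removes the book vig.  Because the best price per side is taken across bookmakers, treat the result as an approximation of a single book's no-vig line.  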
- Mapping between Odds API team names and nflfastR abbreviations is manual.  Adjust if the provider updates naming.  
- Consider incorporating closing spreads and totals as model features for calibration improvements.  
