# Baseline xG (Goal vs Non-Goal) — Skeleton

We'll load ../data/xgoal-db.sqlite if present, build simple geometry/context features, train a calibrated Logistic Regression, and produce basic metrics + explanations. Non-penalty, non-own-goal only. Everything is guarded so the notebook runs even when the DB is missing.


In [None]:
# Core
import os, sqlite3, math, json
from pathlib import Path

# Data
import numpy as np
import pandas as pd

# Modeling
from sklearn.model_selection import GroupKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score, log_loss, brier_score_loss

# Plots
import matplotlib.pyplot as plt

# Repro: seed NumPy's global RNG once so anything that draws from
# np.random later in the notebook is repeatable on a fresh kernel.
SEED = 42
np.random.seed(SEED)

# ---- Config ----
# Path is relative to the notebook's working directory.
DB_PATH = Path("../data/xgoal-db.sqlite")   # <— YOUR DB path
USE_MATERIALIZED_WIDE = True                # if table shots_wide exists, use it
# Number of rows shown when previewing the loaded frame.
PRINT_ROWS = 5


In [None]:
# Query that assembles the "wide" one-row-per-shot frame:
#   * every column of `shots`,
#   * three flags/values from the matching `events` row,
#   * per-shot aggregates over `freeze_frames` (player counts and the mean
#     x/y of rows flagged as keeper).
# Both joins are LEFT JOINs, so shots without a matching event or without
# any freeze-frame rows are kept and their extra columns come back as NULL.
WIDE_CTE_SQL = """
WITH ff AS (
    SELECT
        shot_id,
        COUNT(*)                                                AS ff_count,
        SUM(CASE WHEN teammate = 1 THEN 1 ELSE 0 END)          AS ff_teammates,
        SUM(CASE WHEN teammate = 0 THEN 1 ELSE 0 END)          AS ff_opponents,
        SUM(CASE WHEN keeper   = 1 THEN 1 ELSE 0 END)          AS ff_keeper_count,
        AVG(CASE WHEN keeper   = 1 THEN x END)                 AS ff_keeper_x,
        AVG(CASE WHEN keeper   = 1 THEN y END)                 AS ff_keeper_y
    FROM freeze_frames
    GROUP BY shot_id
)
SELECT
    s.*,
    e.under_pressure       AS event_under_pressure,
    e.counterpress         AS event_counterpress,
    e.duration             AS event_duration,
    ff.ff_count,
    ff.ff_teammates,
    ff.ff_opponents,
    ff.ff_keeper_count,
    ff.ff_keeper_x,
    ff.ff_keeper_y
FROM shots s
LEFT JOIN events e ON e.event_id = s.shot_id
LEFT JOIN ff     ON ff.shot_id   = s.shot_id;
"""

def load_wide_df(db_path: Path, use_materialized: bool = True) -> pd.DataFrame | None:
    if not db_path.exists():
        print(f"[info] DB not found at {db_path.resolve()}. Proceeding with skeleton only.")
        return None
    with sqlite3.connect(db_path) as conn:
        tables = set(pd.read_sql("SELECT name FROM sqlite_master WHERE type='table';", conn)["name"])
        if "shots_wide" in tables and use_materialized:
            df = pd.read_sql("SELECT * FROM shots_wide;", conn)
        else:
            df = pd.read_sql(WIDE_CTE_SQL, conn)
    if not df.empty and not df["shot_id"].is_unique:
        raise ValueError("shot_id must be unique (one row per shot)")
    return df

# Load the wide frame (None when the DB file is absent) and preview it.
df = load_wide_df(DB_PATH, USE_MATERIALIZED_WIDE)
loaded = df is not None
print("[info] df is", df.shape if loaded else None)
if loaded:
    display(df.head(PRINT_ROWS))


In [None]:
# Nothing loaded: substitute a three-shot mock frame so later cells have input.
if df is None or df.empty:
    print("[warn] No data loaded. Creating a tiny mock frame so the notebook runs end-to-end.")
    df = pd.DataFrame(
        {
            "shot_id": ["a", "b", "c"],
            "outcome": ["Goal", "Saved", "Miss"],
            "is_penalty": [False, False, False],
            "is_own_goal": [False, False, False],
            "start_x": [100.0, 90.0, 80.0],
            "start_y": [40.0, 30.0, 50.0],
            "body_part": ["Right Foot", "Head", "Left Foot"],
            "is_set_piece": [False, False, True],
            "is_corner": [False, False, False],
            "is_free_kick": [False, False, True],
            "first_time": [True, False, False],
            "under_pressure": [False, True, False],
            "ff_opponents": [1.0, 3.0, 0.0],
            "ff_keeper_x": [118.0, 119.0, 117.5],
            "ff_keeper_y": [40.0, 41.0, 39.5],
            "match_id": [1, 1, 1],
        }
    )

# Work on a copy and keep only shots that are neither penalties nor own goals.
df0 = df.copy()
mask_valid = ~(df0["is_penalty"].astype(bool) | df0["is_own_goal"].astype(bool))
data = df0[mask_valid].copy()

# Binary target: 1 when the recorded outcome is exactly "Goal".
y = data["outcome"].eq("Goal").astype(int)

# Shot end coordinates are dropped from the feature source when present.
LEAKY = [name for name in ("end_x", "end_y", "end_z") if name in data.columns]
data = data.drop(columns=LEAKY, errors="ignore")
print("rows:", len(data), "goals:", int(y.sum()), "base rate:", float(y.mean()))


In [None]:
# Pitch extent in the coordinate units used by start_x / start_y.
SB_LEN, SB_WID = 120.0, 80.0
# y-coordinate of the goal centre; the goal line is at x == SB_LEN.
GOAL_YC_SB = 40.0
# NOTE(review): this scales a 7.32 m goal assuming 68 m of pitch width maps
# onto SB_WID units (~4.31). If the data provider fixes the posts at other
# coordinates, use those instead — confirm against the data spec.
GOAL_HALF_W_SB = (7.32/2) * (SB_WID / 68.0)  # ~3.66m scaled to SB units


def distance_sb(x, y):
    """Return the straight-line distance from (x, y) to the goal centre.

    Accepts scalars or NumPy arrays (element-wise).
    """
    dx = SB_LEN - x
    dy = GOAL_YC_SB - y
    return np.hypot(dx, dy)


def opening_angle_deg_sb(x, y):
    """Return the angle, in degrees, subtended by the goal mouth at (x, y).

    Args:
        x, y: Shot coordinates; equal-length 1-D arrays (scalars also work
            and yield a length-1 array).

    Returns:
        1-D array of angles in [0, 180]. A point exactly on a goalpost gives
        0.0 rather than NaN.
    """
    left  = np.array([SB_LEN, GOAL_YC_SB - GOAL_HALF_W_SB])
    right = np.array([SB_LEN, GOAL_YC_SB + GOAL_HALF_W_SB])
    p = np.column_stack([x, y])
    v1 = left  - p
    v2 = right - p
    dot = (v1 * v2).sum(axis=1)
    # arctan2(|cross|, dot) is the same angle as arccos(dot / (|v1||v2|)) but
    # needs no division, so a zero-length vector (shot taken from a post)
    # yields 0 instead of a divide-by-zero NaN that would poison the features.
    cross = v1[:, 0] * v2[:, 1] - v1[:, 1] * v2[:, 0]
    return np.degrees(np.arctan2(np.abs(cross), dot))


In [None]:
# Feature matrix, indexed like `data` so rows stay aligned with the target `y`.
X = pd.DataFrame(index=data.index)
X["dist_sb"] = distance_sb(data["start_x"].to_numpy(), data["start_y"].to_numpy())
X["angle_deg_sb"] = opening_angle_deg_sb(data["start_x"].to_numpy(), data["start_y"].to_numpy())


def to_num(s):
    """Fill missing values: float series get 0; anything else gets False and is cast to int."""
    return s.fillna(False).astype(int) if s.dtype != float else s.fillna(0)


# Context flags become 0/1 columns; a flag missing from the data is all zeros.
for col in ["is_set_piece","is_corner","is_free_kick","first_time","under_pressure"]:
    if col in data.columns:
        # Nullable boolean first, so missing values survive until to_num fills them.
        X[col] = to_num(data[col].astype("boolean"))
    else:
        X[col] = 0

X["is_header"] = (data["body_part"] == "Head").astype(int) if "body_part" in data.columns else 0

if {"ff_keeper_x","ff_keeper_y"}.issubset(data.columns):
    # Keeper coordinates are missing for shots with no keeper row in the
    # freeze frame (the SQL AVG / LEFT JOIN yields NULL). Fill with 0.0 — the
    # same default used when the columns are absent — so the model never sees NaN.
    X["gk_depth_sb"] = np.maximum(0.0, SB_LEN - data["ff_keeper_x"]).fillna(0.0)
    X["gk_offset_sb"] = (data["ff_keeper_y"] - GOAL_YC_SB).fillna(0.0)
else:
    X["gk_depth_sb"] = 0.0
    X["gk_offset_sb"] = 0.0

X["ff_opponents"] = data["ff_opponents"].fillna(0) if "ff_opponents" in data.columns else 0
display(X.describe().T.head(10))


In [None]:
def grouped_split(X, y, groups):
    """Return positional (train, test) index arrays for a single hold-out split.

    Args:
        X: Feature frame; only its length is used in the fallback path.
        y: Target aligned with ``X``.
        groups: Series of group labels aligned with ``X`` (e.g. match ids),
            or ``None``.

    Returns:
        ``(train_idx, test_idx)`` as integer position arrays.

    With at least two distinct groups, the first ``GroupKFold`` fold is used
    so no group appears on both sides. Otherwise rows are shuffled with the
    global NumPy RNG and split 80/20.
    """
    if groups is None or groups.isna().all() or groups.nunique() < 2:
        # fallback: simple split by index
        idx = np.arange(len(X))
        np.random.shuffle(idx)
        cut = int(0.8*len(idx))
        return idx[:cut], idx[cut:]

    # GroupKFold raises if n_splits exceeds the number of distinct groups, so
    # cap it; with 5+ groups this is the same 5-fold split as before.
    n_splits = min(5, int(groups.nunique()))
    gkf = GroupKFold(n_splits=n_splits)
    tr, te = next(gkf.split(X, y, groups))
    return tr, te

# Group by match when the column is available (None otherwise), then slice
# features and target with the same positional indices.
groups = data.get("match_id")
train_idx, test_idx = grouped_split(X, y, groups)

X_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx]
X_te, y_te = X.iloc[test_idx], y.iloc[test_idx]
print(X_tr.shape, X_te.shape, float(y_tr.mean()), float(y_te.mean()))


In [None]:
def safe_metrics(y_true, p):
    """Return AUC, log-loss and Brier score for predicted probabilities `p`.

    When `y_true` holds fewer than two distinct labels, every metric is
    reported as NaN instead of being computed.
    """
    names = ("auc", "logloss", "brier")
    if np.unique(y_true).size < 2:
        return dict.fromkeys(names, np.nan)
    scorers = (roc_auc_score, log_loss, brier_score_loss)
    return {name: float(score(y_true, p)) for name, score in zip(names, scorers)}

# Uncalibrated baseline: standardise the features, then plain logistic regression.
base = make_pipeline(
    StandardScaler(with_mean=True, with_std=True),
    LogisticRegression(max_iter=1000, solver="lbfgs", C=1.0)
)
base.fit(X_tr, y_tr)
p_raw = base.predict_proba(X_te)[:,1]
print("Uncalibrated:", safe_metrics(y_te, p_raw))

# Platt scaling has to be fitted on predictions for rows the classifier was
# NOT trained on; calibrating a prefit model on its own training rows gives an
# optimistic sigmoid. With an integer `cv`, CalibratedClassifierCV clones
# `base`, refits it per fold and calibrates on each held-out fold, so `base`
# itself stays fitted on the full training set for inspection.
# NOTE: these internal folds are stratified by class, not grouped by match.
min_class_count = int(y_tr.value_counts().min())
n_cal_folds = min(5, min_class_count)
if n_cal_folds >= 2:
    cal = CalibratedClassifierCV(base, method="sigmoid", cv=n_cal_folds)
else:
    # Too few samples of one class to cross-validate (e.g. the mock frame):
    # keep the notebook running with the previous prefit behaviour.
    print("[warn] Too few samples per class for cross-validated calibration; "
          "calibrating the prefit model on its training data (optimistic).")
    cal = CalibratedClassifierCV(base, method="sigmoid", cv="prefit")
cal.fit(X_tr, y_tr)
p_cal = cal.predict_proba(X_te)[:,1]
print("Calibrated:", safe_metrics(y_te, p_cal))


In [None]:
from sklearn.calibration import calibration_curve

# Reliability diagram for the calibrated model; drawn only when the test set
# has more than 50 rows and contains both classes.
can_plot = len(X_te) > 50 and len(np.unique(y_te)) == 2
if not can_plot:
    print("[info] Skipping reliability plot (not enough test data).")
else:
    prob_true, prob_pred = calibration_curve(y_te, p_cal, n_bins=10, strategy="quantile")
    plt.figure()
    plt.plot([0, 1], [0, 1], "--", label="perfect")
    plt.plot(prob_pred, prob_true, marker="o", label="calibrated")
    plt.xlabel("Predicted probability")
    plt.ylabel("Observed frequency")
    plt.title("Reliability — Calibrated LR")
    plt.legend()
    plt.show()


In [None]:
# Read the coefficients from the fitted `base` pipeline directly. It is the
# same object that was handed to CalibratedClassifierCV, and going through
# `cal.base_estimator` breaks on scikit-learn releases where that attribute
# was renamed to `estimator` and later removed.
lr = base.named_steps["logisticregression"]
coef = lr.coef_[0]
feat_names = X_tr.columns
# Coefficients are on the standardised feature scale (StandardScaler precedes the LR).
coef_df = pd.DataFrame({"feature": feat_names, "coef": coef}).sort_values("coef", ascending=False)
display(coef_df.head(10)); display(coef_df.tail(10))

def distance_phrase(d):
    """Bucket a shot distance into a short label (thresholds at 12 and 18)."""
    if d < 12:
        return "close range"
    if d < 18:
        return "medium range"
    return "long range"
def angle_phrase(a):
    """Bucket an opening angle in degrees into a short label (thresholds at 25 and 10)."""
    if a > 25:
        return "wide angle"
    if a > 10:
        return "moderate angle"
    return "tight angle"
def defenders_phrase(n):
    """Describe the opponent count `n`: a label for 0 or for 3+, otherwise None."""
    if n == 0:
        return "clear sight"
    return "crowded sightline" if n >= 3 else None
def gk_phrase(depth, offset):
    """Describe the keeper position, or return None when nothing is notable.

    Adds "deep GK" when `depth` is at least 6 and "GK off-center" when the
    absolute `offset` is at least 2.5; both labels are joined with "; ".
    """
    checks = (
        (depth >= 6, "deep GK"),
        (abs(offset) >= 2.5, "GK off-center"),
    )
    labels = [label for hit, label in checks if hit]
    return "; ".join(labels) or None

def explain_row(r: pd.Series, xg: float) -> str:
    """Build a one-sentence description of a shot from its feature row.

    Args:
        r: Feature row; must contain ``dist_sb`` and ``angle_deg_sb``. The
            other features are optional and default to 0 when absent.
        xg: Predicted goal probability, shown with two decimals.

    Returns:
        Comma-separated phrases terminated by a full stop.
    """
    headline = f"{xg:.2f} xG — {distance_phrase(r['dist_sb'])}, {angle_phrase(r['angle_deg_sb'])}"
    # Optional phrases in display order; falsy entries are dropped below.
    extras = [
        "first-time" if r.get("first_time", 0) == 1 else None,
        "direct free-kick" if r.get("is_free_kick", 0) == 1 else None,
        defenders_phrase(int(r.get("ff_opponents", 0))),
        gk_phrase(r.get("gk_depth_sb", 0.0), r.get("gk_offset_sb", 0.0)),
    ]
    return ", ".join([headline, *(text for text in extras if text)]) + "."

# Print explanations for up to five held-out shots (skipped when the test set is empty).
n_demo = min(5, len(X_te))
if n_demo <= 0:
    print("[info] No test rows to demo explanations.")
else:
    demo = X_te.head(n_demo)
    px = cal.predict_proba(demo)[:, 1]
    for (_, row), p in zip(demo.iterrows(), px):
        print(explain_row(row, float(p)))


In [None]:
from pathlib import Path
import joblib, json

# Persist the calibrated model, the feature order and a short README.
ART_DIR = Path("artifacts")
ART_DIR.mkdir(parents=True, exist_ok=True)

if len(X_tr) == 0:
    print("[info] Skipping artifact save (no training data).")
else:
    joblib.dump(cal, ART_DIR / "baseline_lr_calibrated.joblib")
    # Column order matters: it is the order the model expects at predict time.
    (ART_DIR / "feature_names.json").write_text(json.dumps(list(X.columns), indent=2))
    readme_text = (
        "Baseline calibrated Logistic Regression for non-penalty shots.\n"
        "Inputs: features in feature_names.json\n"
        "Target: outcome == 'Goal'\n"
        "Calibration: Platt (sigmoid) on train fold\n"
    )
    (ART_DIR / "README.txt").write_text(readme_text)
    print("Saved artifacts to:", ART_DIR.resolve())


## Next steps

- Add defenders-in-cone & nearest-defender distance.
- Try isotonic calibration vs sigmoid.
- Add LightGBM (shallow) and compare Brier/LogLoss.
- Evaluate on every GroupKFold fold (currently only the first split is used) and report mean ± std for stable metrics.
- Plug model + explanation into FastAPI/Streamlit.
