# Baseline xG Model — La Liga 2017/18

- Data: StatsBomb Open Data (competition_id=11, season_id=90)
- Target: goal (0/1)
- Features: distance, angle, is_head
- Model: Logistic Regression (baseline), plus RF / XGB comparison
- Eval: AUC, Brier score
- Viz: shot map w/ predicted xG
- Notes: penalties excluded (common xG convention)

1) Initial Setup

In [4]:
# Imports
import os, sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, brier_score_loss, roc_curve

# Make the repository's src/ directory importable.
# If the notebook runs from a directory named "notebooks", its parent is used
# as the repo root; otherwise the current working directory is the repo root.
_cwd = Path.cwd()
repo_root = _cwd.parent if _cwd.name == "notebooks" else _cwd
sys.path.append(str(repo_root / "src"))

# Project-local helper (expected to live under src/, hence the path tweak above).
from statsbomb_utils import load_season_shots

# Plot defaults: 6x4 figures with grid lines switched on.
plt.rcParams.update({"figure.figsize": (6, 4), "axes.grid": True})

2) Load season shots

In [5]:
# Directory of the cloned StatsBomb open-data checkout inside the repo.
BASE = str(repo_root / "data" / "statsbomb" / "data")

# Loader arguments gathered in one place so they are easy to tweak.
season_query = {
    "base_dir": BASE,
    "competition_id": 11,       # La Liga
    "season_id": 90,            # 2017/2018
    "team_name": None,          # all teams; set "Barcelona" for team-only data
    "include_penalties": False,
}

# NOTE(review): the saved output of this cell is KeyError: 'home_team.name'.
# Presumably it is raised inside load_season_shots -- confirm against the
# columns that the utility expects from the match data.
shots = load_season_shots(**season_query)

print("Shots loaded:", shots.shape)
shots.head()


KeyError: 'home_team.name'

3) Quick sanity checks

In [None]:
# Headline rates: mean of the 0/1 goal flag and of the header flag.
print("Goal rate:", shots["goal"].mean().round(3))
print("Header share:", shots["is_head"].mean().round(3))

# Printed explicitly: a notebook only renders a bare expression when it is the
# last statement of the cell, and more statements follow this one.
print(shots[["distance", "angle"]].describe())

# %%
# (Optional) tiny exploration plots.
# Low alpha so that overlapping shots at the same distance/angle stay readable.
plt.scatter(shots["distance"], shots["goal"], alpha=0.05)
plt.xlabel("Distance to goal (SB units)")
plt.ylabel("Goal (0/1)")
plt.title("Distance vs Goal")
plt.show()

plt.scatter(shots["angle"], shots["goal"], alpha=0.05)
plt.xlabel("Angle (radians approx.)")
plt.ylabel("Goal (0/1)")
plt.title("Angle vs Goal")
plt.show()


4) Train / test split & baseline Logistic Regression

In [None]:
# Model inputs and binary target.
FEATURES = ["distance", "angle", "is_head"]
y = shots["goal"].astype(int)
X = shots[FEATURES].copy()

# 80/20 hold-out split, stratified on the label and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

# Baseline model: plain logistic regression on the three raw features.
logit = LogisticRegression(max_iter=2000).fit(X_train, y_train)

# Column 1 of predict_proba is taken as the goal probability (second class of
# the fitted 0/1 target).
test_proba = logit.predict_proba(X_test)
y_pred = test_proba[:, 1]

# Hold-out metrics: ranking quality (AUC) and probability accuracy (Brier).
brier = brier_score_loss(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
print(f"Logistic Regression  AUC={auc:.3f}  Brier={brier:.3f}")

5) ROC curve

In [None]:
# ROC points for the hold-out predictions; the thresholds are not used.
fpr, tpr, _roc_thresholds = roc_curve(y_test, y_pred)

roc_fig, roc_ax = plt.subplots(figsize=(5, 5))
roc_ax.plot(fpr, tpr, label=f"Logit AUC={auc:.3f}")
# Diagonal reference line from (0, 0) to (1, 1).
roc_ax.plot([0, 1], [0, 1], "--", lw=1, color="gray")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve")
roc_ax.legend()
plt.show()

6) Shot map with predicted xG (simple scatter on StatsBomb pitch coordinates)

In [None]:
# Score every row of `shots` with the fitted model. This includes rows that
# were part of the training split, so the map is descriptive, not an evaluation.
all_proba = logit.predict_proba(shots[FEATURES])
shots["_pred_xg"] = all_proba[:, 1]

# Scatter of shot locations coloured by predicted xG.
fig, ax = plt.subplots(figsize=(10, 6))
sc = ax.scatter(
    shots["x"],
    shots["y"],
    c=shots["_pred_xg"],
    s=12,
    alpha=0.85,
)
# Flip the vertical axis so that y grows downwards on the plot.
ax.invert_yaxis()
fig.colorbar(sc, ax=ax, label="Predicted xG")
ax.set_xlabel("x (0 → 120)")
ax.set_ylabel("y (0 → 80)")
ax.set_title("Shot map – Predicted xG (Logistic Regression)")
plt.show()

7) Stretch: model comparison (RF & XGB)

In [None]:
# Compare the baseline against tree ensembles on the same train/test split.
# GradientBoostingClassifier is imported but not used in this cell.
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

models = {
    "Logistic": LogisticRegression(max_iter=2000),
    "RandomForest": RandomForestClassifier(
        n_estimators=400, max_depth=None, random_state=42, n_jobs=-1
    ),
}

# xgboost is the only optional dependency here, so only its import is guarded.
# Real failures while fitting or scoring are no longer reported as
# "install xgboost", and the other models still run when xgboost is missing.
try:
    from xgboost import XGBClassifier
except ImportError as e:
    print("Skipped XGBoost (install xgboost to enable). Error:", e)
else:
    models["XGBoost"] = XGBClassifier(
        n_estimators=500, learning_rate=0.05, max_depth=4,
        subsample=0.8, colsample_bytree=0.8, eval_metric="logloss",
        tree_method="hist", n_jobs=-1
    )

# Fit each model on the training split and score it on the hold-out split.
rows = []
for name, m in models.items():
    m.fit(X_train, y_train)
    p = m.predict_proba(X_test)[:, 1]
    rows.append({"model": name,
                 "AUC": roc_auc_score(y_test, p),
                 "Brier": brier_score_loss(y_test, p)})

cmp_df = pd.DataFrame(rows).sort_values("AUC", ascending=False)
# Kept as the last top-level statement so that the notebook renders the table;
# a bare expression inside a try block is never displayed.
cmp_df

8) Coefficients and saved artifacts

In [None]:
# Pair each fitted coefficient with its feature name, largest first.
coef = pd.Series(logit.coef_[0], index=FEATURES)
coef = coef.sort_values(ascending=False)
print("Logit coefficients:\n", coef)

# Bar chart of the same values.
coef.plot.bar()
plt.title("Logistic Regression Coefficients")
plt.show()

# Save minimal artifacts: ROC image, shot map image, coefficient CSV.
reports_dir = repo_root / "reports"
out_dir = reports_dir / "figures"
# parents=True also creates reports/ itself, which the CSV below is written to.
out_dir.mkdir(parents=True, exist_ok=True)

# ROC image, written to disk and closed without being shown.
roc_fig, roc_ax = plt.subplots(figsize=(5, 5))
roc_ax.plot(fpr, tpr, label=f"Logit AUC={auc:.3f}")
roc_ax.plot([0, 1], [0, 1], "--", lw=1, color="gray")
roc_ax.set_xlabel("FPR")
roc_ax.set_ylabel("TPR")
roc_ax.set_title("ROC Curve")
roc_ax.legend()
roc_fig.savefig(out_dir / "roc_logit.png", dpi=180, bbox_inches="tight")
plt.close(roc_fig)

# Shot map image, using the "_pred_xg" column already stored on `shots`.
fig, ax = plt.subplots(figsize=(10, 6))
sc = ax.scatter(
    shots["x"],
    shots["y"],
    c=shots["_pred_xg"],
    s=12,
    alpha=0.85,
)
ax.invert_yaxis()
fig.colorbar(sc, ax=ax, label="Predicted xG")
ax.set_xlabel("x (0 → 120)")
ax.set_ylabel("y (0 → 80)")
ax.set_title("Shot map – Predicted xG (Logistic Regression)")
fig.savefig(out_dir / "shotmap_logit.png", dpi=180, bbox_inches="tight")
plt.close(fig)

# Coefficients CSV
coef.to_csv(reports_dir / "logit_coefficients.csv")

9) TL;DR cell for README

In [None]:
# Short summary for the README, assembled line by line.
# The empty first and last entries reproduce the leading and trailing blank
# lines of the printed block.
summary_lines = [
    "",
    "Baseline xG – La Liga 2017/18 (penalties excluded)",
    f"Features: {FEATURES}",
    f"Logistic Regression → AUC {auc:.3f}, Brier {brier:.3f}",
    "Next: add contextual features (pressure, assist type, game state), ",
    "      calibration curve, per-team/player season tables.",
    "",
]
print("\n".join(summary_lines))