# Jeke xG Lite

Versión simplificada para calcular inputs Poisson a partir de xG.

In [None]:
from understatapi import UnderstatClient
import pandas as pd
import numpy as np
from time import sleep
from tqdm.auto import tqdm
from scipy.optimize import curve_fit

In [None]:
# Available leagues: "EPL", "La_Liga", "Bundesliga", "Serie_A", "Ligue_1", "RFPL"
LEAGUE = "EPL"
# Seasons to download; each entry is passed as `season=` when listing matches.
SEASON = ["2025"]

# Pitch size in metres, used to scale the shot X/Y coordinates into distances.
PITCH_LENGTH_M = 100.0
PITCH_WIDTH_M = 65.0
# Y coordinate treated as the centre of the goal when measuring lateral offset.
GOAL_CENTER_Y = 0.5

# Binning parameters
BIN_WIDTH = 1.0  # width of each distance bin
ALPHA = 1.0  # additive smoothing: p_goal = (goals + ALPHA) / (shots + 2 * ALPHA)
LOW_N = 10  # minimum number of shots a bin needs to enter the fit
D_MIN, D_MAX = 2.0, 33.0  # range of bin mid-distances kept for the fit

## 1. Descarga de datos

In [None]:
def get_league_match_ids(league: str, seasons: list[str]):
    """Collect the matches of a league across one or more seasons.

    Args:
        league: League identifier forwarded to ``UnderstatClient.league``.
        seasons: Season identifiers; one request is made per entry.

    Returns:
        A ``(match_ids, matches)`` tuple: the ``"id"`` of every match, and
        the match records as returned by the client, in request order.
    """
    collected = []
    with UnderstatClient() as client:
        for season in seasons:
            collected += client.league(league=league).get_match_data(season=season)
    ids = [match["id"] for match in collected]
    return ids, collected

def fetch_match_shots(match_id: str, client: UnderstatClient) -> list[dict]:
    """Return every shot of one match as a flat list of dicts.

    Each shot is copied and tagged with ``"h_a"`` (the side it was listed
    under, ``"h"`` or ``"a"``) and with ``"match_id"``.

    Args:
        match_id: Identifier of the match to query.
        client: Open client used to request the shot data.

    Returns:
        Home shots followed by away shots; a side missing from the response
        contributes nothing.
    """
    shot_data = client.match(match=match_id).get_shot_data()
    return [
        dict(shot, h_a=side, match_id=match_id)
        for side in ("h", "a")
        for shot in shot_data.get(side, [])
    ]

def _coerce_or_nan(value, cast):
    """Apply ``cast`` to ``value``; return NaN when it is missing or unparsable."""
    try:
        return cast(value)
    except (TypeError, ValueError):
        return np.nan


def shots_to_df(shots: list[dict]) -> pd.DataFrame:
    """Convert a list of shot dicts into a tidy DataFrame.

    Shots whose ``X`` or ``Y`` coordinate is missing or cannot be parsed are
    dropped. A missing ``minute`` does not drop the shot; it is stored as NaN.

    Args:
        shots: Shot records carrying the keys ``match_id``, ``minute``,
            ``h_a``, ``h_team``, ``a_team``, ``player``, ``result``, ``X``,
            ``Y`` and ``situation``. Absent keys are treated as missing.

    Returns:
        DataFrame with columns ``match_id, minute, team, h_a, player, result,
        X, Y, is_goal, situation`` and a fresh 0..n-1 index. An empty input
        yields an empty frame that still has all of these columns.
    """
    columns = [
        "match_id", "minute", "team", "h_a", "player",
        "result", "X", "Y", "is_goal", "situation",
    ]
    rows = [
        {
            "match_id": shot.get("match_id"),
            "minute": _coerce_or_nan(shot.get("minute"), int),
            # The shooting team is the home team for "h" shots, else the away team.
            "team": shot.get("h_team") if shot.get("h_a") == "h" else shot.get("a_team"),
            "h_a": shot.get("h_a"),
            "player": shot.get("player"),
            "result": shot.get("result"),
            "X": _coerce_or_nan(shot.get("X"), float),
            "Y": _coerce_or_nan(shot.get("Y"), float),
            "is_goal": 1 if shot.get("result") == "Goal" else 0,
            "situation": shot.get("situation"),
        }
        for shot in shots
    ]
    # Passing `columns` keeps the schema even when `rows` is empty, so the
    # dropna below (and downstream column access) cannot raise KeyError.
    df = pd.DataFrame(rows, columns=columns)
    # No-op for populated frames; gives the empty frame proper numeric dtypes.
    df = df.astype({"X": "float64", "Y": "float64", "is_goal": "int64"})
    df = df.dropna(subset=["X", "Y"]).reset_index(drop=True)
    # Keep `minute` as integers whenever no surviving shot lacks it.
    if df["minute"].notna().all():
        df["minute"] = df["minute"].astype("int64")
    return df

In [None]:
# Fetch the match IDs for the configured league and season(s).
match_ids, raw_matches = get_league_match_ids(LEAGUE, SEASON)
print(f"Partidos encontrados: {len(match_ids)}")

# Download the shots of every match. A failing match is skipped so a single bad
# response does not abort the run, but it is recorded and reported instead of
# being silently discarded.
all_shots = []
failed_matches = []
with UnderstatClient() as us:
    for mid in tqdm(match_ids, desc=f"Descargando tiros {LEAGUE}"):
        try:
            all_shots.extend(fetch_match_shots(mid, us))
        except Exception as exc:
            failed_matches.append((mid, repr(exc)))
        sleep(0.15)  # small pause between requests

if failed_matches:
    print(f"Partidos con error de descarga: {len(failed_matches)} de {len(match_ids)}")
    for failed_id, reason in failed_matches:
        print(f"  {failed_id}: {reason}")

# Fail with a clear message instead of an obscure error further down.
if not all_shots:
    raise RuntimeError("No se descargó ningún tiro; revisa LEAGUE/SEASON o la conexión.")

# Convert to a DataFrame and drop penalties and direct free kicks.
# `.copy()` makes the filtered frame independent, because later cells
# modify `df` in place.
df = shots_to_df(all_shots)
df = df[~df["situation"].isin(["Penalty", "DirectFreekick"])].copy()
print(f"Tiros (open play): {len(df)}")

## 2. Modelo xG Exponencial

In [None]:
# Standardize coordinates so every shot points towards X = 1: shots recorded
# with X < 0.5 are mirrored along the pitch length.
# NOTE(review): if the data source already orients every shot towards X = 1,
# this mirror moves genuine own-half shots next to the goal — confirm.
mask = df["X"] < 0.5
df["X"] = df["X"].mask(mask, 1 - df["X"])

# Distances to the goal: along the pitch, across the pitch, and straight-line.
df["dist_long_m"] = PITCH_LENGTH_M * (1.0 - df["X"])
df["dist_lateral_m"] = PITCH_WIDTH_M * (df["Y"] - GOAL_CENTER_Y).abs()
df["dist_euclid_m"] = np.sqrt(
    df["dist_long_m"].pow(2) + df["dist_lateral_m"].pow(2)
)

In [None]:
# Bin shots by distance so the model is fitted on per-bin goal rates.
# Bins are half-open [lo, hi) because of right=False, so the last edge must be
# strictly greater than the farthest shot; extending the range by two bin
# widths guarantees that even when the maximum falls exactly on an edge.
# Any surplus empty bin is discarded by observed=True below.
bins = np.arange(0.0, df["dist_euclid_m"].max() + 2 * BIN_WIDTH, BIN_WIDTH)
df["dist_bin"] = pd.cut(df["dist_euclid_m"], bins=bins, right=False)

# Per-bin shot count, goal count and observed distance range.
bin_stats = (
    df.groupby("dist_bin", observed=True)
    .agg(
        shots=("is_goal", "size"),
        goals=("is_goal", "sum"),
        d_min=("dist_euclid_m", "min"),
        d_max=("dist_euclid_m", "max")
    )
    .dropna(subset=["d_min", "d_max"])
    .reset_index(drop=True)
)
# Representative distance: midpoint of the distances actually observed in the bin.
bin_stats["d_mid"] = 0.5 * (bin_stats["d_min"] + bin_stats["d_max"])
# Additively smoothed goal rate, which keeps sparse bins away from 0 and 1.
bin_stats["p_goal"] = (bin_stats["goals"] + ALPHA) / (bin_stats["shots"] + 2 * ALPHA)

# Keep only well-populated bins inside the configured distance range.
fit_bins = bin_stats[
    (bin_stats["shots"] >= LOW_N) &
    (bin_stats["d_mid"] >= D_MIN) &
    (bin_stats["d_mid"] <= D_MAX)
].copy()

# Exponential-decay xG curve used for the fit.
def exp_decay_xg(d, k, a, b):
    """Return ``a * exp(-d / k) + b``.

    Args:
        d: Distance, scalar or array-like.
        k: Decay length; larger values make the curve fall off more slowly.
        a: Amplitude of the exponential term.
        b: Constant offset added to the exponential term.
    """
    decay = np.exp(-d / k)
    return a * decay + b

# Fit inputs: bin mid-distance, smoothed goal rate and shot count per bin.
d_data = fit_bins["d_mid"].to_numpy()
p_data = fit_bins["p_goal"].to_numpy()
w_data = fit_bins["shots"].to_numpy()

# sigma shrinks with the shot count, so well-populated bins weigh more.
popt, _ = curve_fit(
    exp_decay_xg,
    d_data,
    p_data,
    p0=[5.0, 0.9, 0.03],
    sigma=1.0 / np.sqrt(w_data),
    maxfev=5000,
)
k_opt, a_opt, b_opt = popt

print(f"Modelo: xG = e^(-d/{k_opt:.2f}) × {a_opt:.4f} + {b_opt:.4f}")

In [None]:
# Score every shot with the fitted curve, clipped to stay strictly inside (0, 1).
df["xg"] = exp_decay_xg(df["dist_euclid_m"], k_opt, a_opt, b_opt).clip(
    lower=1e-9, upper=1 - 1e-9
)

# Venue column: "home"/"away" as an unordered categorical derived from h_a.
df["venue"] = pd.Categorical(
    df["h_a"].map({"h": "home", "a": "away"}),
    categories=["home", "away"],
    ordered=False,
)

print(f"xG calculado para {len(df)} tiros")

## 3. Inputs Poisson

In [None]:
def build_match_xg_matrix(df):
    """Build one row per match with the summed xG of the home and away sides.

    Args:
        df: Shot-level frame with ``match_id``, ``team``, ``venue``
            (``"home"``/``"away"``) and ``xg`` columns.

    Returns:
        DataFrame with columns ``match_id, home_team, home_xg, away_team,
        away_xg``. The two sides are joined with an inner merge, so a match
        with shots from only one side is left out.
    """
    per_team = df.groupby(
        ["match_id", "team", "venue"], observed=True, as_index=False
    )["xg"].sum()

    wanted = ["match_id", "team", "xg"]
    is_home = per_team["venue"] == "home"
    is_away = per_team["venue"] == "away"

    home_side = per_team.loc[is_home, wanted].rename(
        columns={"team": "home_team", "xg": "home_xg"}
    )
    away_side = per_team.loc[is_away, wanted].rename(
        columns={"team": "away_team", "xg": "away_xg"}
    )
    return pd.merge(home_side, away_side, on="match_id", how="inner")

# Build the per-match home/away xG table from the scored shots.
match_xg = build_match_xg_matrix(df)
print(f"Partidos en matriz: {len(match_xg)}")

In [None]:
def get_poisson_inputs(match_xg, home_team: str, away_team: str) -> dict:
    """Compute the six xG averages used as Poisson model inputs.

    Args:
        match_xg: Per-match frame with ``home_team``, ``home_xg``,
            ``away_team`` and ``away_xg`` columns.
        home_team: Team evaluated on its home matches only.
        away_team: Team evaluated on its away matches only.

    Returns:
        Dict of values rounded to 4 decimals, keyed by:
        ``promedio_liga_local`` / ``promedio_liga_visitante`` (league-wide
        mean home / away xG), ``local_a_favor`` / ``local_en_contra`` (home
        team's xG for / against at home) and ``visitante_a_favor`` /
        ``visitante_en_contra`` (away team's xG for / against away).
        Team values are NaN when the team has no matching rows.
    """
    def mean_or_nan(values):
        # An empty selection means there is no data for that team.
        return values.mean() if len(values) else np.nan

    at_home = match_xg.loc[match_xg["home_team"] == home_team]
    on_road = match_xg.loc[match_xg["away_team"] == away_team]

    raw = {
        "promedio_liga_local": match_xg["home_xg"].mean(),
        "promedio_liga_visitante": match_xg["away_xg"].mean(),
        "local_a_favor": mean_or_nan(at_home["home_xg"]),
        "local_en_contra": mean_or_nan(at_home["away_xg"]),
        "visitante_a_favor": mean_or_nan(on_road["away_xg"]),
        "visitante_en_contra": mean_or_nan(on_road["home_xg"]),
    }
    return {name: round(value, 4) for name, value in raw.items()}

# List every team that appears in the matrix, whether at home or away.
equipos = sorted(
    pd.concat([match_xg["home_team"], match_xg["away_team"]]).unique()
)
print("Equipos disponibles:")
for idx, nombre in enumerate(equipos):
    print(f"  {idx}: {nombre}")

In [None]:
# Example: choose a fixture and print its six Poisson inputs.
HOME = "Leeds"
AWAY = "Fulham"

inputs = get_poisson_inputs(match_xg, HOME, AWAY)

print(f"\n{HOME} vs {AWAY}")
print("=" * 40)
for name, value in inputs.items():
    print(f"{name}: {value}")