# Jeke Bayesian Model Prototype

Notebook base con las dependencias del proyecto.

In [None]:
from understatapi import UnderstatClient
import pandas as pd
import numpy as np
from time import sleep
from tqdm.auto import tqdm
from pprint import pprint

import matplotlib.pyplot as plt
import pymc as pm
import arviz as az
from scipy.stats import poisson

In [None]:
# Configuration
# League identifier passed to get_league_matches; the trailing comment lists the alternatives.
LEAGUE = "EPL"    # "EPL", "La_Liga", "Bundesliga", "Serie_A", "Ligue_1", "RFPL"
# Seasons to download; get_league_matches issues one request per entry.
SEASON = ["2022", "2023", "2024", "2025"]

In [None]:
def get_league_matches(league: str, seasons: list[str]):
    """Download the match data of one league for several seasons.

    Args:
        league: League identifier forwarded to ``UnderstatClient.league``.
        seasons: Season identifiers; one ``get_match_data`` call is made per
            entry, in the given order.

    Returns:
        A single flat list holding the matches of every requested season,
        concatenated in request order.
    """
    collected = []
    with UnderstatClient() as client:
        for year in seasons:
            season_matches = client.league(league=league).get_match_data(season=year)
            # In-place concatenation keeps one flat list across seasons.
            collected += season_matches
    return collected

# Download every configured season and print one raw record to inspect its structure.
raw_matches = get_league_matches(LEAGUE, SEASON)
print(f"Total de partidos: {len(raw_matches)}")
# NOTE: indexing [0] raises IndexError when nothing was downloaded.
pprint(raw_matches[0])

In [None]:
def format_fixtures(matches: list[dict]) -> pd.DataFrame:
    """Convert raw match records into a date-sorted table of played fixtures.

    Records whose ``"isResult"`` entry is missing or falsy are skipped, so
    their goal fields are never read.

    Args:
        matches: Raw match dicts. Each played match must provide
            ``"datetime"``, ``"goals"`` (with ``"h"`` and ``"a"``), and
            ``"h"`` / ``"a"`` dicts carrying a ``"title"``.

    Returns:
        A DataFrame with the columns ``date``, ``home_team``, ``away_team``,
        ``yg1`` (home goals), ``yg2`` (away goals) and ``result``
        (``"H"``, ``"D"`` or ``"A"``), sorted by ``date`` with a fresh
        0..n-1 index. The same columns are returned, with zero rows, when
        no played match is present.
    """
    columns = ["date", "home_team", "away_team", "yg1", "yg2", "result"]

    rows = []
    for m in matches:
        if not m.get("isResult"):
            continue

        h_goals = int(m["goals"]["h"])
        a_goals = int(m["goals"]["a"])

        if h_goals > a_goals:
            result = "H"
        elif h_goals < a_goals:
            result = "A"
        else:
            result = "D"

        rows.append({
            "date": m["datetime"],
            "home_team": m["h"]["title"],
            "away_team": m["a"]["title"],
            "yg1": h_goals,
            "yg2": a_goals,
            "result": result,
        })

    # Naming the columns explicitly keeps the schema when `rows` is empty;
    # without it the frame has no "date" column and the next line raises KeyError.
    df = pd.DataFrame(rows, columns=columns)
    df["date"] = pd.to_datetime(df["date"])
    return df.sort_values("date").reset_index(drop=True)

# Keep only the played matches, as a date-sorted DataFrame, and preview the first rows.
fixtures = format_fixtures(raw_matches)
print(f"Partidos jugados: {len(fixtures)}")
fixtures.head()

In [None]:
# Build the team index from every club that appears in EITHER column.
# Using only `home_team` would leave out a club that has so far played only
# away, and the inner merge on `away_team` below would then silently drop
# its matches.
teams = (
    pd.concat([fixtures["home_team"], fixtures["away_team"]], ignore_index=True)
    .drop_duplicates()
    .sort_values()
    .reset_index(drop=True)
    .to_frame("team")
)
n_teams = len(teams)
# Alphabetical position doubles as the integer id used to index model parameters.
teams["team_index"] = np.arange(n_teams)

# Attach the integer ids of the home side (`hg`) and the away side (`ag`) to each fixture.
df = (
    fixtures
    .merge(teams, left_on="home_team", right_on="team")
    .rename(columns={"team_index": "hg"})
    .drop(["team"], axis=1)
    .merge(teams, left_on="away_team", right_on="team")
    .rename(columns={"team_index": "ag"})
    .drop(["team"], axis=1)
    .sort_values("date")
)

print(f"Equipos: {n_teams}")
print(teams.to_string())
print()
df.head()

In [None]:
# Number of rows at the end of `df` held out for evaluation.
# NOTE: must be > 0; with 0 the slice `[:-0]` would leave `train` empty.
TEST_SIZE = 50

# `df` was sorted by date when it was built, so the hold-out is the last TEST_SIZE rows.
train = df.iloc[:-TEST_SIZE]
test = df.iloc[-TEST_SIZE:]

# Plain arrays consumed by the model: observed goals and the team ids of each training match.
goals_home_obs = train["yg1"].values
goals_away_obs = train["yg2"].values
home_team = train["hg"].values
away_team = train["ag"].values

print(f"Train: {len(train)} partidos")
print(f"Test: {len(test)} partidos")

In [None]:
import pytensor.tensor as pt

# Poisson model of home and away goals with per-team attack and defence
# effects and a single home-advantage term.
with pm.Model() as model:
    # Home-advantage term (only enters the home scoring rate); pm.Flat gives it a flat prior.
    home = pm.Flat("home")
    
    # Raw attack effects: zero-mean normals sharing one precision `tau_att`,
    # which itself has a Gamma(0.1, 0.1) prior.
    tau_att = pm.Gamma("tau_att", 0.1, 0.1)
    atts_star = pm.Normal("atts_star", mu=0, tau=tau_att, shape=n_teams)
    
    # Raw defence effects, built the same way with their own precision `tau_def`.
    tau_def = pm.Gamma("tau_def", 0.1, 0.1)
    def_star = pm.Normal("def_star", mu=0, tau=tau_def, shape=n_teams)
    
    # Centre both sets of effects by subtracting their mean, so each set averages to zero.
    atts = pm.Deterministic("atts", atts_star - pt.mean(atts_star))
    defs = pm.Deterministic("defs", def_star - pt.mean(def_star))
    
    # Log-linear scoring rates: a side's attack effect plus the opponent's defence effect;
    # the home side additionally gets `home`.
    home_theta = pt.exp(home + atts[home_team] + defs[away_team])
    away_theta = pt.exp(atts[away_team] + defs[home_team])
    
    # Likelihood: observed training goals are Poisson with the rates above.
    home_goals = pm.Poisson("home_goals", mu=home_theta, observed=goals_home_obs)
    away_goals = pm.Poisson("away_goals", mu=away_theta, observed=goals_away_obs)

In [None]:
# Sample the posterior: 2000 draws with 1000 tuning steps, using 4 cores,
# returned as an InferenceData object.
# NOTE(review): no random seed is passed, so results differ between runs.
with model:
    trace = pm.sample(2000, tune=1000, cores=4, return_inferencedata=True)

In [None]:
# Trace plot of the home-advantage parameter.
az.plot_trace(trace, var_names=["home"])
plt.tight_layout()

In [None]:
# Attack ratings per team: posterior median with HDI error bars, lowest median first.
atts_samples = trace.posterior["atts"].values.reshape(-1, n_teams)

# One row per team index, holding the HDI bounds returned by az.hdi.
atts_df = pd.DataFrame(
    az.hdi(trace, var_names=["atts"])["atts"].values,
    columns=["lower_hdi", "upper_hdi"],
)
atts_df["median"] = np.median(atts_samples, axis=0)

# Swap the integer index for the team name.
atts_df = (
    atts_df
    .merge(teams, left_index=True, right_on="team_index")
    .drop(columns=["team_index"])
    .rename(columns={"team": "Team"})
)

# errorbar takes distances from the plotted point, not absolute interval bounds.
atts_df["lower"] = atts_df["median"] - atts_df["lower_hdi"]
atts_df["upper"] = atts_df["upper_hdi"] - atts_df["median"]
atts_df = atts_df.sort_values("median")

plt.figure(figsize=(8, 10))
plt.errorbar(
    atts_df["median"],
    atts_df["Team"],
    xerr=atts_df[["lower", "upper"]].to_numpy().T,
    fmt="o",
)
plt.xlabel("Attack Rating")
plt.title("Attack Ratings by Team")
plt.tight_layout()

In [None]:
# Defence ratings per team: posterior median with HDI error bars, highest median first.
defs_samples = trace.posterior["defs"].values.reshape(-1, n_teams)

# One row per team index, holding the HDI bounds returned by az.hdi.
defs_df = pd.DataFrame(
    az.hdi(trace, var_names=["defs"])["defs"].values,
    columns=["lower_hdi", "upper_hdi"],
)
defs_df["median"] = np.median(defs_samples, axis=0)

# Swap the integer index for the team name.
defs_df = (
    defs_df
    .merge(teams, left_index=True, right_on="team_index")
    .drop(columns=["team_index"])
    .rename(columns={"team": "Team"})
)

# errorbar takes distances from the plotted point, not absolute interval bounds.
defs_df["lower"] = defs_df["median"] - defs_df["lower_hdi"]
defs_df["upper"] = defs_df["upper_hdi"] - defs_df["median"]
defs_df = defs_df.sort_values("median", ascending=False)

plt.figure(figsize=(8, 10))
plt.errorbar(
    defs_df["median"],
    defs_df["Team"],
    xerr=defs_df[["lower", "upper"]].to_numpy().T,
    fmt="o",
)
plt.xlabel("Defence Rating")
plt.title("Defence Ratings by Team")
plt.tight_layout()

In [None]:
def goal_expectation(trace, home_team_id, away_team_id):
    """Return the expected goals of both sides for one fixture.

    The rates are plug-in estimates: each parameter is first averaged over
    all posterior samples, and the averages are then combined and
    exponentiated.

    Args:
        trace: Object whose ``posterior`` mapping provides ``"home"``,
            ``"atts"`` and ``"defs"`` entries exposing a ``.values`` array.
            The last axis of ``atts`` and ``defs`` is indexed by team.
        home_team_id: Integer index of the home team on that last axis.
        away_team_id: Integer index of the away team on that last axis.

    Returns:
        A ``(home_theta, away_theta)`` tuple of expected goals.
    """
    posterior = trace.posterior
    home = posterior["home"].values.mean()

    # Collapse all leading sampling axes into one. The team count comes from
    # the arrays themselves rather than from a notebook-level global, so the
    # function works for any trace regardless of surrounding state.
    atts = posterior["atts"].values
    atts = atts.reshape(-1, atts.shape[-1])
    defs = posterior["defs"].values
    defs = defs.reshape(-1, defs.shape[-1])

    atts_home = atts[:, home_team_id].mean()
    atts_away = atts[:, away_team_id].mean()
    defs_home = defs[:, home_team_id].mean()
    defs_away = defs[:, away_team_id].mean()

    # A side's rate combines its own attack with the opponent's defence;
    # only the home side receives the home-advantage term.
    home_theta = np.exp(home + atts_home + defs_away)
    away_theta = np.exp(atts_away + defs_home)

    return home_theta, away_theta

In [None]:
def win_draw_loss(home_expectation, away_expectation, max_goals=10):
    """Turn two expected-goal rates into home-win, draw and away-win probabilities.

    Each side's goals are treated as an independent Poisson variable and the
    score grid is truncated at ``max_goals`` goals per side.

    Args:
        home_expectation: Poisson rate of the home side.
        away_expectation: Poisson rate of the away side.
        max_goals: Largest goal count per side included in the grid.

    Returns:
        A ``(home_win, draw, away_win)`` tuple of probabilities.
    """
    goal_counts = range(max_goals + 1)
    home_pmf = poisson.pmf(goal_counts, home_expectation)
    away_pmf = poisson.pmf(goal_counts, away_expectation)

    # score_grid[i, j] = P(home scores i) * P(away scores j)
    score_grid = np.outer(home_pmf, away_pmf)

    # Below the diagonal the home side has more goals, above it the away side.
    p_home = np.sum(np.tril(score_grid, -1))
    p_draw = np.sum(np.diag(score_grid))
    p_away = np.sum(np.triu(score_grid, 1))
    return p_home, p_draw, p_away

In [None]:
# Example fixture: expected goals, outcome probabilities and the reciprocal
# of each probability (printed in the "Cuota" column).
home_team_name = "Liverpool"
away_team_name = "Manchester City"

# Translate the club names into the integer indices stored in `teams`.
home_id = teams.loc[teams["team"] == home_team_name, "team_index"].values[0]
away_id = teams.loc[teams["team"] == away_team_name, "team_index"].values[0]

h_exp, a_exp = goal_expectation(trace, home_id, away_id)
h_win, draw, a_win = win_draw_loss(h_exp, a_exp)

# One print call, one line of the report per argument.
print(
    f"{home_team_name} vs {away_team_name}",
    f"Goles esperados: {h_exp:.2f} - {a_exp:.2f}",
    f"",
    f"{'Resultado':<10} {'Prob':>8} {'Cuota':>8}",
    f"{'-'*28}",
    f"{'Local':<10} {h_win:>7.1%} {1/h_win:>8.2f}",
    f"{'Empate':<10} {draw:>7.1%} {1/draw:>8.2f}",
    f"{'Visitante':<10} {a_win:>7.1%} {1/a_win:>8.2f}",
    sep="\n",
)

In [None]:
def rps(predictions, outcome):
    """Return the Ranked Probability Score of one forecast.

    The cumulative forecast is compared with the cumulative step function of
    the observed outcome; the summed squared differences are divided by the
    number of outcomes minus one. For three outcomes this is the division by
    2 used for home/draw/away forecasts.

    Args:
        predictions: Probabilities of the ordered outcomes (at least two).
        outcome: Zero-based position of the outcome that occurred.

    Returns:
        The score as a float; 0 is a perfect forecast.
    """
    cumulative_pred = np.cumsum(np.asarray(predictions, dtype=float))
    # Sized from the forecast instead of a fixed 3, so any number of ordered
    # outcomes is supported.
    cumulative_actual = np.zeros_like(cumulative_pred)
    cumulative_actual[outcome:] = 1
    return np.sum((cumulative_pred - cumulative_actual) ** 2) / (cumulative_pred.size - 1)

def calculate_rps(trace, df):
    """Average the Ranked Probability Score over every match in ``df``.

    Args:
        trace: Posterior samples, forwarded to ``goal_expectation``.
        df: Matches to score. Uses the columns ``result`` (``"H"``, ``"D"``,
            anything else counts as an away win), ``hg`` and ``ag`` (team
            indices of the home and away side).

    Returns:
        The mean of the per-match scores.
    """
    # Position of each result in the (home, draw, away) ordering; any label
    # not listed here falls back to the away slot.
    outcome_index = {"H": 0, "D": 1}

    scores = []
    for result, home_idx, away_idx in zip(df["result"], df["hg"], df["ag"]):
        home_rate, away_rate = goal_expectation(trace, home_idx, away_idx)
        forecast = win_draw_loss(home_rate, away_rate)
        scores.append(rps(forecast, outcome_index.get(result, 2)))

    return np.mean(scores)

# Score the model on the held-out matches.
rps_score = calculate_rps(trace, test)
print(f"RPS promedio en test: {rps_score:.4f}")