In [2]:
import numpy as np 
import pandas as pd 
from collections import defaultdict
 
import seaborn as sns 
import matplotlib.pyplot as plt 

https://ojs.aaai.org/index.php/AIIDE/article/view/5233/5089

In [3]:
df_v2 = pd.read_csv(r'C:\Users\jcmar\my_files\SportsBetting\data\entire_odds_stats_v2.csv')


In [47]:
def elo_rating(df,k, w90=400):
    
    elo_dic = defaultdict(list)
    red_elo = []
    blue_elo = []
    mu_red_col = []
    mu_blue_col = []
    for _, row in df.iterrows(): 
        red_name = row['red_fighter']
        blue_name = row['blue_fighter']

        if red_name not in elo_dic:
            elo_dic[red_name] = [1500]
        
        if blue_name not in elo_dic:
            elo_dic[blue_name] = [1500]

        prev_blue = elo_dic[blue_name][-1] #elo pre fight
        prev_red = elo_dic[red_name][-1]
        red_elo.append(prev_red)
        blue_elo.append(prev_blue)
        
        if row['winner'] == 1 and pd.notna(row['winner']):

            d = prev_red - prev_blue
            mu_red = 1 / (1 + 10**(-d/w90))
            red_new = prev_red + k * (1-mu_red) #red wins

            d = prev_blue - prev_red
            mu_blue = 1 / (1 + 10**(-d/w90))
            blue_new = prev_blue + k * (0-mu_blue) #blue loses
            
            elo_dic[red_name].append(red_new)
            elo_dic[blue_name].append(blue_new)

        if row['winner'] == 0 and pd.notna(row['winner']):

            d = prev_blue - prev_red
            mu_blue = 1 / (1 + 10**(-d/w90))
            blue_new = prev_blue + k * (1-mu_blue) #blue wins

            d = prev_red - prev_blue
            mu_red = 1 / (1 + 10**(-d/w90))
            red_new = prev_red + k * (0-mu_red) #red loses
            
            elo_dic[red_name].append(red_new)
            elo_dic[blue_name].append(blue_new)
        
        mu_red_col.append(mu_red)
        mu_blue_col.append(mu_blue)

    return np.column_stack([red_elo, blue_elo, mu_red_col, mu_blue_col])

elo_red_blue = elo_rating(df_v2, 45, 100)
df_elo = pd.DataFrame({'elo_red':elo_red_blue[:,0], 'elo_blue':elo_red_blue[:,1],
                       'mu_red':elo_red_blue[:,2], 'mu_blue':elo_red_blue[:,3], 'winner':df_v2['winner']})

df_elo['pred'] = np.where(df_elo['elo_red']>=df_elo['elo_blue'],1,0)
df_elo['pred_mu'] = np.where(df_elo['mu_red']>=df_elo['mu_blue'],df_elo['mu_red'],df_elo['mu_blue'])
df_elo['opp_mu'] = np.where(df_elo['mu_red']<=df_elo['mu_blue'],df_elo['mu_red'],df_elo['mu_blue'])

total_correct = np.sum(np.where(df_elo['pred'] == df_elo['winner'], 1,0))
accuracy = total_correct/df_elo.shape[0]
calibration = df_elo['pred_mu'].sum() / total_correct

y_true = df_elo['winner'].values  
y_pred = df_elo['pred_mu'].values  

log_loss = -(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)) / df_elo.shape[0]
df_elo['log_loss'] = log_loss
print(accuracy, calibration, np.sum(log_loss))

0.574176514961026 1.1071324407147025 0.6589784350258932


In [26]:
df_elo.tail()

Unnamed: 0,elo_red,elo_blue,mu_red,mu_blue,winner,pred
7949,1557.619656,1537.964183,0.611253,0.388747,1,1
7950,1524.448753,1536.266677,0.432387,0.567613,1,0
7951,1565.574111,1549.817127,0.589722,0.410278,0,1
7952,1576.795143,1546.669855,0.666781,0.333219,1,1
7953,1518.38075,1548.39097,0.333808,0.666192,0,0


In [None]:
import numpy as np
import pandas as pd
from itertools import product
from sklearn.metrics import log_loss, brier_score_loss
from math import log10

# ----------------------------
# Helper: expected probability
# ----------------------------
def expected_prob(r_a, r_b, scale=400.0):
    # p(A wins) = 1 / (1 + 10^((Rb - Ra)/scale))
    return 1.0 / (1.0 + 10.0 ** ((r_b - r_a) / scale))

# ----------------------------
# MOV scaling functions
# ----------------------------
def mov_linear(w):
    return max(w - 1.0, 1.0)

def mov_log(w):
    # avoid log(0)
    return np.log(150.0 * max(w - 1.0, 0.0) + 1.0)

def mov_sqrt(w):
    return np.sqrt(100.0 * max(w, 0.0))

def mov_exp(w):
    return 3.0 ** max(w, 0.0)

MOV_MAP = {
    "linear": mov_linear,
    "log": mov_log,
    "sqrt": mov_sqrt,
    "exp": mov_exp
}

# ----------------------------
# Single Elo run function
# ----------------------------
def run_elo_on_matches(matches_df,
                       base_k=20.0,
                       mov_mode="log",
                       cutoff_rating=None,
                       cutoff_k_scale=0.5,
                       w90=None,
                       regress_to_mean=0.0,
                       regress_every_n_matches=None,
                       verbose=False,
                       predict_on=None,
                       scale_override=None):
    """
    Run Elo through matches (chronological) and optionally predict on a holdout set.

    matches_df must include columns:
      - 'date' (chronological order assumed)
      - 'player_a', 'player_b' (ids)
      - 'score_a', 'score_b' (numeric, for margin)
      - 'outcome_a' (1 if A wins, else 0)  OR we compute from score

    Parameters:
      - base_k: base K factor
      - mov_mode: 'linear'|'log'|'sqrt'|'exp'
      - cutoff_rating: rating threshold above which K is scaled
      - cutoff_k_scale: multiplier when above cutoff
      - w90: rating difference corresponding to 90% win prob -> used to compute scale if provided
      - regress_to_mean: fraction to regress ratings toward global mean (0=no regression)
      - regress_every_n_matches: if not None, apply regression every N processed matches
      - predict_on: optional DataFrame of matches to collect predictions for (same schema)
      - scale_override: numeric scale used in expected_prob; if None and w90 provided, compute from w90; else default 400
    Returns:
      - preds (list of predicted probs for rows in predict_on in same order) if predict_on given
      - final_ratings dict
    """
    # copy to avoid editing
    df = matches_df.copy().reset_index(drop=True)

    # compute scale parameter
    if scale_override is not None:
        scale = float(scale_override)
    elif w90 is not None:
        p = 0.9
        denom = log10(p / (1 - p))  # ~= log10(9) ~= 0.9542
        scale = float(w90) / denom
    else:
        scale = 400.0

    mov_func = MOV_MAP.get(mov_mode)
    if mov_func is None:
        raise ValueError("mov_mode must be one of " + ", ".join(MOV_MAP.keys()))

    # ratings store
    ratings = {}
    default_rating = 1500.0
    # optional K per player (kept simple here as constant base_k, but could be per-player)
    # processed count for regression schedule
    processed = 0

    # helper to get rating
    def get_rating(player):
        return ratings.get(player, default_rating)

    # predictions storage if asked
    preds = []
    pred_ids = []

    # iterate chronologically
    for idx, row in df.iterrows():
        a = row['player_a']
        b = row['player_b']
        sa = row.get('outcome_a', None)
        if sa is None:
            sa = 1 if row['score_a'] > row['score_b'] else 0
        sb = 1 - sa

        ra = get_rating(a)
        rb = get_rating(b)

        # expected prob using scale
        pa = expected_prob(ra, rb, scale=scale)

        # margin of victory
        w = abs(row['score_a'] - row['score_b'])
        # ensure w>0 for some functions
        if w <= 0:
            w = 1.0

        k_mult = mov_func(w)

        # apply cutoff scaling if rating above threshold (apply if either or both > cutoff)
        effective_k = base_k * k_mult
        if cutoff_rating is not None:
            # if both above cutoff, scale down (you can change this rule)
            if ra >= cutoff_rating and rb >= cutoff_rating:
                effective_k *= cutoff_k_scale

        # update ratings
        delta = effective_k * (sa - pa)
        ratings[a] = ra + delta
        ratings[b] = rb - delta

        processed += 1
        # optional regression to mean periodically
        if regress_to_mean and regress_every_n_matches and (processed % regress_every_n_matches == 0):
            # regress all ratings toward mean rating
            if len(ratings) > 0:
                mean_rating = np.mean(list(ratings.values()))
                for k in list(ratings.keys()):
                    ratings[k] = ratings[k] + regress_to_mean * (default_rating - ratings[k])

    # If prediction required on a separate DataFrame (e.g., validation set), compute probs using final ratings
    if predict_on is not None:
        preds = []
        for _, row in predict_on.reset_index(drop=True).iterrows():
            a = row['player_a']; b = row['player_b']
            ra = ratings.get(a, default_rating)
            rb = ratings.get(b, default_rating)
            p = expected_prob(ra, rb, scale=scale)
            preds.append(p)
        return np.array(preds), ratings

    return None, ratings

# ------------------------------------------------------------
# Cross-validation grid search (expanding window)
# ------------------------------------------------------------
def scope_grid_search(matches_df,
                      param_grid,
                      n_splits=5,
                      metric="logloss",
                      initial_train_frac=0.2,
                      val_window_frac=0.1,
                      verbose=True):
    """
    Grid search for SCOPE-style Elo parameters using expanding time-based CV.

    matches_df:
      chronological DataFrame with columns: 'date' (or already chronological index), 'player_a','player_b','score_a','score_b'
    param_grid: dict of lists, e.g.
       {
         "base_k": [10, 20],
         "mov_mode": ["log", "sqrt"],
         "cutoff_rating": [None, 1800],
         "cutoff_k_scale": [0.5, 1.0],
         "w90": [None, 400],
         "regress_to_mean": [0.0, 0.1],
         "regress_every_n_matches": [None, 1000]
       }
    n_splits: how many expanding folds
    metric: 'logloss' or 'brier'
    Returns:
      DataFrame with param combo and mean CV metric (lower better). Sorted ascending.
    """

    df = matches_df.copy().reset_index(drop=True)
    N = len(df)
    train_start = 0
    results = []

    # build list of candidate param tuples
    keys = list(param_grid.keys())
    combos = list(product(*[param_grid[k] for k in keys]))
    total = len(combos)
    if verbose:
        print(f"Running {total} parameter combinations")

    # prepare split boundaries
    init_train = int(N * initial_train_frac)
    val_window = int(N * val_window_frac)
    if init_train < 10 or val_window < 1:
        raise ValueError("initial_train_frac or val_window_frac too small for dataset size")

    # create split start indices (expanding)
    starts = []
    step = (N - init_train - val_window) // max(1, (n_splits - 1))
    for i in range(n_splits):
        tr_end = init_train + i * step
        val_start = tr_end
        val_end = min(val_start + val_window, N)
        starts.append((0, tr_end, val_start, val_end))

    for combo_idx, combo in enumerate(combos, 1):
        params = dict(zip(keys, combo))
        cv_scores = []

        for (tr0, tr_end, val_start, val_end) in starts:
            if val_end <= val_start or tr_end <= tr0:
                continue
            train_df = df.iloc[tr0:tr_end].reset_index(drop=True)
            val_df = df.iloc[val_start:val_end].reset_index(drop=True)

            # Run Elo on training matches
            _, ratings = run_elo_on_matches(train_df,
                                            base_k=params.get("base_k", 20.0),
                                            mov_mode=params.get("mov_mode", "log"),
                                            cutoff_rating=params.get("cutoff_rating", None),
                                            cutoff_k_scale=params.get("cutoff_k_scale", 1.0),
                                            w90=params.get("w90", None),
                                            regress_to_mean=params.get("regress_to_mean", 0.0),
                                            regress_every_n_matches=params.get("regress_every_n_matches", None),
                                            verbose=False)

            # Predict probs for validation using learned ratings
            preds = []
            y_true = []
            # compute expected with scale derived from w90 or default 400
            if params.get("w90") is not None:
                p = 0.9
                denom = log10(p / (1 - p))
                scale = float(params.get("w90")) / denom
            else:
                scale = 400.0

            for _, row in val_df.iterrows():
                a = row['player_a']; b = row['player_b']
                ra = ratings.get(a, 1500.0)
                rb = ratings.get(b, 1500.0)
                preds.append(expected_prob(ra, rb, scale=scale))
                y_true.append(1 if row.get('outcome_a', None) == 1 else (1 if row['score_a'] > row['score_b'] else 0))

            preds = np.clip(np.array(preds), 1e-12, 1 - 1e-12)
            y_true = np.array(y_true)

            if metric == "logloss":
                sc = log_loss(y_true, preds)
            elif metric == "brier":
                sc = np.mean((preds - y_true) ** 2)
            else:
                raise ValueError("metric must be 'logloss' or 'brier'")

            cv_scores.append(sc)

        if len(cv_scores) == 0:
            mean_score = np.nan
        else:
            mean_score = float(np.mean(cv_scores))

        row = params.copy()
        row['cv_score'] = mean_score
        results.append(row)

        if verbose and combo_idx % max(1, total // 10) == 0:
            print(f"Combo {combo_idx}/{total} done, cv_score={mean_score:.4f}")

    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values('cv_score').reset_index(drop=True)
    return results_df

# ---------------------------
# Example usage:
# ---------------------------
# matches_df needs columns: date/player_a/player_b/score_a/score_b  (chronological order)
# param_grid = {
#   "base_k": [10, 20, 40],
#   "mov_mode": ["linear","log","sqrt"],
#   "cutoff_rating":[None, 1800],
#   "cutoff_k_scale":[0.5, 1.0],
#   "w90":[None, 300, 500],
#   "regress_to_mean":[0.0, 0.05],
#   "regress_every_n_matches":[None, 1000]
# }
# results_df = scope_grid_search(matches_df, param_grid, n_splits=5, metric="logloss")
# print(results_df.head())