In [1]:
import pandas as pd
from utils.utils import load_all_games_csv, get_teams, basic_win_prob_for_et
from elos.elo_tracker import EloTracker
import autograd.numpy as np
from autograd import grad
from sklearn.metrics import log_loss, accuracy_score
from sklearn.linear_model import LogisticRegression
from scipy.special import expit
from matplotlib import pyplot as plt
from typing import Tuple


# Win Probability Analysis

This notebook compares several methods for estimating win probabilities from Elo ratings, optionally adjusted for home advantage, travel distance, and rest days.

## Get all Games

In [16]:
# Load the cleaned game-info CSV through the project loader.
games_csv_path = '../data/gameinfo_cleaned.csv'
all_games = load_all_games_csv(games_csv_path)

  all_games = pd.read_csv(filename)


In [17]:
# Keep only games where the visitor's travel distance and both rest-day counts are present.
required_cols = ['visdistancetraveled', 'homerestdays', 'visrestdays']
all_games = all_games.dropna(subset=required_cols).copy()

In [18]:
# Cap rest days at 6 (currently disabled)
#all_games['visrestdays'] = all_games['visrestdays'].apply(lambda x: min(6,x))
#all_games['homerestdays'] = all_games['homerestdays'].apply(lambda x: min(6,x))


In [19]:
# Take cube root of distance traveled.
# The untransformed distances are stashed in 'visdistancetraveled_raw' the first
# time this cell runs, and the cube root is always computed from that column.
# Re-executing the cell therefore gives the same result instead of taking the
# cube root of an already-transformed column.
if 'visdistancetraveled_raw' not in all_games.columns:
    all_games['visdistancetraveled_raw'] = all_games['visdistancetraveled']
all_games['visdistancetraveled'] = all_games['visdistancetraveled_raw']**(1/3)

## Function for Evaluating Performance

In [20]:
def add_elos_to_games_df(games_df: pd.DataFrame, elo_prob_func=basic_win_prob_for_et) -> pd.DataFrame:
    """Return a copy of games_df with 'homeelo' and 'viselo' columns attached.

    The ratings come from an EloTracker that is fed the whole of games_df and
    uses elo_prob_func for its win probabilities.

    Args:
        games_df (pd.DataFrame): Chronologically ordered game box scores, indexed by the
            game id column 'gid'. Must contain 'hometeam' (home team), 'visteam' (away team)
            and 'homewon' (True if the home team won, False otherwise). Every game must take
            place after the games already logged for the teams it involves.
        elo_prob_func (function): Takes a home Elo, an away Elo and the game information
            (i.e. a row of the box-score dataframe) and returns the probability that the
            home team wins.

    Returns:
        pd.DataFrame: A copy of games_df with the two Elo columns added. The input
            frame itself is not modified.
    """
    with_elos = games_df.copy()  # work on a copy so the caller's frame is untouched

    team_list = get_teams(with_elos)

    # Replay the full history through the tracker to obtain every rating.
    tracker = EloTracker(team_list, elo_prob_func=elo_prob_func)
    tracker.add_history(with_elos)

    # Start both rating columns at 0.0; they are filled in per game below.
    for elo_col in ('homeelo', 'viselo'):
        with_elos[elo_col] = [0.0] * len(with_elos)

    for team in team_list:
        for record in tracker.elos_map[team]:
            # record[0] is the game id and record[2] the Elo recorded for that game.
            gid, elo = record[0], record[2]

            # A team that is not the home side of the game is treated as the visitor.
            is_home = with_elos.loc[gid, 'hometeam'] == team
            with_elos.loc[gid, 'homeelo' if is_home else 'viselo'] = elo

    return with_elos

In [21]:
def evaluate_elo_prob_func(games_df: pd.DataFrame, elo_prob_func=basic_win_prob_for_et) -> Tuple[float, float]:
    """Score an Elo probability function on games_df.

    Elo ratings are first attached with add_elos_to_games_df (using the same
    elo_prob_func), then the function is applied to every game to get the
    predicted home-win probability.

    Args:
        games_df (pd.DataFrame): Chronologically ordered game box scores, indexed by the
            game id column 'gid'. Must contain 'hometeam' (home team), 'visteam' (away team)
            and 'homewon' (True if the home team won, False otherwise). Every game must take
            place after the games already logged for the teams it involves.
        elo_prob_func (function): Takes a home Elo, an away Elo and the game information
            (i.e. a row of the box-score dataframe) and returns the probability that the
            home team wins.

    Returns:
        Tuple[float, float]: The binary cross entropy of the predicted probabilities and
            the accuracy of the probabilities rounded to 0 or 1, in that order.
    """
    scored = add_elos_to_games_df(games_df, elo_prob_func)

    def home_win_prob(game):
        # Each row carries its own ratings plus any other game info the function wants.
        return elo_prob_func(game['homeelo'], game['viselo'], game)

    scored['homewinprob'] = scored.apply(home_win_prob, axis=1)

    outcomes = scored['homewon']
    bce = log_loss(outcomes, scored['homewinprob'])
    accuracy = accuracy_score(outcomes, scored['homewinprob'].round())

    return bce, accuracy

## Evaluate Simple Probability model

In [8]:
# Baseline: the basic Elo win-probability function with no adjustments.
bce, accuracy = evaluate_elo_prob_func(games_df=all_games, elo_prob_func=basic_win_prob_for_et)
print(f"BCE: {bce}")
print(f"Accuracy: {accuracy}")

BCE: 0.6915706414224319
Accuracy: 0.5533529329413411


## With +28 Adjustment for Home Team

In [9]:
def win_prob_home_plus_28(home_elo, away_elo, game_info):
    """Basic win probability with a flat 28 Elo points added to the home team."""
    return basic_win_prob_for_et(home_elo + 28, away_elo, game_info)


bce, accuracy = evaluate_elo_prob_func(all_games, win_prob_home_plus_28)
print(f"BCE: {bce}")
print(f"Accuracy: {accuracy}")

BCE: 0.6884828083280746
Accuracy: 0.5603014606374539


## With +1.9% Adjustment for Home Team

In [10]:
def win_prob_home_scaled(home_elo, away_elo, game_info):
    """Basic win probability with the home team's Elo multiplied by 1.019 (+1.9%)."""
    return basic_win_prob_for_et(home_elo*1.019, away_elo, game_info)


bce, accuracy = evaluate_elo_prob_func(all_games, win_prob_home_scaled)
print(f"BCE: {bce}")
print(f"Accuracy: {accuracy}")

BCE: 0.6886684005841096
Accuracy: 0.5601941294507443


## With Logistic Regression

In [22]:
# The regression needs Elo ratings as an input feature, so generate them first
# with the basic (unadjusted) probability function.
df_to_fit = add_elos_to_games_df(all_games, elo_prob_func=basic_win_prob_for_et)


In [23]:
# Every feature is a visitor-minus-home difference.
df_to_fit = df_to_fit.assign(
    elodiff=lambda d: d['viselo'] - d['homeelo'],
    restdiff=lambda d: d['visrestdays'] - d['homerestdays'],
    distancediff=lambda d: d['visdistancetraveled'] - 0,  # home side's travel is taken as 0
)
#df_to_fit['homediff'] =  0 - 1

features = ['elodiff', 'distancediff', 'restdiff']
X = df_to_fit.loc[:, features].to_numpy()
y = df_to_fit['homewon'].astype(int).to_numpy()[:, None]  # column vector of 0/1 outcomes

# Scale that turns an Elo difference into a logit:
# expit(s * (viselo - homeelo)) == 1 / (1 + 10 ** ((viselo - homeelo) / 400)).
s = -np.log(10) / 400

In [24]:
# Fit via gradient descent on the mean logistic loss.
w = np.zeros((3,1))
w[0,0] = s # Elo weight starts at s, which becomes 1 once w is divided by s

step = 0.01 # Slightly higher for small gradients
iterations = 20000

# (1/n) * X.T never changes between iterations, so build it once up front.
scaled_Xt = (1/X.shape[0]) * X.T

for _ in range(iterations):

    y_hat = expit(X @ w)

    w_grad = scaled_Xt @ (y_hat - y)

    # Zero the Elo component so the Elo weight stays pinned at s.
    w_grad[0,0] = 0

    w = w - step*w_grad

w

array([[-0.00575646],
       [ 0.01704371],
       [-0.00254494]])

In [25]:
# Convert back to interpretable coefficients for individual Elo adjustments.
# Dividing by s expresses every weight in Elo points; the Elo weight, which the
# fit holds at s, becomes 1.
# The guard makes this cell safe to re-run: once the Elo weight already reads 1
# the weights have been converted, so they are left alone rather than divided
# by s a second time.
if abs(w[0, 0] - 1.0) > 1e-9:
    w = (1/s) * w
w

array([[ 1.        ],
       [-2.96079518],
       [ 0.44210084]])

In [26]:
def p(X,w):
    """Vectorised Elo win-probability curve for a tabular input.

    Each row of X is combined with the weights w into a single Elo-scale
    difference, which is then mapped through the base-10, 400-point logistic
    curve. Returns one probability per row of X.
    """
    elo_to_logit = -np.log(10) / 400
    return expit(elo_to_logit * (X @ w))

In [27]:
def predict_lr(home_elo, away_elo, game):
    """Home-win probability from the fitted weights, in elo_prob_func form.

    Builds the three visitor-minus-home features in the same order as the
    `features` list (Elo, travel distance, rest days) and scores them with the
    notebook-level weight vector `w` through `p`.

    Args:
        home_elo: Elo rating of the home team.
        away_elo: Elo rating of the visiting team.
        game: Row of game information; 'visrestdays', 'homerestdays' and
            'visdistancetraveled' are read from it.

    Returns:
        The predicted probability of a home win as a plain Python scalar.
    """
    feature_row = np.array([
        away_elo - home_elo,                         # Elo difference
        game['visdistancetraveled'] - 0,             # travel difference (home travels 0)
        game['visrestdays'] - game['homerestdays'],  # rest-day difference
    ]).reshape(1, -1)
    return p(feature_row, w).item()

In [28]:
# Score the fitted logistic-regression probability function.
bce, accuracy = evaluate_elo_prob_func(games_df=all_games, elo_prob_func=predict_lr)
print(f"BCE: {bce}")
print(f"Accuracy: {accuracy}")

BCE: 0.688622590357177
Accuracy: 0.5603455921135544
