## Setup

In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from scipy.optimize import minimize

## Functions from Model Exploration Notebook

In [2]:
def setup_params(time_decay_half_life = 240,significance_tol = 0.3,high_winning_prob = 0.7):
    eps = 1/(2*time_decay_half_life) # time decay epsilon
    max_time_interval = np.log(np.log(significance_tol)/np.log(high_winning_prob)) / eps # in days, which is how far back we'll consider
    
    return eps,max_time_interval

In [3]:
def prepare_frame(df,start_date,eps,max_time_interval,prediction_surface,filter_player_universe=True):
    # 1- Total up games won
    winner_cols = [c for c in df.columns if c[0]=="W" and any(char.isdigit() for char in c)]
    loser_cols =  [c for c in df.columns if c[0]=="L" and any(char.isdigit() for char in c)]
    df[winner_cols].fillna(0);df[loser_cols].fillna(0)
    df.loc[:,'gi']= df[winner_cols].sum(axis=1)
    df.loc[:,'gj']= df[loser_cols].sum(axis=1)
    # 2 - Add time weight col
    df['dt'] = (start_date - df['Date']).dt.days.astype('int16') # An integer amount of days
    df['time_decay'] = round(np.exp(-eps*df['dt']),2)
    # 3 - Filter players
    if filter_player_universe:
        tournament_df = df[df['Date']==start_date]
        players = set(np.concatenate([tournament_df['Winner'].values,tournament_df['Loser'].values],axis=0))
        df = df[df['Winner'].isin(players) | df['Loser'].isin(players)]
    # 4 - Fliter in time
    end_date = start_date - pd.Timedelta(days=max_time_interval)
    df = df[(df['Date']>=end_date)&(df['Date']<start_date)] # Strict ineq here important!
    # 5 - Filter cols
    df = df[['Surface','Winner','Loser','time_decay','gi','gj']]
    
    # 6 - Player dict
    unique_players = set(np.concatenate([df['Winner'].values,df['Loser'].values],axis=0))
    n = len(unique_players)
    player_dict = {}
    for i,player in enumerate(unique_players):
        player_dict[player] = i
    # 7 - surface dict
    if prediction_surface == 'Grass':
        surface_dict = defaultdict(lambda: 1, {'Clay':0,'Grass':-1})
    elif prediction_surface != 'Clay':
        surface_dict = defaultdict(lambda: -1, {'Clay':0,'Grass':1})
    else:
        surface_dict = defaultdict(lambda: 1, {'Clay':-1,'Grass':0})
    surface_dict # The prediction surface should have key -1
    # 8 - Mapping function
    def map_df_row_to_keys(row,player_dict,surface_dict):
        row['Winner'] = player_dict[row['Winner']]
        row['Loser'] = player_dict[row['Loser']]
        row['Surface'] = surface_dict[row['Surface']]
        row['Surface_mult'] = 0 if row['Surface'] == -1 else 1
        return row
    # 9 - Mapping and data typing
    formatted_df = df[[c for c in df.columns if 'Rank' not in c]].apply(lambda x : map_df_row_to_keys(x,player_dict,surface_dict),axis=1)
    formatted_df.loc[:,formatted_df.columns != 'time_decay'] = formatted_df.loc[:,formatted_df.columns != 'time_decay'].astype(int) # So we can use the entries as list indecies later on

    return formatted_df,player_dict,n

In [4]:
def log_lilkihood(x,n,df):
    """
    x - a (3n,1) dimentional array with cols [alpha,W]
    n - an integer
    df - an (n,8) df
    """
    # Return -log_lilkihood, so we can max log_lilkihood by minimising -log_lilkihood
    df['aiwi'] = x[df['Winner']] * ((x[n+2*df['Winner']+df['Surface']]) ** df['Surface_mult'])
    df['ajwj'] = x[df['Loser']] *  ((x[n+2*df['Loser']+df['Surface']]) ** df['Surface_mult'])

    return -sum(df['time_decay']*(df['gi']*df['aiwi']+df['gj']*df['ajwj']-(df['gi']+df['gj'])*np.log(np.exp(df['aiwi'])+np.exp(df['ajwj']))))

## Importing Data

In [5]:
mens_df = pd.read_csv('../data/mens.csv',header=0,parse_dates=["Date"])
womens_df = pd.read_csv('../data/womens.csv',header=0,parse_dates=["Date"])

# Remove walkovers
mens_df = mens_df[mens_df['Comment']!='Walkover']
womens_df = womens_df[womens_df['Comment']!='Walkover']

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


## Model Fitting

In [6]:
eps,max_time_interval = setup_params()
print("We'll consider ",max_time_interval/365," years worth of data")

We'll consider  1.5998560333428016  years worth of data


In [7]:
player_alphas = {}

In [8]:
%%time
for date in set(mens_df['Date'][mens_df['Date'] > pd.to_datetime('09/01/2020')]): #nb date in mm dd yyy format
    surf_lst = list(mens_df['Surface'].values)
    prediction_surface =  max(set(surf_lst), key=surf_lst.count) # Take most common surface
    
    formatted_df,player_dict,n = prepare_frame(mens_df,date,eps,max_time_interval,'Clay')
    
    x0=np.ones((3*n))
    bds = [(0.1,2.5)]*n + [(0.1,1.5)]*(2*n)
    res = minimize(log_lilkihood,x0=x0,args=(n,formatted_df),bounds=bds,options={'disp':False,'maxiter':50})
    
    alpha = res.x[:n]
    player_rankings = {}
    for key, value in player_dict.items():
        player_rankings[key]=alpha[value]
    player_alphas[date]=player_rankings
    print("Completed fitting for ",date)

Completed fitting for  2020-09-27 00:00:00
Completed fitting for  2020-09-05 00:00:00
Completed fitting for  2020-10-26 00:00:00
Completed fitting for  2020-11-09 00:00:00
Completed fitting for  2020-11-06 00:00:00
Completed fitting for  2020-09-17 00:00:00
Completed fitting for  2020-10-21 00:00:00
Completed fitting for  2020-09-12 00:00:00
Completed fitting for  2020-09-04 00:00:00
Completed fitting for  2020-10-09 00:00:00
Completed fitting for  2020-09-06 00:00:00
Completed fitting for  2020-11-12 00:00:00
Completed fitting for  2020-10-25 00:00:00
Completed fitting for  2020-10-16 00:00:00
Completed fitting for  2020-09-02 00:00:00
Completed fitting for  2020-10-27 00:00:00
Completed fitting for  2020-09-28 00:00:00
Completed fitting for  2020-11-13 00:00:00
Completed fitting for  2020-09-14 00:00:00
Completed fitting for  2020-10-22 00:00:00
Completed fitting for  2020-09-11 00:00:00
Completed fitting for  2020-09-22 00:00:00
Completed fitting for  2020-10-04 00:00:00
Completed f

KeyboardInterrupt: 

In [9]:
for date in set(mens_df['Date'][mens_df['Date'] > pd.to_datetime('09/01/2020')]):
    print(date)

2020-09-27 00:00:00
2020-09-05 00:00:00
2020-10-26 00:00:00
2020-11-09 00:00:00
2020-11-06 00:00:00
2020-09-17 00:00:00
2020-10-21 00:00:00
2020-09-12 00:00:00
2020-09-04 00:00:00
2020-10-09 00:00:00
2020-09-06 00:00:00
2020-11-12 00:00:00
2020-10-25 00:00:00
2020-10-16 00:00:00
2020-09-02 00:00:00
2020-10-27 00:00:00
2020-09-28 00:00:00
2020-11-13 00:00:00
2020-09-14 00:00:00
2020-10-22 00:00:00
2020-09-11 00:00:00
2020-09-22 00:00:00
2020-10-04 00:00:00
2020-10-05 00:00:00
2020-10-30 00:00:00
2020-11-01 00:00:00
2020-09-15 00:00:00
2020-09-03 00:00:00
2020-09-26 00:00:00
2020-09-25 00:00:00
2020-10-13 00:00:00
2020-10-15 00:00:00
2020-10-06 00:00:00
2020-10-31 00:00:00
2020-09-07 00:00:00
2020-09-24 00:00:00
2020-10-18 00:00:00
2020-11-17 00:00:00
2020-11-19 00:00:00
2020-11-15 00:00:00
2020-09-13 00:00:00
2020-10-23 00:00:00
2020-11-10 00:00:00
2020-11-07 00:00:00
2020-10-03 00:00:00
2020-10-12 00:00:00
2020-10-24 00:00:00
2020-10-02 00:00:00
2020-10-17 00:00:00
2020-09-18 00:00:00


In [12]:
import pickle

# Store data (serialize)
with open('mens_alphas_sep_2020.pickle', 'wb') as handle:
    pickle.dump(player_alphas, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [13]:
# Load data (deserialize)
with open('mens_alphas_sep_2020.pickle', 'rb') as handle:
    unserialized_data = pickle.load(handle)