In [1]:
import sys

# Make the project's source directories importable from this notebook; the
# relative paths are resolved against the kernel's working directory.
sys.path.extend(['../src/', '../src/sackmann'])

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss, accuracy_score
from tennisMatchProbability import matchProb
from data_functions import *

# Seasons that the per-year evaluation loops iterate over.
years = list(range(2014, 2017))

# Match-level table that every later cell filters and reads columns from.
df = pd.read_csv(filepath_or_buffer='../match_data_constructed/match_df_06_15_2021.csv')

# Results

In [3]:
# Per-season RMSE between each model's predicted serve percentage and the
# observed serve percentage, pooling the p0 and p1 sides of every match.
# NOTE(review): '*_s_pct_obsv' is presumably the serve-points-won share observed
# in the match itself -- confirm against the data-construction code.

# Tournaments left out of the evaluation.
excluded_tournaments = ['ATP Challenger Tour Finals CH', 'Davis Cup']

# Column suffixes of the serve-percentage estimates being scored; each one exists
# in the frame as both 'p0_<suffix>' and 'p1_<suffix>'.
sr_cols = ['s_kls','s_kls_EM','s_sf_kls', 's_sf_kls_EM','s_adj_kls','s_adj_kls_EM','s_kls_elo',
           's_commop_kls', 'commop_s_pct', 's_baseline']

for year in years:
    sub_df = df[df['match_year'] == year]
    sub_df = sub_df[~sub_df['tny_name'].isin(excluded_tournaments)]

    print('year: {}, (n={})'.format(year, sub_df.shape[0]))

    # The observed values do not depend on the model column, so build them once
    # per season rather than once per column.
    s_true = np.concatenate([sub_df['p0_s_pct_obsv'], sub_df['p1_s_pct_obsv']], axis=0)

    for col in sr_cols:
        print('\t', col)
        s_pred = np.concatenate([sub_df['p0_'+col], sub_df['p1_'+col]], axis=0)
        # root-mean-squared error over both players' predictions
        print('\t', np.mean((s_true - s_pred)**2)**.5)

year: 2014, (n=2488)
	 s_kls
	 0.08462678746748534
	 s_kls_EM
	 0.08231875070296447
	 s_sf_kls
	 0.08829716008831531
	 s_sf_kls_EM
	 0.0849889515700743
	 s_adj_kls
	 0.08289654745245684
	 s_adj_kls_EM
	 0.08208069260261054
	 s_kls_elo
	 0.08041106195022933
	 s_commop_kls
	 0.12566877851999825
	 commop_s_pct
	 0.09566609908643872
	 s_baseline
	 0.09186777825841203
year: 2015, (n=2540)
	 s_kls
	 0.0915845182670169
	 s_kls_EM
	 0.09089582160626646
	 s_sf_kls
	 0.09679856604461995
	 s_sf_kls_EM
	 0.09438843716535182
	 s_adj_kls
	 0.08982010201972893
	 s_adj_kls_EM
	 0.08941681032571608
	 s_kls_elo
	 0.08904113564848239
	 s_commop_kls
	 0.13463850796475682
	 commop_s_pct
	 0.10456241007094708
	 s_baseline
	 0.10146801504487638
year: 2016, (n=2594)
	 s_kls
	 0.08450666147881826
	 s_kls_EM
	 0.08232768041652179
	 s_sf_kls
	 0.09042685777028134
	 s_sf_kls_EM
	 0.0871968706792498
	 s_adj_kls
	 0.08247261322102856
	 s_adj_kls_EM
	 0.08182700371862718
	 s_kls_elo
	 0.07976120836969487
	 s_commop_

In [4]:
# Per-season accuracy and log loss of every match-win probability column.
for year in years:
    sub_df = df[df['match_year'] == year]
    # Drop the tournaments that are excluded from the evaluation.
    for excluded_name in ('ATP Challenger Tour Finals CH', 'Davis Cup'):
        sub_df = sub_df[sub_df['tny_name'] != excluded_name]

    print('year: {}, (n={})'.format(year, len(sub_df)))

    prob_cols = [
        'match_prob_kls',
        'match_prob_kls_EM',
        'match_prob_sf_kls',
        'match_prob_sf_kls_EM',
        'match_prob_adj_kls',
        'match_prob_adj_kls_EM',
        'match_prob_commop_kls',
        'elo_prob',
        'match_prob_commop',
    ]

    # Binary target scored against each probability column.
    # NOTE(review): presumably 'winner' is 0 when p0 won, so 1 - winner flags a
    # p0 win and the probability columns are P(p0 wins) -- confirm.
    y_true = 1 - sub_df['winner']

    for col in prob_cols:
        y_prob = sub_df[col]
        print('\t', col)
        # a prediction counts as "p0 wins" when the probability is at least 0.5
        print('\t', 'accuracy score: ', accuracy_score(y_true, y_prob >= .5))
        print('\t', 'log loss: ', log_loss(y_true, y_prob))
        

year: 2014, (n=2488)
	 match_prob_kls
	 accuracy score:  0.6483118971061094
	 log loss:  0.6481506218548038
	 match_prob_kls_EM
	 accuracy score:  0.6563504823151125
	 log loss:  0.6127666447137045
	 match_prob_sf_kls
	 accuracy score:  0.6330385852090032
	 log loss:  0.7053562253758375
	 match_prob_sf_kls_EM
	 accuracy score:  0.6346463022508039
	 log loss:  0.6280437730984915
	 match_prob_adj_kls
	 accuracy score:  0.6780546623794212
	 log loss:  0.6372823008893149
	 match_prob_adj_kls_EM
	 accuracy score:  0.677652733118971
	 log loss:  0.6246317083827804
	 match_prob_commop_kls
	 accuracy score:  0.6567524115755627
	 log loss:  0.9933887463130496
	 elo_prob
	 accuracy score:  0.6913183279742765
	 log loss:  0.5891738699332688
	 match_prob_commop
	 accuracy score:  0.6358520900321544
	 log loss:  0.6281730637340442
year: 2015, (n=2540)
	 match_prob_kls
	 accuracy score:  0.6503937007874016
	 log loss:  0.6335279074306568
	 match_prob_kls_EM
	 accuracy score:  0.6519685039370079
	 lo

# Case Studies

In [5]:
# Columns displayed for each case-study match, in display order.
# NOTE(review): the group labels below are inferred from the column names only --
# confirm against the code that builds the match table.
columns = [
    # raw serve / return point counts ('52' presumably = trailing 52 weeks)
    'p0_52_swon', 'p0_52_svpt', 'p0_52_rwon', 'p0_52_rpt',
    'p1_52_swon', 'p1_52_svpt', 'p1_52_rwon', 'p1_52_rpt',
    # serve estimates (kls)
    'p0_s_kls', 'p1_s_kls',
    # serve / return percentages, plain and EM variants
    'p0_s_pct', 'p0_r_pct', 'p0_s_pct_EM', 'p0_r_pct_EM',
    'p1_s_pct', 'p1_r_pct', 'p1_s_pct_EM', 'p1_r_pct_EM',
    # serve estimates (kls, EM variant)
    'p0_s_kls_EM', 'p1_s_kls_EM',
    # adjusted serve stats and the estimates built on them
    'p0_52_s_adj', 'p1_52_s_adj',
    'p0_s_adj_kls', 'p1_s_adj_kls',
    # Elo ratings
    'p0_elo_538', 'p1_elo_538',
    # match context
    'avg_52_s', 'tny_stats', 'tny_name',
]

## Part 1: Daniel Elahi vs. Ivo Karlovic (Bogota)

In [6]:
# Pick out the row(s) where Daniel Elahi is p0 and Ivo Karlovic is p1, then show
# the case-study columns for them.
is_p0_elahi = df['p0_name'].eq('Daniel Elahi')
is_p1_karlovic = df['p1_name'].eq('Ivo Karlovic')
current_match_index = is_p0_elahi & is_p1_karlovic
subDf = df.loc[current_match_index]
subDf.loc[:, columns]

Unnamed: 0,p0_52_swon,p0_52_svpt,p0_52_rwon,p0_52_rpt,p1_52_swon,p1_52_svpt,p1_52_rwon,p1_52_rpt,p0_s_kls,p1_s_kls,...,p1_s_kls_EM,p0_52_s_adj,p1_52_s_adj,p0_s_adj_kls,p1_s_adj_kls,p0_elo_538,p1_elo_538,avg_52_s,tny_stats,tny_name
155344,51.0,64.0,22.0,67.0,3516.0,4654.0,1409.0,4903.0,0.892543,0.810164,...,0.766265,0.183006,0.192259,0.851085,0.869031,1585.931324,1952.853601,0.642253,0.66755,Bogota


In [7]:
# EM serve / return percentages for both players, taken from the first row of
# the selected match: (p0 serve, p0 return, p1 serve, p1 return).
tuple(
    subDf[em_col].values[0]
    for em_col in ('p0_s_pct_EM', 'p0_r_pct_EM', 'p1_s_pct_EM', 'p1_r_pct_EM')
)

(0.659894562319806,
 0.36482642219914857,
 0.7480477942211473,
 0.29348880588928805)

In [8]:
# Match probability from the plain and the EM serve estimates.
# NOTE(review): the second argument is 1 - (p1's serve estimate), presumably p0's
# chance of winning a return point -- confirm against matchProb's signature.
for serve_suffix in ('s_kls', 's_kls_EM'):
    p0_serve = subDf['p0_' + serve_suffix].values[0]
    p1_serve = subDf['p1_' + serve_suffix].values[0]
    print(matchProb(p0_serve, 1 - p1_serve))

# Elo expected score for p0: 1 / (1 + 10 ** ((elo_p1 - elo_p0) / 400)).
p0_elo = subDf['p0_elo_538'].values[0]
p1_elo = subDf['p1_elo_538'].values[0]
d = (p1_elo - p0_elo) / 400
print((1 + 10 ** (d)) ** (-1))

0.809452578860394
0.427671812550951
0.10791915170229861


## Part 2: Mikhail Youzhny vs. Nick Kyrgios (US Open)

In [9]:
# Pick out the row(s) where Mikhail Youzhny is p0 and Nick Kyrgios is p1, then
# show the case-study columns for them.
wanted_players = {'p0_name': 'Mikhail Youzhny', 'p1_name': 'Nick Kyrgios'}
current_match_index = (
    (df['p0_name'] == wanted_players['p0_name'])
    & (df['p1_name'] == wanted_players['p1_name'])
)
subDf = df[current_match_index]
subDf[columns]

Unnamed: 0,p0_52_swon,p0_52_svpt,p0_52_rwon,p0_52_rpt,p1_52_swon,p1_52_svpt,p1_52_rwon,p1_52_rpt,p0_s_kls,p1_s_kls,...,p1_s_kls_EM,p0_52_s_adj,p1_52_s_adj,p0_s_adj_kls,p1_s_adj_kls,p0_elo_538,p1_elo_538,avg_52_s,tny_stats,tny_name
152904,1828.0,2960.0,1145.0,2947.0,900.0,1370.0,424.0,1323.0,0.644558,0.615878,...,0.612827,0.050583,0.106202,0.627976,0.641336,1941.650048,1931.066267,0.637322,0.622117,US Open


In [10]:
# Match probability from the plain serve estimates versus the adjusted ones,
# both read from the first row of the selected match.
first_row = {name: subDf[name].values[0]
             for name in ('p0_s_kls', 'p1_s_kls', 'p0_s_adj_kls', 'p1_s_adj_kls')}

print(matchProb(first_row['p0_s_kls'], 1 - first_row['p1_s_kls']))
print(matchProb(first_row['p0_s_adj_kls'], 1 - first_row['p1_s_adj_kls']))

0.6410482694317813
0.433449491719355


In [11]:
## Replays the matches in `df` in row order, keeping one `adj_stats_52` tracker per
## player, and stops at the first row where Mikhail Youzhny is p0 and Nick Kyrgios
## is p1 *before* that match is folded into the trackers. After this cell,
## `players_stats` therefore holds each player's state as it stood going into
## that match.
start_ind=0
# player name -> adj_stats_52 tracker
players_stats = {}
# [player slot (0 = p0, 1 = p1)][row][adj_sr entry]; adj_sr[0] is combined with the
# serve average and adj_sr[1] with the return average further down.
match_52_stats = np.zeros([2,len(df),2])

w_l = ['p0','p1']
# NOTE(review): `i` is the DataFrame index *label* yielded by iterrows() but is
# used below as a positional index into match_52_stats -- this assumes `df` has
# the default 0..n-1 index; confirm.
for i, row in df.loc[start_ind:].iterrows():
    
    # NOTE(review): `surface` is assigned but never read in this cell.
    surface = row['surface']
    date = row['match_year'],row['match_month']
    avg_52_s, avg_52_r = row['avg_52_s'],row['avg_52_r']
    # per-slot stats tuple for this match; stays [] for a slot that fails validate()
    match_stats = [[],[]]

    # add new players to the dictionary
    for k,label in enumerate(w_l):
        if row[label+'_name'] not in players_stats:
            players_stats[row[label+'_name']] = adj_stats_52(date)

    # store pre-match adj stats
    for k,label in enumerate(w_l):
        players_stats[row[label+'_name']].set_month(date)

        # fill in player's adjusted stats prior to start of match
        match_52_stats[k][i] = players_stats[row[label+'_name']].adj_sr
        # update serving stats if not null
        if validate(row, label):
            # this player's own counts: (serve won, serve played, return won, return played)
            sv_stats = (row[label+'_swon'],row[label+'_svpt'],row[label+'_rwon'],row[label+'_rpt'])

            # Opponent ability = opponent's adjustment + the row's 52-week average.
            # NOTE(review): when k == 0 the opponent's set_month(date) has not been
            # called yet in this iteration, while for k == 1 it has -- confirm that
            # adj_sr does not depend on that ordering.
            opp_r_ablty = players_stats[row[w_l[1-k]+'_name']].adj_sr[1] + avg_52_r
            opp_s_ablty = players_stats[row[w_l[1-k]+'_name']].adj_sr[0] + avg_52_s
            # opponent return ability scaled by this player's serve points, and
            # opponent serve ability scaled by this player's return points
            opp_stats = (opp_r_ablty * row[label + '_svpt'], opp_s_ablty * row[label + '_rpt'])
            # 6-tuple: the four own counts followed by the two opponent-scaled values
            match_stats[k] = sv_stats + opp_stats

    # break before updating player stats
    if row['p0_name'] == 'Mikhail Youzhny' and row['p1_name'] == 'Nick Kyrgios':
        break    
        
    # update players' adjusted scores based on pre-match adjusted ratings
    for k,label in enumerate(w_l):
        # if is_valid(match_stats):
        # NOTE(review): is_valid() receives the whole two-slot list, not just slot k;
        # its exact criterion is defined outside this notebook.
        if validate(row, label) and is_valid(match_stats):
            players_stats[row[label+'_name']].update(date,match_stats[k])

In [12]:
# Break down the tracker state held for Mikhail Youzhny.
# NOTE(review): the columns of `last_year` presumably follow the 6-tuples passed
# to update(): 0 serve won, 1 serve played, 2 return won, 3 return played,
# 4 and 5 the opponent-scaled values -- confirm in adj_stats_52.
self = players_stats['Mikhail Youzhny']

history = self.last_year
s_pt = np.sum(history[:,1])
r_pt = np.sum(history[:,3])

# raw and opponent-adjusted rates on serve ...
f_i = np.sum(history[:,0])/s_pt
f_adj = 1 - np.sum(history[:,4])/s_pt
# ... and on return
g_i = np.sum(history[:,2])/r_pt
g_adj = 1 - np.sum(history[:,5])/r_pt

summary_lines = [
    ('s_pt: ', s_pt),
    ('r_pt: ', r_pt),
    ('1 - f_adj: ', 1 - f_adj),
    ('f_adj * s_pt: ', f_adj * s_pt),
    ('adj serve stat: ', self.adj_sr[0]),
    ('1 - g_adj: ', 1 - g_adj),
    ('g_adj * r_pt: ', g_adj * r_pt),
    ('adj return stat: ', self.adj_sr[1]),
]
for caption, number in summary_lines:
    print(caption, number)

s_pt:  2960.0
r_pt:  2947.0
1 - f_adj:  0.4332356726830806
f_adj * s_pt:  1677.6224088580814
adj serve stat:  0.050803240250648196
1 - g_adj:  0.6982281449280324
g_adj * r_pt:  889.3216568970885
adj return stat:  0.08675885412382478


In [13]:
# Break down the tracker state held for Nick Kyrgios.
# NOTE(review): the columns of `last_year` presumably follow the 6-tuples passed
# to update(): 0 serve won, 1 serve played, 2 return won, 3 return played,
# 4 and 5 the opponent-scaled values -- confirm in adj_stats_52.
self = players_stats['Nick Kyrgios']

def _column_total(col_idx):
    """Sum one column of the tracker's `last_year` table."""
    return np.sum(self.last_year[:,col_idx])

s_pt, r_pt = _column_total(1), _column_total(3)
f_i = _column_total(0)/s_pt
f_adj = 1 - _column_total(4)/s_pt
g_i = _column_total(2)/r_pt
g_adj = 1 - _column_total(5)/r_pt

# point totals
print('s_pt: ', s_pt)
print('r_pt: ', r_pt)

# serve side
print('1 - f_adj: ', 1 - f_adj)
print('f_adj * s_pt: ', f_adj * s_pt)
print('adj serve stat: ', self.adj_sr[0])

# return side
print('1 - g_adj: ', 1 - g_adj)
print('g_adj * r_pt: ', g_adj * r_pt)
print('adj return stat: ', self.adj_sr[1])

s_pt:  1370.0
r_pt:  1323.0
1 - f_adj:  0.4485836524284619
f_adj * s_pt:  755.4403961730072
adj serve stat:  0.10551795899780492
1 - g_adj:  0.7236624232639385
g_adj * r_pt:  365.5946140218094
adj return stat:  0.044146172319116106


## Part 3: Gael Monfils vs. Kei Nishikori (Olympics)

In [14]:
# Pick out the Olympics row(s) where Gael Monfils is p0 and Kei Nishikori is p1,
# and show the case-study columns plus the Elo-induced serve estimates.
current_match_index = (
    (df['p0_name'] == 'Gael Monfils')
    & (df['p1_name'] == 'Kei Nishikori')
    & (df['tny_name'] == 'Olympics')
)
subDf = df[current_match_index]
shown_columns = columns + ['p0_s_kls_elo', 'p1_s_kls_elo']
subDf[shown_columns]

Unnamed: 0,p0_52_swon,p0_52_svpt,p0_52_rwon,p0_52_rpt,p1_52_swon,p1_52_svpt,p1_52_rwon,p1_52_rpt,p0_s_kls,p1_s_kls,...,p1_52_s_adj,p0_s_adj_kls,p1_s_adj_kls,p0_elo_538,p1_elo_538,avg_52_s,tny_stats,tny_name,p0_s_kls_elo,p1_s_kls_elo
158451,2345.0,3533.0,1433.0,3608.0,3309.0,5069.0,2103.0,5229.0,0.62633,0.620387,...,0.100497,0.603731,0.620936,2140.559471,2295.556094,0.635231,0.635231,Olympics,0.601963,0.645494


In [15]:
# Match probability from the plain serve estimates, as returned by matchProb.
p0_kls = subDf['p0_s_kls'].values[0]
p1_kls = subDf['p1_s_kls'].values[0]
print(matchProb(p0_kls, 1 - p1_kls))

# Complement of the match probability from the Elo-induced serve estimates.
# NOTE(review): this line prints 1 - matchProb(...) while the line above prints
# matchProb(...) directly, so the two numbers may refer to different players --
# confirm that this is intended.
p0_kls_elo = subDf['p0_s_kls_elo'].values[0]
p1_kls_elo = subDf['p1_s_kls_elo'].values[0]
print(1 - matchProb(p0_kls_elo, 1 - p1_kls_elo))

0.530036323749347
0.7093506224496224


In [16]:
# Feed the match probability implied by the Elo-induced serve estimates, together
# with (sum of both plain serve estimates) - 1, into kovalchik_induced_s and
# display the pair it returns.
p0_serve_kls = subDf['p0_s_kls'].values[0]
p1_serve_kls = subDf['p1_s_kls'].values[0]
overall_serve = p0_serve_kls + p1_serve_kls

win_prob = matchProb(
    subDf['p0_s_kls_elo'].values[0],
    1 - subDf['p1_s_kls_elo'].values[0],
)

kovalchik_induced_s(win_prob, overall_serve - 1)

(0.6016008080005122, 0.6451166110894735)