In [6]:
# test if adjusted stats script works
from helper_functions import *
from data_functions import *
import pandas as pd
import numpy as np
import re
import math
import copy

# ---- run configuration ----
TOUR = 'atp'        # tour key: used in the data paths and passed to normalize_name()
COUNT = False       # not referenced in the visible cells
START_YEAR = 2010   # stats below are generated starting two seasons before this year
ONLY_PBP = 1        # truthy -> a later cell keeps only matches that have point-by-point data
pd.options.mode.chained_assignment = None  # turn off pandas' chained-assignment warning

# read one match file per season (1968 through 2017) and stack them into a single frame
season_prefix = "../tennis_data/" + TOUR + "/" + TOUR
atp_year_list = [pd.read_csv(season_prefix + "_matches_{0}.csv".format(season))
                 for season in range(1968, 2018)]
df = pd.concat(atp_year_list, ignore_index=True)

# these may be changes specific to the atp dataframe; normalize_name() is specific to atp/wta
short_names = {'winner_name': 'w_name', 'loser_name': 'l_name', 'tourney_id': 'tny_id',
               'tourney_name': 'tny_name', 'tourney_date': 'tny_date'}
df = df.rename(columns=short_names)
for name_col in ('w_name', 'l_name'):
    df[name_col] = [normalize_name(raw, tour=TOUR) for raw in df[name_col]]

# collapse every tournament whose name contains 'Davis Cup' into one label, then
# apply the tournament renames below (in this order)
tny_renames = (('Australian Chps.', 'Australian Open'),
               ('Australian Open-2', 'Australian Open'),
               ('U.S. National Chps.', 'US Open'))
tny_names = []
for raw in df['tny_name']:
    label = 'Davis Cup' if 'Davis Cup' in raw else raw
    for old, new in tny_renames:
        label = label.replace(old, new)
    tny_names.append(label)
df['tny_name'] = tny_names

# score markers handed to format_match_df() as ret_strings / abd_strings
ret_strings = ('ABN','DEF','In Progress','RET','W/O',' RET',' W/O','nan','walkover')
abd_strings = ('abandoned','ABN','ABD','DEF','def','unfinished','Walkover')
atp_all_matches = format_match_df(df, ret_strings=ret_strings, abd_strings=abd_strings)

# index label of the first match played in START_YEAR-2 or later; the stats
# generators below receive it as their starting point
from_cutoff = atp_all_matches['match_year'] >= START_YEAR - 2
start_ind = atp_all_matches[from_cutoff].index[0]

# elo ratings: one pass with second argument 0, one with 1
# NOTE(review): presumably constant K vs dynamic K (per the original comment) - confirm in generate_elo()
for k_flag in (0, 1):
    atp_all_matches = generate_elo(atp_all_matches, k_flag)

# generate_52_stats, generate_52_adj_stats and generate_tny_stats, applied in that order
for add_stats in (generate_52_stats, generate_52_adj_stats, generate_tny_stats):
    atp_all_matches = add_stats(atp_all_matches, start_ind)

In [7]:
# Combine all the matches that have pbp (point-by-point) information into one dataframe
# and clean up columns in preparation for merging with atp_all_matches
pbp_matches_archive = pd.read_csv("../tennis_data/pbp_matches_atp_main_archive.csv")
pbp_matches_archive_old = pd.read_csv("../tennis_data/pbp_matches_atp_main_archive_old.csv")
pbp_matches_current = pd.read_csv("../tennis_data/pbp_matches_atp_main_current.csv")

# only rows labelled 0..932 of the old archive are used (.loc slicing includes the end label)
# NOTE(review): 932 is a magic number - presumably later rows overlap the newer archive; confirm
pbp_matches = pd.concat([pbp_matches_archive_old.loc[:932], pbp_matches_archive,
                         pbp_matches_current])
# shift the winner codes down by one
# NOTE(review): presumably from 1/2 to 0/1 - confirm against the pbp files
pbp_matches['winner'] = pbp_matches['winner'] - 1
# the concatenated pieces each carry their own index; replace it with 0..n-1
pbp_matches = pbp_matches.reset_index(drop=True)
pbp_matches = format_pbp_df(pbp_matches, tour=TOUR)


# dictionary with each key as 'w_name'+'l_name'+'match_year'+'score' to connect pbp
# strings to atp_all_matches (parenthesised tiebreak terms were removed from the scores);
# every listed key starts with a count of 0
duplicates = ['Janko Tipsarevic Kei Nishikori 2011 6-4 6-4','Robin Soderling Michael Berrer 2011 6-3 7-6',
        'Juan Martin Kevin Anderson 2011 6-4 6-4','Philipp Kohlschreiber Mikhail Youzhny 2011 6-4 6-2',
        'Philipp Kohlschreiber Olivier Rochus 2012 6-1 6-4','Viktor Troicki Radek Stepanek 2012 2-6 6-4 6-3',
        'Gilles Simon Grigor Dimitrov 2012 6-3 6-3','Alexandr Dolgopolov Gilles Simon 2012 6-3 6-4',
        'Fabio Fognini Tommy Haas 2013 6-2 6-4','Richard Gasquet Florian Mayer 2013 6-3 7-6',
        'Novak Djokovic Rafael Nadal 2013 6-3 6-4','Tomas Berdych Gael Monfils 2015 6-1 6-4',
        'Novak Djokovic Rafael Nadal 2015 6-3 6-3']
collision_d = dict.fromkeys(duplicates, 0)

# connects the two dataframes on match keys and reformats columns from w/l to p0/p1
# NOTE(review): start_year is hard-coded to 2000 here, independent of START_YEAR
player_cols = ['_name','_elo','_sf_elo','_elo_538','_sf_elo_538','_52_swon','_52_svpt','_52_rwon',
               '_52_rpt','_sf_52_swon','_sf_52_svpt','_sf_52_rwon','_sf_52_rpt','_52_s_adj','_52_r_adj']
df = connect_df(match_df=atp_all_matches, pbp_df=pbp_matches, col_d=collision_d,
                player_cols=player_cols, start_year=2000)

# player-0 minus player-1 rating differences; Series subtraction pairs rows by
# index label, so it does not depend on df having a 0..n-1 index
elo_diff_sources = (('elo_diff', 'elo'), ('sf_elo_diff', 'sf_elo'),
                    ('elo_diff_538', 'elo_538'), ('sf_elo_diff_538', 'sf_elo_538'))
for diff_col, rating in elo_diff_sources:
    df[diff_col] = df['p0_' + rating] - df['p1_' + rating]

# ONLY_PBP truthy: keep rows whose 'pbp' is not the string 'None';
# otherwise keep rows whose 'winner' is not the string 'None'
filter_col = 'pbp' if ONLY_PBP else 'winner'
df = df[df[filter_col] != 'None'].reset_index(drop=True)

# NOTE(review): the result is bound to test_df, yet the next cell reads *_JS columns
# from df - presumably generate_JS_stats also adds them to df in place; confirm
cols = ['52_s_adj','52_r_adj']
test_df = generate_JS_stats(df, cols)

collision
Viktor Troicki Radek Stepanek 2012 2-6 6-4 6-3 5 5
collision
Fabio Fognini Tommy Haas 2013 6-2 6-4 7 7
10670
52_s_adj 0.0460380635894
52_r_adj 0.0436543168683


In [8]:
# keep relevant columns, in the order they are written to the csv
keep_cols = ['tny_id', 'tny_name', 'surface', 'tny_date', 'match_year', 'match_month',
             'p0_name', 'p1_name', 'p0_elo',
             'p1_elo', 'p0_sf_elo', 'p1_sf_elo', 'p0_elo_538', 'p1_elo_538',
             'p0_sf_elo_538', 'p1_sf_elo_538', 'p0_52_swon', 'p0_52_svpt',
             'p1_52_swon', 'p1_52_svpt', 'p0_52_rwon', 'p0_52_rpt',
             'p1_52_rwon', 'p1_52_rpt',
             'elo_diff', 'sf_elo_diff',
             'elo_diff_538', 'sf_elo_diff_538',
             'p0_s_pct', 'p0_r_pct', 'p1_s_pct', 'p1_r_pct',
             'p0_s_pct_JS', 'p1_s_pct_JS', 'p0_r_pct_JS', 'p1_r_pct_JS',
             'p0_sf_52_swon', 'p0_sf_52_svpt', 'p1_sf_52_swon', 'p1_sf_52_svpt',
             'p0_sf_52_rwon', 'p0_sf_52_rpt', 'p1_sf_52_rwon', 'p1_sf_52_rpt',
             'p0_sf_s_pct', 'p0_sf_r_pct', 'p1_sf_s_pct', 'p1_sf_r_pct',
             'p0_sf_s_pct_JS', 'p1_sf_s_pct_JS', 'p0_sf_r_pct_JS', 'p1_sf_r_pct_JS',
             'p0_52_s_adj', 'p0_52_r_adj', 'p1_52_s_adj', 'p1_52_r_adj',
             'p0_52_s_adj_JS', 'p0_52_r_adj_JS', 'p1_52_s_adj_JS', 'p1_52_r_adj_JS',
             'avg_52_s', 'avg_52_r', 'sf_avg_52_s', 'sf_avg_52_r',
             'tny_stats', 'best_of', 'score', 'pbp', 'winner']
# .copy() so the column assignments below act on an independent frame, not a slice
df = df[keep_cols].copy()

# binary indicator for whether player 0 won
# NOTE(review): 'winner' is flipped in place, so running this cell a second time
# without re-running the previous one flips it back
df['winner'] = [1 - winner for winner in df['winner']]

df['match_id'] = range(len(df))

# wherever tny_stats is 0, substitute that row's avg_52_s
df['tny_stats'] = df['tny_stats'].where(df['tny_stats'] != 0, df['avg_52_s'])

# serving probabilities for the Klaassen-Magnus model:
#   tny_stats + (server's serve pct - average serve) - (returner's return pct - average return)
# built overall ('') and from the 'sf_' columns, each without and with the '_JS' columns
matchups = (('p0_', 'p1_'), ('p1_', 'p0_'))
for sf in ('', 'sf_'):
    for js in ('', '_JS'):
        for server, returner in matchups:
            df[server + sf + 's_kls' + js] = (
                df['tny_stats']
                + (df[server + sf + 's_pct' + js] - df[sf + 'avg_52_s'])
                - (df[returner + sf + 'r_pct' + js] - df[sf + 'avg_52_r']))

# same construction from the adjusted ('_52_s_adj' / '_52_r_adj') columns
# NOTE(review): here the averages are ADDED to the adjusted stats, whereas the
# formulas above subtract them - confirm this sign is intended
for js in ('', '_JS'):
    for server, returner in matchups:
        df[server + 's_kls_adj' + js] = (
            df['tny_stats']
            + (df[server + '52_s_adj' + js] + df['avg_52_s'])
            - (df[returner + '52_r_adj' + js] + df['avg_52_r']))

# depending on ONLY_PBP, the saved frame holds only point-by-point matches or all
# matches that survived the previous cell's filter
name = 'elo_pbp_with_surface_9_16' if ONLY_PBP else 'elo_atp_matches_21st_century_9_12'
df.to_csv('../my_data/' + name + '.csv')
# report only after the write has succeeded
print(name + '.csv saved to my_data')

elo_pbp_with_surface_9_16.csv saved to my_data


In [5]:
# Show the (rows, columns) dimensions of df.
# NOTE(review): this cell's execution count (5) is lower than that of the cells
# above (6-8), so the recorded output predates their latest run - re-run to refresh.
df.shape

(51550, 82)

In [31]:
# winner/loser column names, interleaved per stat: w_<stat>, l_<stat>, w_<next stat>, ...
stat_suffixes = ('_elo_538', '_sf_elo_538', '_52_swon', '_52_svpt', '_52_rwon',
                 '_52_rpt', '_sf_52_swon', '_sf_52_svpt', '_sf_52_rwon', '_sf_52_rpt')
cols = []
for suffix in stat_suffixes:
    cols.extend([side + suffix for side in ('w', 'l')])

In [32]:
# adjusted serve/return stat columns: both winner columns first, then both loser columns
adj_cols = [side + stat
            for side in (u'w', u'l')
            for stat in (u'_52_s_adj', u'_52_r_adj')]