In [2]:
# NEXT STEPS: assemble players win percentages from any set score (how well they do from one
# set up, one down, came back to split, gave up to split) throughout career; can represent 
# this as a beta distribution
# with prior as the average player of that current elo's probability with strength 5-10

# TO DO: assemble running tallies to track serve percentages over the past 52 weeks, specifically for years 2010-current
#        also, compare and contrast the "old" and "new" files

# 7/26 update: seem to have fixed some glitches in my previous adaptation of 538's elo method
# everything is very simple in elo_538.py; also fixed the match order within tournaments in atp_all_matches

# note: we improved from 9563 to 10000 matches with string manipulations
# my elo scores seem close to Jeff Sackman's when I set k=32 as constant; when using the 538 style of "cooling" k-factor,
# those elos are going to be inflated in comparison (but keep in mind, that's only different across methods)
AO = 'Australian Open'
USO = 'US Open'
COUNT = False

In [3]:
from helper_functions import *
import pandas as pd
import numpy as np
import datetime
import math
import elo_538 as elo
import re
import copy

atp_year_list = []
for i in xrange(1968,2018):
    atp_year_list.append(pd.read_csv("../tennis_data/atp/atp_matches_{0}.csv".format(i)))
atp_all_matches = pd.concat(atp_year_list, ignore_index = True)

# Set up the pbp (point by point) column that will contain the pbp information for matches we have info on
atp_all_matches['pbp'] = [None]*len(atp_all_matches)
atp_all_matches = atp_all_matches.rename(columns={'winner_name':'w_name','loser_name':'l_name'})
atp_all_matches['w_name'] = [normalize_name(x) for x in atp_all_matches['w_name']]
atp_all_matches['l_name'] = [normalize_name(x) for x in atp_all_matches['l_name']]
atp_all_matches['tourney_name'] = ['Davis Cup' if 'Davis Cup' in s else s for s in atp_all_matches['tourney_name']]
atp_all_matches['tourney_name'] = [s.replace('Australian Chps.',AO).replace('Australian Open-2',AO).replace('U.S. National Chps.',USO) \
                                   for s in atp_all_matches['tourney_name']]
grand_slam_d = dict(zip(['Australian Open','Roland Garros','Wimbledon','US Open'],[1]*4))
atp_all_matches['is_gs'] = [name in grand_slam_d for name in atp_all_matches['tourney_name']]

# Get dates into the same format
atp_all_matches['tourney_date'] = [datetime.datetime.strptime(str(x), "%Y%m%d").date() for x in atp_all_matches['tourney_date']]
atp_all_matches['match_year'] = [x.year for x in atp_all_matches['tourney_date']]
atp_all_matches['match_month'] = [x.month for x in atp_all_matches['tourney_date']]
atp_all_matches['score'] = [re.sub(r"[\(\[].*?[\)\]]", "", str(s)) for s in atp_all_matches['score']] # str(s) fixes any nans
atp_all_matches['score'] = ['RET' if 'RET' in s else s for s in atp_all_matches['score']]
atp_all_matches['w_swon'] = [atp_all_matches['w_1stWon'][i]+atp_all_matches['w_2ndWon'][i] for i in xrange(len(atp_all_matches))]
atp_all_matches['l_swon'] = [atp_all_matches['l_1stWon'][i]+atp_all_matches['l_2ndWon'][i] for i in xrange(len(atp_all_matches))]
atp_all_matches['w_rwon'] = atp_all_matches['l_svpt'] - atp_all_matches['l_swon']
atp_all_matches['l_rwon'] = atp_all_matches['w_svpt'] - atp_all_matches['w_swon']
atp_all_matches['w_rpt'] = atp_all_matches['l_svpt']
atp_all_matches['l_rpt'] = atp_all_matches['w_svpt']

# get rid of leading 0s in tourney_id
atp_all_matches['tourney_id'] = ['-'.join(t.split('-')[:-1] + [t.split('-')[-1][1:]]) if t.split('-')[-1][0]=='0' else t \
                                for t in atp_all_matches['tourney_id']]

# get rid of matches involving a retirement
atp_all_matches['score'] = ['ABN' if score.split(' ')[-1] in ('abandoned','ABN','ABD','DEF','def','unfinished','Walkover') \
                            else score for score in atp_all_matches['score']]
ret_strings = ['ABN','DEF','In Progress','RET','W/O',' RET',' W/O','nan','walkover']
ret_d = dict(zip(ret_strings,[1]*len(ret_strings)))
atp_all_matches = atp_all_matches.loc[[i for i in range(len(atp_all_matches)) if atp_all_matches['score'][i] not in ret_d]]
atp_all_matches = atp_all_matches.sort_values(by=['tourney_date','tourney_name','match_num'], ascending=True).reset_index()
del atp_all_matches['index']
atp_all_matches = atp_all_matches[['tourney_id','tourney_name','tourney_date','match_num','surface','is_gs','w_name','l_name',
                'winner_id','loser_id','best_of','match_year','match_month','score','w_swon','w_svpt','l_swon','l_svpt']]
print atp_all_matches.shape
atp_all_matches.head(2)

(159505, 18)


Unnamed: 0,tourney_id,tourney_name,tourney_date,match_num,surface,is_gs,w_name,l_name,winner_id,loser_id,best_of,match_year,match_month,score,w_swon,w_svpt,l_swon,l_svpt
0,1968-580,Australian Open,1968-01-19,1,Grass,True,Richard Coulthard,Max Senior,110023,107760,5,1968,1,12-10 7-5 4-6 7-5,,,,
1,1968-580,Australian Open,1968-01-19,2,Grass,True,John Brown,Ernie Mccabe,109803,106964,5,1968,1,6-3 6-2 6-4,,,,


In [4]:
# find the overall serve/return percentages; also by tournament
sv_pct = np.sum(atp_all_matches['w_swon']+atp_all_matches['l_swon'])/np.sum(atp_all_matches['w_svpt']+atp_all_matches['l_svpt'])
print 'avg serve pct: ', sv_pct
print 'avg return pct: ', 1-sv_pct

avg serve pct:  0.624189285218
avg return pct:  0.375810714782


## Rating Systems

1. http://gobase.org/studying/articles/elo/
2. http://www.gamefaqs.com/boards/610657-dota-2/67994646
3. https://github.com/sublee/elo/blob/master/elo.py
4. http://stephenwan.net/thoughts/2012/10/02/elo-rating-system.html
5. https://deltadata.wordpress.com/2014/01/11/glicko-2-for-tennis-part-2-the-model/comment-page-1/#comment-192
6. https://github.com/sublee/glicko2/blob/master/glicko2.py
7. http://trueskill.org/
8. https://pypi.python.org/pypi/trueskill


In [5]:
### Make a dict of players with elo ratings ###
# now, adding three more surface-specific dictionaries for clay, grass, hard
players_list = np.union1d(atp_all_matches.w_name, atp_all_matches.l_name)
players_elo = dict(zip(list(players_list), [elo.Rating() for __ in range(len(players_list))]))
surface_elo = {}
for surface in ('Hard','Clay','Grass'):
    surface_elo[surface] = dict(zip(list(players_list), [elo.Rating() for __ in range(len(players_list))])) 

elo_1s, elo_2s = [],[]
surface_elo_1s, surface_elo_2s = [],[]
elo_obj = elo.Elo_Rater()

# update player elo from every recorded atp match
for i, row in atp_all_matches.iterrows():
    surface = row['surface']; is_gs = row['is_gs']
    # append elos, rate, update
    w_elo,l_elo = players_elo[row['w_name']],players_elo[row['l_name']]
    elo_1s.append(w_elo.value);elo_2s.append(l_elo.value)    
    elo_obj.rate_1vs1(w_elo,l_elo,is_gs,counts=COUNT)

    
    surface_elo_1s.append(surface_elo[surface][row['w_name']].value if surface in ('Hard','Clay','Grass') else w_elo.value)
    surface_elo_2s.append(surface_elo[surface][row['l_name']].value if surface in ('Hard','Clay','Grass') else l_elo.value)
    if surface in ('Hard','Clay','Grass'):
        new_elo1, new_elo2 = elo_obj.rate_1vs1(surface_elo[surface][row['w_name']],surface_elo[surface][row['l_name']],is_gs,counts=COUNT)

# add columns
atp_all_matches['w_elo'], atp_all_matches['l_elo'] = elo_1s, elo_2s
atp_all_matches['w_s_elo'], atp_all_matches['l_s_elo'] = surface_elo_1s, surface_elo_2s

In [10]:
# track player match stats for every match since 2009 (we only need these for pbp matches)
start_ind = 136146; atp_l = len(atp_all_matches)
players_stats = {}
# an array containing 2x1 arrays for winner and loser's previous 12-month serve performance over all matches in df
match_52_stats = np.zeros([2,atp_l,4])
w_l = ['w','l']
for i, row in atp_all_matches.loc[start_ind:].iterrows():    
    date = row['match_year'],row['match_month']           
    for k,label in enumerate(w_l):
        if row[label+'_name'] not in players_stats:
            players_stats[row[label+'_name']] = stats_52(date)
        # store serving stats prior to match
        match_52_stats[k][i] = np.sum(players_stats[row[label+'_name']].last_year,axis=0)
        # update serving stats if no
        if row[label+'_swon']==row[label+'_swon'] and row[label+'_svpt']==row[label+'_svpt']:    
            match_stats = (row[label+'_swon'],row[label+'_svpt'],row[w_l[1-k]+'_svpt']-row[w_l[1-k]+'_swon'],row[w_l[1-k]+'_svpt'])
            players_stats[row[label+'_name']].update(date,match_stats)
        match_52_stats[k][i] = np.sum(players_stats[row[label+'_name']].last_year,axis=0)

for k,label in enumerate(w_l):
    atp_all_matches[label+'_52_swon'] = match_52_stats[k][:,0]
    atp_all_matches[label+'_52_svpt'] = match_52_stats[k][:,1]
    atp_all_matches[label+'_52_rwon'] = match_52_stats[k][:,2]
    atp_all_matches[label+'_52_rpt'] = match_52_stats[k][:,3]

In [11]:
tourney_stats = {}
tourney_52_stats = np.zeros(atp_l)
for i, row in atp_all_matches.loc[start_ind:].iterrows():
    if row['tourney_name']=='Davis Cup' or row['w_swon']!=row['w_swon']:
        continue
    
    year,t_id = row['tourney_id'].split('-')
    year = int(year)
    match_stats = (row['w_swon']+row['l_swon'],row['w_svpt']+row['l_svpt'])
    if row['w_swon']!=row['w_swon']:
        tourney_52_stats[i] = 0 if tourney_stats[t_id].tourney_stats[1][1]==0 else \
                            tourney_stats[t_id].tourney_stats[1][0]/float(self.tourney_stats[1][1])
        
    elif t_id in tourney_stats:
        tourney_52_stats[i] = tourney_stats[t_id].update(year,match_stats)
    else:
        tourney_stats[t_id] = tourney_52(year)
        tourney_52_stats[i] = tourney_stats[t_id].update(year,match_stats)
    
atp_all_matches['tourney_stats'] = tourney_52_stats

In [8]:
atp_all_matches

Unnamed: 0,tourney_id,tourney_name,tourney_date,match_num,surface,is_gs,w_name,l_name,winner_id,loser_id,...,l_swon,l_svpt,w_elo,l_elo,w_s_elo,l_s_elo,w_52_swon,w_52_svpt,w_52_rwon,w_52_rpt
0,1968-580,Australian Open,1968-01-19,1,Grass,True,Richard Coulthard,Max Senior,110023,107760,...,,,1500.000000,1500.000000,1500.000000,1500.000000,0.0,0.0,0.0,0.0
1,1968-580,Australian Open,1968-01-19,2,Grass,True,John Brown,Ernie Mccabe,109803,106964,...,,,1500.000000,1500.000000,1500.000000,1500.000000,0.0,0.0,0.0,0.0
2,1968-580,Australian Open,1968-01-19,3,Grass,True,Ross Case,Gondo Widjojo,100257,110024,...,,,1500.000000,1500.000000,1500.000000,1500.000000,0.0,0.0,0.0,0.0
3,1968-580,Australian Open,1968-01-19,4,Grass,True,Allan Stone,Robert Layton,100105,110025,...,,,1500.000000,1500.000000,1500.000000,1500.000000,0.0,0.0,0.0,0.0
4,1968-580,Australian Open,1968-01-19,5,Grass,True,Warren Jacques,Bert Kearney,109966,110026,...,,,1500.000000,1500.000000,1500.000000,1500.000000,0.0,0.0,0.0,0.0
5,1968-580,Australian Open,1968-01-19,6,Grass,True,Max Pettman,Takesji Tsujimoto,107759,110027,...,,,1500.000000,1500.000000,1500.000000,1500.000000,0.0,0.0,0.0,0.0
6,1968-580,Australian Open,1968-01-19,7,Grass,True,Mike Belkin,M Marchment,100101,110028,...,,,1500.000000,1500.000000,1500.000000,1500.000000,0.0,0.0,0.0,0.0
7,1968-580,Australian Open,1968-01-19,8,Grass,True,Barry Phillips,Tony Dawson,100025,108430,...,,,1500.000000,1500.000000,1500.000000,1500.000000,0.0,0.0,0.0,0.0
8,1968-580,Australian Open,1968-01-19,9,Grass,True,William Coghlan,Peter Oatey,108519,110029,...,,,1500.000000,1500.000000,1500.000000,1500.000000,0.0,0.0,0.0,0.0
9,1968-580,Australian Open,1968-01-19,10,Grass,True,Geoff Pollard,Christian Janssens,109799,110030,...,,,1500.000000,1500.000000,1500.000000,1500.000000,0.0,0.0,0.0,0.0


In [13]:
rankings = sorted(zip(players_elo.keys(),players_elo.values()),key=lambda x: x[1].value,reverse=True)
#rankings

In [14]:
# Combine all the matches that have pbp (point by point) information into one dataframe
# and clean up columns in preparation for merging with all_atp_matches
pbp_matches_archive = pd.read_csv("../tennis_data/pbp_matches_atp_main_archive.csv")
pbp_matches_archive_old = pd.read_csv("../tennis_data/pbp_matches_atp_main_archive_old.csv")
pbp_matches_current = pd.read_csv("../tennis_data/pbp_matches_atp_main_current.csv")
pbp_matches = pd.concat([pbp_matches_archive_old.loc[:932],pbp_matches_archive,pbp_matches_current])
pbp_matches.winner = pbp_matches.winner - 1
pbp_matches = pbp_matches.reset_index(); del pbp_matches['index']
pbp_matches['w_name'] = np.where(pbp_matches['winner'] == 0, pbp_matches['server1'], pbp_matches['server2'])
pbp_matches['l_name'] = np.where(pbp_matches['winner'] == 0, pbp_matches['server2'], pbp_matches['server1'])
pbp_matches['w_name'] = [normalize_name(x) for x in pbp_matches['w_name']]
pbp_matches['l_name'] = [normalize_name(x) for x in pbp_matches['l_name']]
pbp_matches['date'] = pd.to_datetime(pbp_matches['date'])
pbp_matches['match_year'] = [x.year for x in pbp_matches['date']]
pbp_matches['match_month'] = [x.month for x in pbp_matches['date']]
pbp_matches['date'] = [x.date() for x in pbp_matches['date']]
pbp_matches['score'] = [re.sub(r"[\(\[].*?[\)\]]", "", s) for s in pbp_matches['score']]

print pbp_matches.shape
print len(set(pbp_matches['pbp'].values))

(12086, 16)
11877


In [15]:
# Now, just make a dictionary with each key as 'w_name'+'l_name'+'match_year'+'score' to connect pbp 
# strings to atp_all_matches
# would add match_month but this loses 800-900 matches
# replacing dashes with spaces accounted for over 1000 more pbp matches
# other discrepancies due to tiebreak scores, or matches in pbp being qualies, therefore
# not tour-level matches
# MUST REMOVE PARENTHESES TERMS FROM TIE-BREAKS IN SCORES

collision_d = dict(zip(['Janko Tipsarevic Kei Nishikori 2011 6-4 6-4','Robin Soderling Michael Berrer 2011 6-3 7-6',
                      'Juan Martin Kevin Anderson 2011 6-4 6-4','Philipp Kohlschreiber Mikhail Youzhny 2011 6-4 6-2',
                      'Philipp Kohlschreiber Olivier Rochus 2012 6-1 6-4','Viktor Troicki Radek Stepanek 2012 2-6 6-4 6-3',
                      'Gilles Simon Grigor Dimitrov 2012 6-3 6-3','Alexandr Dolgopolov Gilles Simon 2012 6-3 6-4',
                      'Fabio Fognini Tommy Haas 2013 6-2 6-4','Richard Gasquet Florian Mayer 2013 6-3 7-6',
                      'Novak Djokovic Rafael Nadal 2013 6-3 6-4','Tomas Berdych Gael Monfils 2015 6-1 6-4',
                      'Novak Djokovic Rafael Nadal 2015 6-3 6-3'],[0]*13))
pbp_dict = {}; winner_dict = {}
for i in xrange(len(pbp_matches)):
    key = pbp_matches['w_name'][i] +' ' +  pbp_matches['l_name'][i] + ' ' \
        + str(pbp_matches['match_year'][i]) + ' ' + pbp_matches['score'][i]
    key = key+' '+str(pbp_matches['match_month'][i]) if key in collision_d else key
    if key in pbp_dict:
        continue
    pbp_dict[key] = pbp_matches['pbp'][i]
    winner_dict[key] = pbp_matches['winner'][i]

In [16]:
# in case of a collision (about 10 cases), I only take the first match with that key
c = 0
pbps = []
winners = []
info = {}

for i in xrange(len(atp_all_matches)):
    key = atp_all_matches['w_name'][i] +' ' +  atp_all_matches['l_name'][i] + ' ' \
        +str(atp_all_matches['match_year'][i])+' '+atp_all_matches['score'][i]
    key = key+' '+str(atp_all_matches['match_month'][i]) if key in collision_d else key
    if key in pbp_dict:
        c += 1
        pbps.append(pbp_dict[key])
        winners.append(winner_dict[key])
        if key in info:
            pbps[-1] = 'NA'; winners[-1] = 'NA'
            print 'collision'; print key + ' ' + str(atp_all_matches['match_month'][i])
        info[key] = 1
    else:
        pbps.append('NA')
        winners.append('NA')
print c
atp_all_matches['pbp'] = pbps
atp_all_matches['winner'] = winners

collision
Viktor Troicki Radek Stepanek 2012 2-6 6-4 6-3 5 5
collision
Fabio Fognini Tommy Haas 2013 6-2 6-4 7 7
10670


In [80]:
# #demo of match-keys I created 
# a lot of the inconsistencies are challenger-level events or even exhibition matches (see 2017)
# for i in xrange(len(pbp_matches)):
#     key = pbp_matches['w_name'][i] +' ' +  pbp_matches['l_name'][i] + ' ' \
#         + str(pbp_matches['match_year'][i])+ ' ' + pbp_matches['score'][i]
#     if key not in info:
#         print i,key

In [17]:
df = atp_all_matches[atp_all_matches['pbp']!='NA']
cols = df.columns.drop(['loser_id','winner_id'])
df = df[cols]
df = df.reset_index()
del df['index']

# SET UP LOOP TO CHANGE W,L TO P0,P1
for col in ['_name','_elo','_s_elo','_52_swon','_52_svpt','_52_rwon','_52_rpt']:
    df['p0'+col] = [df['l'+col][i] if df['winner'][i] else df['w'+col][i] for i in xrange(len(df))]
    df['p1'+col] = [df['w'+col][i] if df['winner'][i] else df['l'+col][i] for i in xrange(len(df))]

df['elo_diff'] = [df['p0_elo'][i] - df['p1_elo'][i] for i in xrange(len(df))]
df['s_elo_diff'] = [df['p0_s_elo'][i] - df['p1_s_elo'][i] for i in xrange(len(df))]
df['tourney_name'] = [s if s==s else 'Davis Cup' for s in df['tourney_name']]

In [86]:
#### James-Stein estimators for 52-week serve and return percentages ####
# calculate B_i coefficients for each player in terms of service points
p_hat = np.sum([df['p0_52_swon'],df['p1_52_swon']])/np.sum([df['p0_52_svpt'],df['p1_52_svpt']])
for label in ['p0','p1']:
    df[label+'_s_pct'] = [p_hat if x==0 else x for x in np.nan_to_num(df[label+'_52_swon']/df[label+'_52_svpt'])]
    df[label+'_r_pct'] = [1-p_hat if x==0 else x for x in np.nan_to_num(df[label+'_52_rwon']/df[label+'_52_rpt'])]

# fix this name later with find and replace...
s_history = np.concatenate([df['p0_52_swon']/df['p0_52_svpt'],df['p1_52_swon']/df['p1_52_svpt']],axis=0)
n = len(s_history)/2
group_var = np.var(s_history)
sigma2_i = np.nan_to_num(p_hat*(1-p_hat)/np.concatenate([df['p0_52_svpt'],df['p1_52_svpt']]))
tau2_hat = np.nanvar(s_history)
B_i = sigma2_i/(tau2_hat+sigma2_i)
df['B_i0_sv'],df['B_i1_sv'] = B_i[:n],B_i[n:]

s_history[s_history!=s_history] = p_hat
group_var = np.var(s_history)
df['p0_s_pct_JS'] = df['p0_s_pct']+df['B_i0_sv']*(p_hat-df['p0_s_pct'])
df['p1_s_pct_JS'] = df['p1_s_pct']+df['B_i1_sv']*(p_hat-df['p1_s_pct'])

# repeat for return averages (slightly different tau^2 value)
r_history = np.concatenate([df['p0_52_rwon']/df['p0_52_rpt'],df['p1_52_rwon']/df['p1_52_rpt']],axis=0)
sigma2_i = np.nan_to_num((1-p_hat)*p_hat/np.concatenate([df['p0_52_rpt'],df['p1_52_rpt']]))
tau2_hat = np.nanvar(r_history)
B_i = sigma2_i/(tau2_hat+sigma2_i)
df['B_i0_r'],df['B_i1_r'] = B_i[:n],B_i[n:]

r_history[r_history!=r_history] = 1-p_hat
df['p0_r_pct_JS'] = r_history[:n]+df['B_i0_r']*((1-p_hat)-r_history[:n])
df['p1_r_pct_JS'] = r_history[n:]+df['B_i1_r']*((1-p_hat)-r_history[n:])



In [98]:
# TO DO: check that these are correct...something is WRONG with the betas and sigma2s...
print p_hat,1-p_hat
df[['p0_52_swon','p0_52_svpt','p0_s_pct','p0_s_pct_JS','p1_52_swon','p1_52_svpt','p1_s_pct','p1_s_pct_JS', \
    'p0_52_rwon','p0_52_rpt','p0_r_pct','p0_r_pct_JS','p1_52_rwon','p1_52_rpt','p1_r_pct','p1_r_pct_JS']].head(2)

0.646775015628 0.353224984372


Unnamed: 0,p0_52_swon,p0_52_svpt,p0_s_pct,p0_s_pct_JS,p1_52_swon,p1_52_svpt,p1_s_pct,p1_s_pct_JS,p0_52_rwon,p0_52_rpt,p0_r_pct,p0_r_pct_JS,p1_52_rwon,p1_52_rpt,p1_r_pct,p1_r_pct_JS
0,63.0,92.0,0.684783,0.661407,1470.0,2350.0,0.625532,0.626782,38.0,116.0,0.327586,0.343856,750.0,2182.0,0.343721,0.344525
1,635.0,994.0,0.638833,0.639856,2849.0,4295.0,0.663329,0.662782,330.0,933.0,0.353698,0.353614,1624.0,4348.0,0.373505,0.372607


In [97]:
df = df[['tourney_id','tourney_name','tourney_date','p0_name','p1_name','p0_elo','p1_elo','elo_diff','p0_s_elo',\
         'p1_s_elo','s_elo_diff','p0_52_swon','p0_52_svpt','p0_s_pct','p0_s_pct_JS','p1_52_swon','p1_52_svpt', \
         'p1_s_pct','p1_s_pct_JS','p0_52_rwon','p0_52_rpt','p0_r_pct','p0_r_pct_JS','p1_52_rwon','p1_52_rpt', \
         'p1_r_pct','p1_r_pct_JS',     'tourney_stats','best_of','score','pbp','winner']]
df['winner'] = 1 - df['winner']
df.to_csv('../my_data/elo_pbp_with_surface_8_22.csv')

In [21]:
# tournament serving averages at the Australian Open
for key in sorted(tourney_stats['580'].historical_avgs.keys()):
    print key,': ',tourney_stats['580'].historical_avgs[key][0]/tourney_stats['580'].historical_avgs[key][1]

2009 :  0.626481155875
2010 :  0.622760488149
2011 :  0.623074885534
2012 :  0.61532346595
2013 :  0.635342621552
2014 :  0.636493282294
2015 :  0.64296875
2016 :  0.643896664578
2017 :  0.637466018028


In [33]:
# an example of a qualifying match in pbp_matches that will not be exported in elo_pbp.csv
z = pbp_matches[pbp_matches['server2']=='Jurgen Zopp']
z[z['server1']=='Jack Sock']

Unnamed: 0,pbp_id,date,tny_name,tour,draw,server1,server2,winner,pbp,score,adf_flag,wh_minutes,w_name,l_name,match_year,match_month
2492,3362769,2012-08-18,WinstonSalemOpen-ATPWinstonSalem,ATP,Main,Jack Sock,Jurgen Zopp,0,SRSSS;SSSS;RSSSS;SSSS;SRRSSS;SSSS;SSSRS;RSRSSR...,6-4 7-6,0,116,Jack Sock,Jurgen Zopp,2012,8


In [21]:
df.shape

(10675, 15)

In [98]:
below = df[df['elo_diff']<0]
above = df[df['elo_diff']>=0]

print (len(below[below['winner']==1])+len(above[above['winner']==0]))/float(len(df))

below = df[df['s_elo_diff']<0]
above = df[df['s_elo_diff']>=0]

print (len(below[below['winner']==1])+len(above[above['winner']==0]))/float(len(df))

0.696569178853
0.691038620172


In [None]:
# # accept dates in (year,month); last_year contains last 12 month stats, most recent to least
# # each month has a 4x1 row, containing 2x1 for serve and 2x1 for return
# class stats_52():
#     def __init__(self,date):
#         self.most_recent = date
#         self.last_year = np.zeros([12,4])
        
#     def time_diff(self,new_date,old_date):
#         return 12*(new_date[0]-old_date[0])+(new_date[1]-old_date[1])
    
#     def update(self,match_date,match_stats):
#         diff = self.time_diff(match_date,self.most_recent)
#         if diff>=12:
#             self.last_year = np.zeros([12,4])
#         elif diff>0:
#             self.last_year[diff:] = self.last_year[:12-diff]; self.last_year[:diff] = 0
#         self.last_year[0] = self.last_year[0]+match_stats
#         self.most_recent = match_date