In [1]:
import os
import sys
from datetime import datetime as dt
from collections import defaultdict
from helper_functions import *
from data_functions import *

pd.options.mode.chained_assignment = None
sys.path.insert(0, '{}/sackmann'.format(os.getcwd()))
START_YEAR = 2010
TOUR = 'atp'
RET_STRINGS = ('ABN','DEF','In Progress','RET','W/O',' RET',' W/O','nan','walkover')
ABD_STRINGS = ('abandoned','ABN','ABD','DEF','def','unfinished','Walkover')
DATE = dt.now().strftime(('%m/%d/%Y'))

print 'main'
print 'currently here: ', os.getcwd()
match_df = concat_data(1968, 2018, TOUR)
match_df = format_match_df(match_df,TOUR,ret_strings=RET_STRINGS,abd_strings=ABD_STRINGS)
start_ind = match_df[match_df['match_year']>=START_YEAR-1].index[0]

main
currently here:  /Users/Work/tennis_probs_live


In [93]:
'''
tracking object for common-opponent ratings
stores past year of performance against opponents
'''
class commop_stats_52():
    def __init__(self, date):
        self.last_year = defaultdict(lambda: np.zeros([12, 4]))
        self.most_recent = date;
        
    def time_diff(self, new_date, old_date):
        return 12*(new_date[0]-old_date[0])+(new_date[1]-old_date[1])

    # TODO: update data for every single opponent, just the one being played (otherwise data )
    def update_player_stats(self, match_date, opponent_name):
        diff = self.time_diff(match_date, self.most_recent)
        if diff>=12:
            self.last_year[opponent_name] = np.zeros([12,4])
        elif diff>0:
            self.last_year[opponent_name][diff:] = self.last_year[opponent_name][:12-diff]
            self.last_year[opponent_name][:diff] = 0
    
    def update_player_histories(self, match_date, opponent_name):
        for opp_name in np.union1d(opponent_name, self.last_year.keys()):
            self.update_player_stats(match_date, opp_name)

        self.most_recent = match_date

    def update(self, match_date, match_stats, opponent_name):
        self.update_player_histories(match_date, opponent_name)
        self.last_year[opponent_name][0] = self.last_year[opponent_name][0]+match_stats
    
'''
generate delta between two players relative to shared opponent
delta_i^AB = (spw(A, C_i) - (1 - rpw(A, C_i))) - (spw(B, C_i) - (1 - rpw(B, C_i)))
'''
def generate_delta(p1_stats, p2_stats):
    p1_s_pct, p1_r_pct = p1_stats[0]/float(p1_stats[1]), p1_stats[2]/float(p1_stats[3])
    p2_s_pct, p2_r_pct = p2_stats[0]/float(p2_stats[1]), p2_stats[2]/float(p2_stats[3])
    return (p1_s_pct - (1 - p1_r_pct)) - (p2_s_pct - (1 - p2_r_pct))

'''
return true if total service/return points both greater than zero
'''
def has_stats(last_year_stats):
    return last_year_stats[1] > 0 and last_year_stats[3] > 0

'''
get opponents who have played a match in the past 12 months (more than 0 points)
'''
def get_opponents(player_d, player_name):
    historical_opponents = player_d[player_name].last_year.keys()
    return [opp for opp in historical_opponents if has_stats(np.sum(player_d[player_name].last_year[opp], axis=0))]

'''
compute serve/return parameters, given their common opponent history
'''
def generate_commop_params(player_d, player1, player2):
    p1_opponents, p2_opponents = get_opponents(player_d, player1), get_opponents(player_d, player2)
    common_opponents = np.intersect1d(p1_opponents, p2_opponents)
    if len(common_opponents) == 0:
        return .6, .4
    
    match_deltas = np.zeros(len(common_opponents))
    for i, comm_op in enumerate(common_opponents):
        p1_match_stats = np.sum(player_d[player1].last_year[comm_op], axis=0)
        p2_match_stats = np.sum(player_d[player2].last_year[comm_op], axis=0)
        comm_op_delta = generate_delta(p1_match_stats, p2_match_stats)
        match_deltas[i] = comm_op_delta
        if np.isnan(comm_op_delta):
            print 'nan here: ', p1_match_stats, p2_match_stats, comm_op
        
    overall_delta = np.mean(match_deltas)
    if np.isnan(overall_delta):
        print 'nan, match_deltas: ', match_deltas
    return (.6 + overall_delta/2), (.4 + overall_delta/2)

'''
collect 12-month s/r common-opponent performance by player (TODO: get rid of start_ind as input and filter before
passing to this function)
'''
def generate_52_commop_stats(df, start_ind):
    player_d = {}
    start_date = (df['match_year'][start_ind], df['match_month'][start_ind])
    # array w/ 2x1 arrays for each player's 12-month serve/return performance
    match_52_stats = np.zeros([2,len(df), 2])
    
    w_l = ['w','l']
    for i, row in df.loc[start_ind:].iterrows():
        date = row['match_year'], row['match_month']

        for k, label in enumerate(w_l):
            opponent_name = row[w_l[1-k]+'_name']
            if row[label+'_name'] not in player_d:
                player_d[row[label+'_name']] = commop_stats_52(date)
            
            # can update player objs before calculating params since players cannot share
            # each other as common opponents
            if row[label+'_swon']==row[label+'_swon'] and row[label+'_svpt']==row[label+'_svpt']:    
                match_stats = (row[label+'_swon'],row[label+'_svpt'],row[w_l[1-k]+'_svpt']-\
                                row[w_l[1-k]+'_swon'],row[w_l[1-k]+'_svpt'])
                player_d[row[label+'_name']].update(date, match_stats, opponent_name)
        
        # can compute common-opponent stats after current match stats inputted
        # (since this won't affect common opponents)
        w_s_pct, w_r_pct = generate_commop_params(player_d, row['w_name'], row['l_name'])
        
        match_52_stats[0][i] = [w_s_pct, w_r_pct]
        match_52_stats[1][i] = [1 - w_r_pct, 1 - w_s_pct]
    
    for k,label in enumerate(w_l):
        df[label+'_52_commop_s_pct'] = match_52_stats[k][:,0]
        df[label+'_52_commop_r_pct'] = match_52_stats[k][:,1]

    return df

In [None]:
'''
'''

In [156]:
df = match_df
player_d = {}
start_date = (df['match_year'][start_ind], df['match_month'][start_ind])
# array w/ 2x1 arrays for each player's 12-month serve/return performance
match_52_stats = np.zeros([2,len(df), 2])

w_l = ['w','l']
for i, row in df.loc[start_ind:].iterrows():
    date = row['match_year'], row['match_month']

    for k, label in enumerate(w_l):
        opponent_name = row[w_l[1-k]+'_name']
        if row[label+'_name'] not in player_d:
            player_d[row[label+'_name']] = commop_stats_52(date)

        # can update player objs before calculating params since players cannot share
        # each other as common opponents
        if row[label+'_swon']==row[label+'_swon'] and row[label+'_svpt']==row[label+'_svpt']:    
            match_stats = (row[label+'_swon'],row[label+'_svpt'],row[w_l[1-k]+'_svpt']-\
                            row[w_l[1-k]+'_swon'],row[w_l[1-k]+'_svpt'])
            player_d[row[label+'_name']].update(date, match_stats, opponent_name)

    # can compute common-opponent stats after current match stats inputted
    # (since this won't affect common opponents)
    w_s_pct, w_r_pct = generate_commop_params(player_d, row['w_name'], row['l_name'])
    

    if np.isnan(w_s_pct):
        break

    match_52_stats[0][i] = [w_s_pct, w_r_pct]
    match_52_stats[1][i] = [1 - w_r_pct, 1 - w_s_pct]
    
    for k,label in enumerate(w_l):
        df[label+'_52_commop_s_pct'] = match_52_stats[k][:,0]
        df[label+'_52_commop_r_pct'] = match_52_stats[k][:,1]

In [157]:
df.loc[160000:]

Unnamed: 0,tny_id,tny_name,surface,draw_size,tny_date,match_num,w_name,l_name,score,best_of,...,w_swon,l_swon,w_rwon,l_rwon,w_rpt,l_rpt,w_52_commop_s_pct,w_52_commop_r_pct,l_52_commop_s_pct,l_52_commop_r_pct
160000,2017-M007,Miami Masters,Hard,128,2017-03-20,275,Jack Sock,Jiri Vesely,6-3 7-6,3,...,50.0,41.0,28.0,26.0,69.0,76.0,0.617791,0.417791,0.582209,0.382209
160001,2017-M007,Miami Masters,Hard,128,2017-03-20,276,Nicolas Mahut,Guido Pella,6-4 6-3,3,...,42.0,33.0,22.0,17.0,55.0,59.0,0.621122,0.421122,0.578878,0.378878
160002,2017-M007,Miami Masters,Hard,128,2017-03-20,277,Rafael Nadal,Philipp Kohlschreiber,0-6 6-2 6-3,3,...,43.0,43.0,30.0,16.0,73.0,59.0,0.610240,0.410240,0.589760,0.389760
160003,2017-M007,Miami Masters,Hard,128,2017-03-20,278,Adrian Mannarino,Borna Coric,6-4 2-6 7-6,3,...,58.0,63.0,43.0,34.0,106.0,92.0,0.554781,0.354781,0.645219,0.445219
160004,2017-M007,Miami Masters,Hard,128,2017-03-20,279,Tomas Berdych,Gilles Muller,6-3 6-4,3,...,42.0,38.0,23.0,13.0,61.0,55.0,0.574263,0.374263,0.625737,0.425737
160005,2017-M007,Miami Masters,Hard,128,2017-03-20,280,Roberto Bautista,Sam Querrey,3-6 6-2 6-3,3,...,46.0,42.0,41.0,34.0,83.0,80.0,0.610431,0.410431,0.589569,0.389569
160006,2017-M007,Miami Masters,Hard,128,2017-03-20,281,Roger Federer,Juan Martin,6-3 6-4,3,...,44.0,36.0,26.0,16.0,62.0,60.0,0.637119,0.437119,0.562881,0.362881
160007,2017-M007,Miami Masters,Hard,128,2017-03-20,282,David Goffin,Diego Sebastian,4-6 6-3 7-5,3,...,74.0,56.0,40.0,47.0,96.0,121.0,0.603738,0.403738,0.596262,0.396262
160008,2017-M007,Miami Masters,Hard,128,2017-03-20,283,Nick Kyrgios,Ivo Karlovic,6-4 6-7 7-6,3,...,79.0,81.0,37.0,16.0,118.0,95.0,0.649305,0.449305,0.550695,0.350695
160009,2017-M007,Miami Masters,Hard,128,2017-03-20,284,Alexander Zverev,John Isner,6-7 7-6 7-6,3,...,87.0,88.0,28.0,25.0,116.0,112.0,0.596479,0.396479,0.603521,0.403521


In [111]:
player1, player2 = row['w_name'], row['l_name']

In [113]:
p1_opponents, p2_opponents = get_opponents(player_d, player1), get_opponents(player_d, player2)
common_opponents = np.intersect1d(p1_opponents, p2_opponents)

In [118]:
common_opponents

array(['Andrey Golubev', 'Andy Murray', 'Juan Martin', 'Marin Cilic',
       'Nicolas Almagro', 'Radek Stepanek', 'Teymuraz Gabashvili',
       'Tomas Berdych'], dtype='|S19')

In [128]:
player_d['Ivan Ljubicic'].last_year['Teymuraz Gabashvili']

array([[40., 56.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.]])

In [122]:
player1, player2

('Ivan Ljubicic', 'Florian Mayer')

In [123]:
# TODO: find match between gabashvili and ljubicic where serve data entered was screwy

In [131]:
for i, row in df.loc[start_ind:].iterrows():
    if row['w_name'] == 'Ivan Ljubicic' and row['l_name'] == 'Teymuraz Gabashvili':
        print i
    if row['l_name'] == 'Ivan Ljubicic' and row['w_name'] == 'Teymuraz Gabashvili':
        print i


137420
144574


In [135]:
df[df['l_swon'] == 0]

Unnamed: 0,tny_id,tny_name,surface,draw_size,tny_date,match_num,w_name,l_name,score,best_of,...,w_swon,l_swon,w_rwon,l_rwon,w_rpt,l_rpt,w_52_commop_s_pct,w_52_commop_r_pct,l_52_commop_s_pct,l_52_commop_r_pct
75816,1991-402,Memphis,Hard,48,1991-02-18,46,Michael Stich,Michael Chang,6-2 6-2,3,...,35.0,0.0,0.0,13.0,0.0,48.0,0.0,0.0,0.0,0.0
80278,1992-308,Munich,Clay,32,1992-04-27,6,Rodolphe Gilbert,Radomir Vasek,6-2 6-4,3,...,38.0,0.0,0.0,23.0,0.0,61.0,0.0,0.0,0.0,0.0
80653,1992-520,Roland Garros,Clay,128,1992-05-25,88,Michael Stich,Lionel Roux,6-1 6-4 6-4,5,...,56.0,0.0,0.0,15.0,0.0,71.0,0.0,0.0,0.0,0.0
81303,1992-319,Kitzbuhel,Clay,48,1992-07-20,4,Christian Saceanu,Julian Knowle,6-4 5-7 7-6,3,...,69.0,0.0,0.0,61.0,0.0,130.0,0.0,0.0,0.0,0.0
81418,1992-475,San Marino,Clay,32,1992-07-27,10,Ronald Agenor,Yevgeny Kafelnikov,6-1 6-2,3,...,39.0,0.0,0.0,20.0,0.0,59.0,0.0,0.0,0.0,0.0
81867,1992-323,Bordeaux,Clay,32,1992-09-14,6,Marcos Aurelio,Thierry Van,6-4 6-4,3,...,39.0,0.0,0.0,21.0,0.0,60.0,0.0,0.0,0.0,0.0
81908,1992-322,Cologne,Clay,32,1992-09-14,17,Thomas Muster,Juan Gisbert,4-6 7-6 7-6,3,...,73.0,0.0,0.0,39.0,0.0,112.0,0.0,0.0,0.0,0.0
82070,1992-367,Athens,Clay,32,1992-10-05,16,Francisco Clavet,Talito Corrales,6-1 6-2,3,...,39.0,0.0,0.0,26.0,0.0,65.0,0.0,0.0,0.0,0.0
82312,1992-457,Taipei,Carpet,32,1992-10-19,16,Kenneth Carlsen,David Adams,3-6 6-3 7-5,3,...,63.0,0.0,0.0,37.0,0.0,100.0,0.0,0.0,0.0,0.0
82557,1992-438,Moscow,Carpet,32,1992-11-09,17,Karel Novacek,Markus Zoecke,6-3 6-1,3,...,40.0,0.0,0.0,18.0,0.0,58.0,0.0,0.0,0.0,0.0


In [133]:
df.loc[144574]

tny_id                          2011-747
tny_name                         Beijing
surface                             Hard
draw_size                             32
tny_date                      2011-10-03
match_num                              9
w_name                     Ivan Ljubicic
l_name               Teymuraz Gabashvili
score                            7-5 7-5
best_of                                3
w_svpt                                56
l_svpt                                 0
is_gs                              False
match_year                          2011
match_month                           10
w_swon                                40
l_swon                                 0
w_rwon                                 0
l_rwon                                16
w_rpt                                  0
l_rpt                                 56
w_52_commop_s_pct                    NaN
w_52_commop_r_pct                    NaN
l_52_commop_s_pct                    NaN
l_52_commop_r_pc

In [53]:
w_s_pct, w_r_pct = generate_commop_params(player_d, row['w_name'], row['l_name'])

nan here:  [0. 0. 1. 2.] [43. 74. 41. 97.] Jordan Thompson
nan, match_deltas:  [-0.08332989 -0.01247386  0.00039021  0.03735034  0.0761457  -0.09021291
  0.00993624  0.00370821 -0.0551944   0.09571941  0.05181814 -0.02472418
  0.07675907  0.12684489 -0.05605812 -0.00352716  0.12346732 -0.09972306
 -0.00410253  0.21806341  0.16377006 -0.18446651  0.08823551 -0.12326006
 -0.25626487 -0.00698242 -0.09979252  0.16011968  0.00324511  0.0643906
  0.16426306 -0.05742801  0.31046373  0.08317408  0.01485162 -0.19306608
 -0.13083931  0.06109264  0.04300701  0.10170723 -0.01176189  0.00962053
 -0.02791416         nan -0.11688372 -0.09155797 -0.07983483  0.21649759
  0.2301672   0.05324866  0.11387033  0.13945053 -0.36163992 -0.08863724
 -0.0322549   0.15196391  0.09664568 -0.23017321  0.26216015  0.12991109
  0.06944132  0.10143045 -0.04979518  0.11737188  0.01623323 -0.15840729
  0.17033623  0.01262992  0.17547198 -0.00269644  0.3552986   0.01514376
 -0.04561464 -0.12740037  0.21226451 -0.054922

  


In [65]:
player_d[row['l_name']].player_most_recent['Daniel Evans']

(2009, 1)

In [56]:
p1_opponents, p2_opponents = players_stats[row['w_name']].last_year.keys(), players_stats[row['l_name']].last_year.keys()
common_opponents = np.intersect1d(p1_opponents, p2_opponents)

In [58]:
len(common_opponents)

96

In [14]:
generate_commop_params(players_stats, row['w_name'], row['l_name'])

nan, match_deltas:  [-0.17208825  0.12441893         nan]


  


(nan, nan)

In [None]:
player_d = {}
date = [2015, 4]
player_d['p1'] = commop_stats_52(date)
player_d['p1'].update(date, [55, 90, 30, 90], 'p3')
player_d['p2'] = commop_stats_52(date)
player_d['p2'].update(date, [60, 90, 35, 90], 'p3')
player_d['p3'] = commop_stats_52(date)
assert(generate_commop_params(player_d, 'p1', 'p2')) == (0.5444444444444445, 0.6555555555555554)
assert(generate_commop_params(player_d, 'p1', 'p3')) == (.6, .6)

In [None]:
# '''
# tracking object for common-opponent ratings
# stores past year of performance against opponents
# '''
# class commop_stats_52():
#     def __init__(self, date):
#         self.last_year = defaultdict(lambda: np.zeros([12, 4]))
#         self.player_most_recent = defaultdict(lambda: date);
        
#     def time_diff(self, new_date, old_date):
#         return 12*(new_date[0]-old_date[0])+(new_date[1]-old_date[1])

#     def set_month(self, match_date, opponent_name):
#         diff = self.time_diff(match_date, self.player_most_recent[opponent_name])
#         if diff>=12:
#             self.last_year = np.zeros([12,4])
#         elif diff>0:
#             self.last_year[opponent_name][diff:] = self.last_year[opponent_name][:12-diff]
#             self.last_year[opponent_name][:diff] = 0
#         self.player_most_recent[opponent_name] = match_date

#     def update(self, match_date, match_stats, opponent_name):
#         self.set_month(match_date, opponent_name)
#         self.last_year[opponent_name][0] = self.last_year[opponent_name][0]+match_stats