In [14]:
import pandas as pd
import tqdm
import datetime
import numpy as np
from dateutil.relativedelta import relativedelta

In [3]:
df = pd.DataFrame()

for i in range(2001,2020):
    filename = f'atp_matches_{i}.csv'
    curr = pd.read_csv(filename)
    df = pd.concat([df, curr])

df = df.reset_index()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [4]:
print(df.shape)
print(df.columns)

(58192, 50)
Index(['index', 'best_of', 'draw_size', 'l_1stIn', 'l_1stWon', 'l_2ndWon',
       'l_SvGms', 'l_ace', 'l_bpFaced', 'l_bpSaved', 'l_df', 'l_svpt',
       'loser_age', 'loser_entry', 'loser_hand', 'loser_ht', 'loser_id',
       'loser_ioc', 'loser_name', 'loser_rank', 'loser_rank_points',
       'loser_seed', 'match_num', 'minutes', 'round', 'score', 'surface',
       'tourney_date', 'tourney_id', 'tourney_level', 'tourney_name',
       'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_SvGms', 'w_ace', 'w_bpFaced',
       'w_bpSaved', 'w_df', 'w_svpt', 'winner_age', 'winner_entry',
       'winner_hand', 'winner_ht', 'winner_id', 'winner_ioc', 'winner_name',
       'winner_rank', 'winner_rank_points', 'winner_seed'],
      dtype='object')


Preprocessing
Keep relevant features
Clean and categorize data


In [5]:
print(df.isna().sum())

# Several matches do not have rankings for players.  I will assume these players were amateurs given wildcards or 
# or other low ranked players and assign a rank of 1000

index                     0
best_of                   0
draw_size             55411
l_1stIn                5353
l_1stWon               5353
l_2ndWon               5353
l_SvGms                5353
l_ace                  5353
l_bpFaced              5353
l_bpSaved              5353
l_df                   5353
l_svpt                 5353
loser_age                31
loser_entry           46747
loser_hand               46
loser_ht               7981
loser_id                  0
loser_ioc                 0
loser_name                0
loser_rank             1214
loser_rank_points      1214
loser_seed            44894
match_num                 0
minutes                6656
round                     0
score                     1
surface                 118
tourney_date              0
tourney_id                0
tourney_level             0
tourney_name              0
w_1stIn                5353
w_1stWon               5353
w_2ndWon               5353
w_SvGms                5353
w_ace               

In [6]:
#handle missing values for player specific data
df['loser_age'].fillna((df['loser_age'].mean()), inplace=True)
df['loser_ht'].fillna((df['loser_ht'].mean()), inplace=True)
df['loser_hand'].fillna('R', inplace=True)
df['loser_entry'].fillna('REG', inplace=True)
df['loser_rank'].fillna(1000, inplace=True)

df['winner_age'].fillna((df['loser_age'].mean()), inplace=True)
df['winner_ht'].fillna((df['winner_ht'].mean()), inplace=True)
df['winner_hand'].fillna('R', inplace=True)
df['winner_entry'].fillna('REG', inplace=True)
df['winner_rank'].fillna(1000, inplace=True)

In [7]:
#remove retirement and walkover matches for more accurate data
df = df[df['score'] != 'W/O']
df = df[~df.score.str.contains("RET", na=False)]
#Remove Davis Cup matches due to lack of point level data
df = df[df['tourney_level'] != 'D']

In [8]:
drop_cols = ['winner_rank_points', 'winner_seed', 'minutes', 'draw_size', 'loser_rank_points', 'loser_seed']
df = df.drop(drop_cols, axis = 1)
df = df.dropna()

In [9]:
df['tourney_date'] = df['tourney_date'].astype(str)
df['tourney_date'] = pd.to_datetime(df['tourney_date'])

In [112]:
#add serve stats
# df['w_first_serve_percentage'] = df['w_1stIn'] / df['w_svpt']
# df['w_ace_percentage'] = df['w_ace'] / df['w_svpt']
# df['w_break_point_defend_rate'] = df['w_bpSaved'] / df['w_bpFaced']
# df['w_total_serve_pt_win_percentage'] = (df['w_1stWon'] + df['w_2ndWon']) / df['w_svpt']

# df['l_first_serve_percentage'] = df['l_1stIn'] / df['l_svpt']
# df['l_ace_percentage'] = df['l_ace'] / df['l_svpt']
# df['l_break_point_defend_rate'] = df['l_bpSaved'] / df['l_bpFaced']
# df['l_total_serve_pt_win_percentage'] = (df['l_1stWon'] + df['l_2ndWon']) / df['l_svpt']

In [114]:
#add return stats
# df['w_break_points_conversion_percentage'] = (df['l_bpFaced'] - df['l_bpSaved']) / df['l_bpFaced']
# df['w_return_points_won_percentage'] = (df['l_svpt'] - (df['l_1stWon'] + df['l_2ndWon']))/ df['l_svpt']

# df['l_break_points_conversion_percentage'] = (df['w_bpFaced'] - df['w_bpSaved']) / df['w_bpFaced']
# df['l_return_points_won_percentage'] = (df['l_svpt'] - (df['w_1stWon'] + df['w_2ndWon']))/ df['w_svpt']

In [10]:
df.shape

(50395, 44)

In [11]:
df.describe()

Unnamed: 0,index,best_of,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_ace,l_bpFaced,l_bpSaved,l_df,...,w_SvGms,w_ace,w_bpFaced,w_bpSaved,w_df,w_svpt,winner_age,winner_ht,winner_id,winner_rank
count,50395.0,50395.0,50395.0,50395.0,50395.0,50395.0,50395.0,50395.0,50395.0,50395.0,...,50395.0,50395.0,50395.0,50395.0,50395.0,50395.0,50395.0,50395.0,50395.0,50395.0
mean,1550.303205,3.369362,48.767775,32.594444,15.171584,12.405973,5.170751,8.653517,4.788729,3.384383,...,12.613235,7.006092,5.052962,3.475801,2.657049,78.553309,26.481146,185.862473,104869.506876,58.672468
std,889.305383,0.776085,18.789162,14.1213,7.163326,4.105031,4.899582,4.089673,3.244931,2.50373,...,4.092058,5.571386,3.99511,3.05621,2.263673,28.416857,3.832609,6.748714,6370.736413,74.94373
min,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,15.82,163.0,100644.0,1.0
25%,798.0,3.0,35.0,22.0,10.0,9.0,2.0,6.0,2.0,2.0,...,10.0,3.0,2.0,1.0,1.0,57.0,23.64,183.0,103484.0,16.0
50%,1557.0,3.0,45.0,30.0,14.0,11.0,4.0,8.0,4.0,3.0,...,11.0,6.0,4.0,3.0,2.0,73.0,26.35,185.0,104198.0,41.0
75%,2291.0,3.0,59.0,40.0,19.0,15.0,7.0,11.0,7.0,5.0,...,15.0,9.0,7.0,5.0,4.0,94.0,29.12,190.0,104918.0,76.5
max,3298.0,5.0,328.0,284.0,101.0,91.0,103.0,34.0,25.0,23.0,...,90.0,113.0,30.0,24.0,26.0,491.0,40.62423,208.0,206173.0,1890.0


In [103]:
df[df['l_svpt'] != 489].describe()

Unnamed: 0,best_of,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_ace,l_bpFaced,l_bpSaved,l_df,l_svpt,...,w_SvGms,w_ace,w_bpFaced,w_bpSaved,w_df,w_svpt,winner_age,winner_ht,winner_id,winner_rank
count,50394.0,50394.0,50394.0,50394.0,50394.0,50394.0,50394.0,50394.0,50394.0,50394.0,...,50394.0,50394.0,50394.0,50394.0,50394.0,50394.0,50394.0,50394.0,50394.0,50394.0
mean,3.36933,48.762234,32.589455,15.169881,12.404413,5.16881,8.653411,4.788586,3.384034,81.566853,...,12.6117,7.003989,5.053002,3.47583,2.656904,78.545124,26.481173,185.862074,104869.513315,58.673255
std,0.776059,18.748129,14.07696,7.153186,4.090114,4.880211,4.089644,3.244804,2.502525,28.385672,...,4.077552,5.551398,3.995139,3.056234,2.263459,28.35768,3.832642,6.748185,6370.799459,74.944266
min,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,15.82,163.0,100644.0,1.0
25%,3.0,35.0,22.0,10.0,9.0,2.0,6.0,2.0,2.0,60.0,...,10.0,3.0,2.0,1.0,1.0,57.0,23.64,183.0,103484.0,16.0
50%,3.0,45.0,30.0,14.0,11.0,4.0,8.0,4.0,3.0,76.0,...,11.0,6.0,4.0,3.0,2.0,73.0,26.35,185.0,104198.0,41.0
75%,3.0,59.0,40.0,19.0,15.0,7.0,11.0,7.0,5.0,97.0,...,15.0,9.0,7.0,5.0,4.0,94.0,29.12,190.0,104918.0,76.75
max,5.0,218.0,171.0,63.0,50.0,67.0,34.0,25.0,23.0,291.0,...,49.0,75.0,30.0,24.0,26.0,278.0,40.62423,208.0,206173.0,1890.0


Adding Features
- Need to use previous player performance to predict matchups
1. Previous Career High Rank


In [74]:
#TODO fix print order for loser first case

def addHeadToHead(tdf):
    matchtracker = {}
    tdf['head_to_head'] = "asdf"
    
    for id, row in tqdm.tqdm_notebook(tdf.iterrows(), total=tdf.shape[0]):
        winner = row['winner_id']
        loser = row['loser_id']
        
        if (winner, loser) in matchtracker:
            tdf.loc[id, 'winner_wins_vs_opp'] = matchtracker[(winner, loser)][0]
            tdf.loc[id, 'loser_wins_vs_opp'] = matchtracker[(winner, loser)][1]
            matchtracker[(winner, loser)][0]+=1
        elif (loser, winner) in matchtracker:
            tdf.loc[id, 'winner_wins_vs_opp'] = matchtracker[(loser, winner)][1]
            tdf.loc[id, 'loser_wins_vs_opp'] = matchtracker[(loser, winner)][0]
            matchtracker[(loser, winner)][1]+=1
        else:
            matchtracker[(winner, loser)] = [1,0]
            tdf.loc[id, 'winner_wins_vs_opp'] = 0
            tdf.loc[id, 'loser_wins_vs_opp'] = 0
        
addHeadToHead(df)
    

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=50395.0), HTML(value='')))









 11%|█         | 5297/50395 [00:22<00:31, 1411.80it/s][A[A[A[A[A[A[A[A




In [54]:
# Window stats
# look at average performance metrics for past year


def addPlayerYearRecords(tdf):
    for (idx, date, winner_id, loser_id, surface) in tqdm.tqdm_notebook(tdf[['tourney_date', 'winner_id', 'loser_id', 'surface']].itertuples(), total=tdf.shape[0], leave=True):
        
        one_year_ago = date - relativedelta(years=1)
        one_year_df =  tdf[(tdf['tourney_date'] > one_year_ago) & (tdf['tourney_date'] < date)]
        
        tdf.loc[idx, 'w_1yr_wins'] = len(one_year_df[(one_year_df['winner_id'].values == winner_id)]['winner_id'])
        tdf.loc[idx, 'w_1yr_losses'] = len(one_year_df[(one_year_df['loser_id'].values == winner_id)]['winner_id'])
        
        tdf.loc[idx, 'l_1yr_wins'] = len(one_year_df[(one_year_df['winner_id'].values == loser_id)]['winner_id'])
        tdf.loc[idx, 'l_1yr_losses'] = len(one_year_df[(one_year_df['loser_id'].values == loser_id)]['winner_id'])
        
        tdf.loc[idx, 'w_1yr_wins_on_surface'] = len(one_year_df[(one_year_df['surface'].values ==surface) & (one_year_df['winner_id'].values == winner_id)]['winner_id'])
        tdf.loc[idx, 'w_1yr_losses_on_surface'] = len(one_year_df[(one_year_df['surface'].values ==surface) & (one_year_df['loser_id'].values == winner_id)]['winner_id'])
        
        tdf.loc[idx, 'l_1yr_wins_on_surface'] = len(one_year_df[(one_year_df['surface'].values ==surface) & (one_year_df['winner_id'].values == loser_id)]['winner_id'])
        tdf.loc[idx, 'l_1yr_losses_on_surface'] = len(one_year_df[(one_year_df['surface'].values ==surface) & (one_year_df['loser_id'].values == loser_id)]['winner_id'])
        
addPlayerYearRecords(df)

# def addWindowStats(tdf):
#     tdf['winner_1yr_wins'] = -1
#     tdf['winner_1yr_loses'] = -1
#     for idx, row in tqdm.tqdm(tdf.iterrows(), total=tdf.shape[0]):
#         one_year_ago = (row['tourney_date'] - relativedelta(years=1))
#         #print(one_year_ago)
#         tdf.loc[idx, 'winner_1yr_wins'] = tdf[(tdf['winner_id'] == row['winner_id']) & (tdf['tourney_date'] > one_year_ago) & (tdf['tourney_date'] < row['tourney_date'])]['winner_id'].count()
#         #print(test)
        

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=50395.0), HTML(value='')))




In [71]:
df['w_year_win_pct'] = df['w_1yr_wins'] / (df['w_1yr_wins'] + df['w_1yr_loses'])
df['w_year_surface_win_pct'] = df['w_1yr_wins_on_surface'] / (df['w_1yr_wins_on_surface'] + df['w_1yr_loses_on_surface'])

df['l_year_win_pct'] = df['l_1yr_wins'] / (df['l_1yr_wins'] + df['l_1yr_loses'])
df['l_year_surface_win_pct'] = df['l_1yr_wins_on_surface'] / (df['l_1yr_wins_on_surface'] + df['l_1yr_loses_on_surface'])

In [21]:
def addPlayerYearServe(tdf):
    tdf['winner_1yr_wins'] = -1
    tdf['winner_1yr_losses'] = -1
    for idx, row in tqdm.tqdm(tdf.iterrows(), total=tdf.shape[0]):
        
        one_year_ago = row['tourney_date'] - relativedelta(years=1)
        
        #compute serve stats for the winner's past year
        wwin_df = tdf[(tdf['winner_id'] == row['winner_id']) & (tdf['tourney_date'] > one_year_ago) & (tdf['tourney_date'] < row['tourney_date'])]
        wlost_df = tdf[(tdf['loser_id'] == row['winner_id']) & (tdf['tourney_date'] > one_year_ago) & (tdf['tourney_date'] < row['tourney_date'])]
        
        winner_firstin = wwin_df.w_1stIn.sum() + wlost_df.l_1stIn.sum()
        winner_svpt = wwin_df.w_svpt.sum() + wlost_df.l_svpt.sum()
        winner_ace = wwin_df.w_ace.sum() + wlost_df.l_ace.sum()
        winner_bp_faced = wwin_df.w_bpFaced.sum() + wlost_df.l_bpFaced.sum()
        winner_bp_saved = wwin_df.w_bpSaved.sum() + wlost_df.l_bpSaved.sum()
        winner_1stWon = wwin_df.w_1stWon.sum() + wlost_df.l_1stWon.sum()
        winner_2ndWon = wwin_df.w_2ndWon.sum() + wlost_df.l_2ndWon.sum()
        
        tdf.loc[idx, 'w_1yr_serve_pct'] = winner_firstin / winner_svpt
        tdf.loc[idx, 'w_1yr_ace_pct'] = winner_ace / winner_svpt
        tdf.loc[idx, 'w_1yr_bp_save_pct'] = winner_bp_saved / winner_bp_faced
        tdf.loc[idx, 'w_1yr_total_serve_pts_won_pct'] = (winner_1stWon + winner_2ndWon)/ winner_svpt
        
        #compute serve stats for the loser's last year
        lwin_df = tdf[(tdf['winner_id'] == row['loser_id']) & (tdf['tourney_date'] > one_year_ago) & (tdf['tourney_date'] < row['tourney_date'])]
        llost_df = tdf[(tdf['loser_id'] == row['loser_id']) & (tdf['tourney_date'] > one_year_ago) & (tdf['tourney_date'] < row['tourney_date'])]
        
        loser_firstin = lwin_df.w_1stIn.sum() + llost_df.l_1stIn.sum()
        loser_svpt = lwin_df.w_svpt.sum() + llost_df.l_svpt.sum()
        loser_ace = lwin_df.w_ace.sum() + llost_df.l_ace.sum()
        loser_bp_faced = lwin_df.w_bpFaced.sum() + llost_df.l_bpFaced.sum()
        loser_bp_saved = lwin_df.w_bpSaved.sum() + llost_df.l_bpSaved.sum()
        loser_1stWon = lwin_df.w_1stWon.sum() + llost_df.l_1stWon.sum()
        loser_2ndWon = lwin_df.w_2ndWon.sum() + llost_df.l_2ndWon.sum()
        
        tdf.loc[idx, 'l_1yr_serve_pct'] = loser_firstin / loser_svpt
        tdf.loc[idx, 'l_1yr_ace_pct'] = loser_ace / loser_svpt
        tdf.loc[idx, 'l_1yr_bp_save_pct'] = loser_bp_saved / loser_bp_faced
        tdf.loc[idx, 'l_1yr_total_serve_pts_won_pct'] = (loser_1stWon + loser_2ndWon)/ loser_svpt
        
addPlayerYearServe(df)

100%|██████████| 50395/50395 [09:37<00:00, 87.20it/s]


In [64]:
test_df = df[df['winner_name'] == 'Roger Federer']

def addCareerHighRank(tdf):
    tdf['winner_high_rank'] = 1001
    tdf['loser_high_rank'] = 1001
    
    col_loc = tdf.columns.get_loc("winner_high_rank")
    
    for id, row in tqdm.tqdm(tdf.iterrows(), total=tdf.shape[0]):
        curr_df = tdf.iloc[0:id]
        curr_id = row['winner_id']
        curr_df = curr_df[curr_df['winner_id'] == curr_id]
        max_winner_rank = curr_df['winner_rank'].max()
        tdf.iat[id, col_loc] = max_winner_rank
        
for id, row in tqdm.tqdm(tdf.iterrows(), total=tdf.shape[0]):

SyntaxError: unexpected EOF while parsing (<ipython-input-64-e6cac5ec4ae6>, line 16)

In [29]:
df[['winner_name', 'loser_name', 'w_1yr_serve_pct', 'w_1yr_ace_pct', 'w_1yr_bp_save_pct', 'w_1yr_total_serve_pts_won_pct', 'l_1yr_serve_pct', 'l_1yr_ace_pct', 'l_1yr_bp_save_pct', 'w_1yr_total_serve_pts_won_pct']]

Unnamed: 0,winner_name,loser_name,w_1yr_serve_pct,w_1yr_ace_pct,w_1yr_bp_save_pct,w_1yr_total_serve_pts_won_pct,l_1yr_serve_pct,l_1yr_ace_pct,l_1yr_bp_save_pct,l_1yr_total_serve_pts_won_pct
0,Lleyton Hewitt,Wayne Arthurs,,,,,,,,
1,Bjorn Phau,Todd Woodbridge,,,,,,,,
2,Xavier Malisse,Chris Woodruff,,,,,,,,
3,Tommy Haas,Luke Smith,,,,,,,,
4,Jason Stoltenberg,Sebastien Grosjean,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
58015,Dominic Thiem,Novak Djokovic,0.655112,0.061603,0.613208,0.670126,0.658496,0.077067,0.653137,0.702119
58016,Novak Djokovic,Matteo Berrettini,0.658496,0.077067,0.653137,0.702119,0.609578,0.114245,0.701220,0.689123
58017,Dominic Thiem,Roger Federer,0.655112,0.061603,0.613208,0.670126,0.642781,0.095401,0.708861,0.717219
58018,Roger Federer,Matteo Berrettini,0.642781,0.095401,0.708861,0.717219,0.609578,0.114245,0.701220,0.689123


In [77]:
df[(df['winner_name'] == 'Roger Federer') | (df['loser_name'] == 'Roger Federer')] [['winner_name', 'loser_name', 'winner_wins_vs_opp', 'loser_wins_vs_opp'] ]

Unnamed: 0,winner_name,loser_name,winner_wins_vs_opp,loser_wins_vs_opp
135,Roger Federer,Wayne Ferreira,0.0,0.0
145,Roger Federer,Marc Rosset,0.0,0.0
150,Sebastien Grosjean,Roger Federer,0.0,0.0
160,Roger Federer,Arnaud Di Pasquale,0.0,0.0
221,Roger Federer,Nicolas Escude,0.0,0.0
...,...,...,...,...
57911,Roger Federer,Peter Gojowczyk,2.0,0.0
58006,Stefanos Tsitsipas,Roger Federer,1.0,2.0
58014,Roger Federer,Novak Djokovic,20.0,26.0
58017,Dominic Thiem,Roger Federer,4.0,2.0


In [None]:
'w_1yr_wins', 'w_1yr_loses', 'w_year_win_pct', 'w_year_surface_win_pct', 'l_year_win_pct', 'l_year_surface_win_pct'] ]