In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

# Load Data

In [2]:
alldfs=[]
for yr in range(2000,2019):
    alldfs.append(pd.read_csv('./raw_data/atp_matches_{}.csv'.format(yr)))

In [3]:
df=pd.concat(alldfs)

In [4]:
df.shape

(58563, 49)

# Win Percentage

In [5]:
df.tourney_level.value_counts()

A    31849
M    10743
G     9652
D     5890
F      246
C      120
Name: tourney_level, dtype: int64

In [6]:
df.winner_name

0                     Antony Dupuis
1                 Fernando Gonzalez
2               Paradorn Srichaphan
3                     Jan Siemerink
4                 Jason Stoltenberg
5                        Jiri Vanek
6                  Martin Rodriguez
7                    Paul Goldstein
8                     Nicolas Massu
9                       Jan Kroslak
10                  Michael Russell
11                   Gianluca Pozzi
12                     Razvan Sabau
13                  Markus Hantschk
14                   Xavier Malisse
15                    Ramon Delgado
16                Fernando Gonzalez
17              Paradorn Srichaphan
18                       Jiri Vanek
19                 Martin Rodriguez
20                    Nicolas Massu
21                   Gianluca Pozzi
22                  Markus Hantschk
23                    Ramon Delgado
24                Fernando Gonzalez
25                 Martin Rodriguez
26                    Nicolas Massu
27                    Ramon 

In [7]:
def win_percentage(df,player,surface=False,max_date=False,min_date=False,n=False):
    if surface:
        df=df[df.surface==surface]
    if max_date:
        df=df[(df.tourney_date < max_date) & (df.tourney_date > min_date)].sort_values('tourney_date')
    matches=df[(df.winner_name==player) | (df.loser_name==player)]
    if n:
        matches=matches.tail(n)
    wins=len(matches[matches.winner_name==player])
    losses=len(matches[matches.loser_name==player])
    return [wins,losses,wins/(wins+losses)]

In [8]:
win_percentage(df,'Rafael Nadal',surface='Clay',max_date=20100101,n=20)

[18, 2, 0.9]

# Winning % on 1st Serve

In [9]:
def win_1st_serve(df,player,surface=False,min_date=False,max_date=False,n=False):
    if surface:
        df=df[df.surface==surface]
    if max_date:
        df=df[(df.tourney_date < max_date) & (df.tourney_date > min_date)].sort_values('tourney_date')
    matches=df[(df.winner_name==player) | (df.loser_name==player)]
    if n:
        matches=matches.tail(n)
    wins=matches[matches.winner_name==player]
    losses=matches[matches.loser_name==player]
    num=np.sum(losses.l_1stWon)+np.sum(wins.w_1stWon)
    denom=np.sum(losses.l_svpt)+np.sum(wins.w_svpt)
    return [num,denom,num/denom]

In [10]:
win_1st_serve(df,'Rafael Nadal',surface='Clay',max_date=20160101,n=2)

[76.0, 163.0, 0.4662576687116564]

# Winning % on 2nd Serve

In [11]:
def win_2nd_serve(df,player,surface=False,max_date=False,min_date=False,n=False):
    if surface:
        df=df[df.surface==surface]
    if max_date:
        df=df[(df.tourney_date < max_date) & (df.tourney_date > min_date)].sort_values('tourney_date')
    matches=df[(df.winner_name==player) | (df.loser_name==player)]
    if n:
        matches=matches.tail(n)
    wins=matches[matches.winner_name==player]
    losses=matches[matches.loser_name==player]
    l_2nd_serves=np.sum(losses.l_svpt)-np.sum(losses.l_1stIn)
    w_2nd_serves=np.sum(wins.w_svpt)-np.sum(wins.w_1stIn)
    num=np.sum(losses.l_2ndWon)+np.sum(wins.w_2ndWon)
    denom=l_2nd_serves+w_2nd_serves
    stat=num/denom
    return [num,denom,stat]

In [12]:
win_2nd_serve(df,'Rafael Nadal',surface='Clay',max_date=20160101,n=20)

[210.0, 398.0, 0.5276381909547738]

# Winning % on Return Serve

In [13]:
def win_return_serve(df,player,surface=False,max_date=False,min_date=False,n=False):
    if surface:
        df=df[df.surface==surface]
    if max_date:
        df=df[(df.tourney_date < max_date) & (df.tourney_date > min_date)].sort_values('tourney_date')
    matches=df[(df.winner_name==player) | (df.loser_name==player)]
    if n:
        matches=matches.tail(n)
    wins=matches[matches.winner_name==player]
    losses=matches[matches.loser_name==player]
    wins_opp_1st_in=np.sum(wins.l_1stIn)
    losses_opp_1st_in=np.sum(losses.w_1stIn)
    wins_opp_1st_won=np.sum(wins.l_1stWon)
    losses_opp_1st_won=np.sum(losses.w_1stWon)
    wins_opp_2nd_won=np.sum(wins.l_2ndWon)
    losses_opp_2nd_won=np.sum(losses.w_2ndWon)
    wins_opp_svpt=np.sum(wins.l_svpt)
    losses_opp_svpt=np.sum(losses.w_svpt)
    wins_opp_df=np.sum(wins.l_df)
    losses_opp_df=np.sum(losses.w_df)
    num=(wins_opp_1st_in-wins_opp_1st_won+losses_opp_1st_in-losses_opp_1st_won)+(wins_opp_svpt+losses_opp_svpt-wins_opp_1st_in-losses_opp_1st_in-wins_opp_2nd_won-losses_opp_2nd_won-wins_opp_df-losses_opp_df)
    denom=wins_opp_svpt+losses_opp_svpt
    return [num,denom,num/denom]

In [14]:
win_return_serve(df,'Rafael Nadal',surface='Clay',max_date=20160101,n=20)

[632.0, 1490.0, 0.42416107382550333]

# Preprocess

In [15]:
df.columns

Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'winner_rank', 'winner_rank_points', 'loser_id', 'loser_seed',
       'loser_entry', 'loser_name', 'loser_hand', 'loser_ht', 'loser_ioc',
       'loser_age', 'loser_rank', 'loser_rank_points', 'score', 'best_of',
       'round', 'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
       'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df',
       'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved',
       'l_bpFaced'],
      dtype='object')

In [16]:
df['dummy_surface']=df['surface']

In [17]:
df=pd.get_dummies(df,columns=['dummy_surface'])

In [18]:
df.columns

Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'winner_rank', 'winner_rank_points', 'loser_id', 'loser_seed',
       'loser_entry', 'loser_name', 'loser_hand', 'loser_ht', 'loser_ioc',
       'loser_age', 'loser_rank', 'loser_rank_points', 'score', 'best_of',
       'round', 'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
       'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df',
       'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved',
       'l_bpFaced', 'dummy_surface_Carpet', 'dummy_surface_Clay',
       'dummy_surface_Grass', 'dummy_surface_Hard', 'dummy_surface_None'],
      dtype='object')

In [19]:
def process_match(entry):
    p1=entry.winner_name
    p2=entry.loser_name
    surface=entry.surface
    p1_seed=entry.winner_seed
    p2_seed=entry.loser_seed
    p1_rank=entry.winner_rank
    p2_rank=entry.loser_rank
    dummy_surface=[entry.dummy_surface_Carpet,entry.dummy_surface_Clay,entry.dummy_surface_Grass,entry.dummy_surface_Hard,entry.dummy_surface_None]
    p1_stats=[p1_seed,p1_rank]
    for func in [win_percentage,win_1st_serve,win_2nd_serve,win_return_serve]:
        p1_stats.extend(func(df,p1,max_date=entry.tourney_date-1,min_date=entry.tourney_date-50000))
        p1_stats.extend(func(df,p1,max_date=entry.tourney_date-1,min_date=entry.tourney_date-10000))
        p1_stats.extend(func(df,p1,surface=surface,max_date=entry.tourney_date-1,min_date=entry.tourney_date-50000))
        p1_stats.extend(func(df,p1,surface=surface,max_date=entry.tourney_date-1,min_date=entry.tourney_date-10000))
    p2_stats=[p2_seed,p2_rank]
    for func in [win_percentage,win_1st_serve,win_2nd_serve,win_return_serve]:
        p2_stats.extend(func(df,p2,max_date=entry.tourney_date-1,min_date=entry.tourney_date-50000))
        p2_stats.extend(func(df,p2,max_date=entry.tourney_date-1,min_date=entry.tourney_date-10000))
        p2_stats.extend(func(df,p2,surface=surface,max_date=entry.tourney_date-1,min_date=entry.tourney_date-50000))
        p2_stats.extend(func(df,p2,surface=surface,max_date=entry.tourney_date-1,min_date=entry.tourney_date-10000))
    return [p1_stats,p2_stats]

In [20]:
df=df.reset_index()

In [21]:
df=df[df.tourney_date>20100101]

In [22]:
df.shape

(26252, 55)

In [23]:
data=[]
cnt=0
for entry in tqdm_notebook(df.itertuples()):
    try:
        stats=process_match(entry)
        data.append(stats)
    except:
        cnt+=1

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

  del sys.path[0]
  from ipykernel import kernelapp as app





In [31]:
clean_data=[]
from copy import deepcopy
for x1,x2 in data:
    if np.isnan(x1[2:]).any() or np.isnan(x2[2:]).any():
        continue
    clean_data.append([x1,x2])

In [32]:
len(clean_data)

21557

In [33]:
norm=[]
for x1,x2 in clean_data:
    norm.extend([x1,x2])

In [34]:
maxs=np.nanmax(norm,axis=0)

In [35]:
special_max=maxs[0:2]

In [36]:
maxs+=maxs*.05

In [43]:
norm_data=[]
results=[]
from copy import deepcopy
for x1,x2 in clean_data:
    for entry in [x1,x2]:
        for ind in range(len(entry)):
            if ind<=1:
                entry[ind]=((entry[ind]-special_max[ind]-1)/-1)
            entry[ind]/=maxs[ind]
    if x1[1]<x2[1]:
        st=deepcopy(x1)
        st.extend(x2)
        results.append(0)
    else:
        st=deepcopy(x2)
        st.extend(x1)
        results.append(1)
    norm_data.append(st)

In [44]:
norm_data=np.array(norm_data)

In [45]:
norm_data=np.nan_to_num(norm_data)

In [46]:
results=np.array(results)

In [47]:
len(results)

21557

In [49]:
import pickle

In [50]:
np.save('./processed_data/since_2010_processed_data.npy',norm_data)

In [51]:
np.save('./processed_data/since_2010_results.npy',results)