In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from collections import Counter

In [3]:
data_path = '../tennis_atp-master/'
with_davis_cup = True

In [4]:
year = 2012
df = pd.read_csv(data_path + "atp_matches_%s.csv" % year)
if not with_davis_cup:
    df = df[df.tourney_name.str.contains("Davis") == False]

In [5]:
print(year, "has %d matches." % len(df))

(2012, 'has 3025 matches.')


In [6]:
assert(len(df['tourney_id'].unique()) == len(df['tourney_name'].unique()))

In [7]:
print("There are %d tournaments in %s." % (len(df['tourney_id'].unique()), year))

There are 148 tournaments in 2012.


In [8]:
tournaments = df['tourney_name'].unique()
# print(sorted(tournaments))

In [9]:
winner_names = set(df['winner_name'])
loser_names = set(df['loser_name'])
number_players = len(winner_names | loser_names)
print("%d players play in %s" % (number_players, year))
print("%d players never won a match." % len([l for l in loser_names if l not in winner_names]))

457 players play in 2012
152 players never won a match.


## Generate winning and losing slice of a player
The winning slice contains information when the player eventually won the match, and the losing slice vice versa.

In [10]:
query_player = "Novak Djokovic"

In [11]:
winning_slice = df[df["winner_name"] == query_player]
losing_slice = df[df["loser_name"] == query_player]

In [40]:
def is_winning_1st_set(score):
    if score == 'W/O' or score[:3] == 'RET':
        return False
    sets = score.split()
    set1 = sets[0][:3]
    a, b = set1.split('-')
    return a > b

In [42]:
champions = winning_slice[winning_slice['round'] == 'F']

In [43]:
champions

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced
634,2012-421,Canada Masters,Hard,48,M,20120806,47,104925,1.0,,...,4.0,3.0,1.0,42.0,25.0,19.0,5.0,8.0,0.0,3.0
1251,2012-403,Miami Masters,Hard,96,M,20120321,95,104925,1.0,,...,1.0,6.0,3.0,85.0,47.0,29.0,20.0,9.0,6.0,8.0
2142,2012-747,Beijing,Hard,32,A,20121001,31,104925,1.0,,...,6.0,9.0,3.0,73.0,43.0,30.0,10.0,10.0,5.0,8.0
2269,2012-580,Australian Open,Hard,128,G,20120116,127,104925,1.0,,...,6.0,10.0,4.0,203.0,135.0,88.0,32.0,27.0,13.0,20.0
2654,2012-5014,Shanghai Masters,Hard,56,M,20121007,55,104925,2.0,,...,6.0,4.0,3.0,128.0,71.0,46.0,26.0,17.0,7.0,13.0
2697,2012-605,Tour Finals,Hard,8,F,20121105,15,104925,1.0,,...,7.0,8.0,2.0,93.0,57.0,35.0,19.0,12.0,3.0,7.0


In [41]:
map(lambda x: is_winning_1st_set(x), winning_slice.score)

[True,
 False,
 True,
 False,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 False,
 True,
 False,
 True]

In [38]:
winning_slice.score

82                     6-1 6-4
98                 2-6 6-1 6-4
106                    6-4 6-2
110                4-6 6-3 6-2
137                    6-3 6-3
153                4-6 6-2 6-3
161                    7-5 6-1
165                 6-2 7-6(4)
545                 7-6(4) 6-2
553                    6-0 RET
557                    6-3 6-2
559                    6-3 6-2
604                    6-2 6-3
620                    6-4 6-4
628                6-3 3-6 6-3
632                    6-4 6-1
634                    6-3 6-2
802                6-2 2-6 6-3
818                 7-6(5) 6-4
1003               6-3 6-3 6-1
1067               6-4 6-4 6-4
1099           4-6 6-2 6-2 6-2
1115               6-3 6-1 6-3
1123               6-4 6-1 6-4
1189                   6-4 6-4
1221                   6-3 6-4
1237                   7-5 6-3
1245                6-2 7-6(1)
1249                6-0 7-6(5)
1251                6-1 7-6(4)
                 ...          
2136                   6-1 6-2
2140    

In [12]:
def function(string):
    return len(string.split()) == 3

In [13]:
best_of_3 = winning_slice['best_of'] == 3
best_of_5 = winning_slice['best_of'] == 5
sum(map(lambda s: len(s.split()) == 3, winning_slice[best_of_3].score))
sum(map(lambda s: len(s.split()) == 5, winning_slice[best_of_5].score))

4

In [14]:
sum(map(lambda s: sum(map(lambda x:'7-6' in x, s.split())), winning_slice.score))

17

In [15]:
# wins_count = Counter(df['winner_name'])
# wins_count_sorted = sorted(wins_count.items(), key=lambda x:-x[1])
# loses_count = Counter(df['loser_name'])
# loses_count_sorted = sorted(loses_count.items(), key=lambda x:x[1])

In [16]:
n_winning, n_losing = len(winning_slice), len(losing_slice)

In [17]:
print("%s in %d: %d wins, %d losses." % (query_player, year, n_winning, n_losing))

Novak Djokovic in 2012: 75 wins, 12 losses.


In [18]:
p1, p2= "Roger Federer", "Rafael Nadal"

In [19]:
df = pd.read_csv(data_path + "atp_matches_%s.csv" % year)
df = df[((df['winner_name'] == p1) & (df['loser_name'] == p2)) | ((df['loser_name'] == p1) & (df['winner_name'] == p2))]


In [20]:
df

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced
2268,2012-580,Australian Open,Hard,128,G,20120116,126,104745,2.0,,...,10.0,11.0,5.0,135.0,86.0,57.0,27.0,21.0,10.0,16.0
2571,2012-404,Indian Wells Masters,Hard,96,M,20120308,94,103819,3.0,,...,4.0,0.0,2.0,56.0,40.0,23.0,7.0,9.0,2.0,6.0


In [21]:
df1 = df[(df['winner_name'] == 'Roger Federer') & (df['loser_name'] == 'Rafael Nadal')]
df2 = df[(df['loser_name'] == 'Roger Federer') & (df['winner_name'] == 'Rafael Nadal')]



In [22]:
dict(df1.iloc[0])

{'best_of': 3,
 'draw_size': 96,
 'l_1stIn': 40.0,
 'l_1stWon': 23.0,
 'l_2ndWon': 7.0,
 'l_SvGms': 9.0,
 'l_ace': 0.0,
 'l_bpFaced': 6.0,
 'l_bpSaved': 2.0,
 'l_df': 2.0,
 'l_svpt': 56.0,
 'loser_age': 25.7549623546,
 'loser_entry': nan,
 'loser_hand': 'L',
 'loser_ht': 185.0,
 'loser_id': 104745,
 'loser_ioc': 'ESP',
 'loser_name': 'Rafael Nadal',
 'loser_rank': 2.0,
 'loser_rank_points': 10415.0,
 'loser_seed': 2.0,
 'match_num': 94,
 'minutes': 92.0,
 'round': 'SF',
 'score': '6-3 6-4',
 'surface': 'Hard',
 'tourney_date': 20120308,
 'tourney_id': '2012-404',
 'tourney_level': 'M',
 'tourney_name': 'Indian Wells Masters',
 'w_1stIn': 36.0,
 'w_1stWon': 23.0,
 'w_2ndWon': 13.0,
 'w_SvGms': 10.0,
 'w_ace': 6.0,
 'w_bpFaced': 4.0,
 'w_bpSaved': 2.0,
 'w_df': 2.0,
 'w_svpt': 56.0,
 'winner_age': 30.573579739899998,
 'winner_entry': nan,
 'winner_hand': 'R',
 'winner_ht': 185.0,
 'winner_id': 103819,
 'winner_ioc': 'SUI',
 'winner_name': 'Roger Federer',
 'winner_rank': 3.0,
 'winner_ra

## Head-to-head 

In [23]:
head_2_heads = Counter(zip(df["winner_name"], df["loser_name"]))

In [24]:
query_head2head = ("Novak Djokovic", "Roger Federer")
print("H2H - %s %d:%d %s" % (query_head2head[0], head_2_heads[query_head2head], head_2_heads[query_head2head[::-1]], query_head2head[1]))

H2H - Novak Djokovic 0:0 Roger Federer


## Aces

In [25]:
df

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced
2268,2012-580,Australian Open,Hard,128,G,20120116,126,104745,2.0,,...,10.0,11.0,5.0,135.0,86.0,57.0,27.0,21.0,10.0,16.0
2571,2012-404,Indian Wells Masters,Hard,96,M,20120308,94,103819,3.0,,...,4.0,0.0,2.0,56.0,40.0,23.0,7.0,9.0,2.0,6.0


In [26]:
df.columns

Index([u'tourney_id', u'tourney_name', u'surface', u'draw_size',
       u'tourney_level', u'tourney_date', u'match_num', u'winner_id',
       u'winner_seed', u'winner_entry', u'winner_name', u'winner_hand',
       u'winner_ht', u'winner_ioc', u'winner_age', u'winner_rank',
       u'winner_rank_points', u'loser_id', u'loser_seed', u'loser_entry',
       u'loser_name', u'loser_hand', u'loser_ht', u'loser_ioc', u'loser_age',
       u'loser_rank', u'loser_rank_points', u'score', u'best_of', u'round',
       u'minutes', u'w_ace', u'w_df', u'w_svpt', u'w_1stIn', u'w_1stWon',
       u'w_2ndWon', u'w_SvGms', u'w_bpSaved', u'w_bpFaced', u'l_ace', u'l_df',
       u'l_svpt', u'l_1stIn', u'l_1stWon', u'l_2ndWon', u'l_SvGms',
       u'l_bpSaved', u'l_bpFaced'],
      dtype='object')

计算胜场场均Ace，负场场均Ace，总场均Ace，以及上述各自的方差。

同时也计算每个发球局平均发出的Ace

In [27]:
df.w_svpt

2268    141.0
2571     56.0
Name: w_svpt, dtype: float64

In [28]:
avg_ace_winning = np.sum(winning_slice.w_ace) / n_winning
std_ace_winning = np.std(winning_slice.w_ace) 
avg_ace_losing = np.sum(losing_slice.l_ace) / n_losing
std_ace_losing = np.std(losing_slice.l_ace)
avg_ace_overall = (np.sum(winning_slice.w_ace) + np.sum(losing_slice.l_ace)) / (n_winning + n_losing)
std_ace_overall = np.std(pd.concat([winning_slice.w_ace, losing_slice.l_ace]))

In [29]:
np.sum(winning_slice.w_ace), n_winning

(456.0, 75)

In [30]:
print("Aces per match when winning: %f" % avg_ace_winning)
print("Aces per match when losing: %f" % avg_ace_losing)
print("Aces per match overall: %f" % avg_ace_overall)

Aces per match when winning: 6.080000
Aces per match when losing: 3.833333
Aces per match overall: 5.770115


In [31]:
avg_ace_winning_per_game = np.sum(winning_slice.w_ace) / np.sum(winning_slice['w_SvGms'])
# std_ace_winning_per_game = np.std(winning_slice.w_ace) 
avg_ace_losing_per_game = np.sum(losing_slice.l_ace) / np.sum(losing_slice['l_SvGms'])
# std_ace_losing_per_game = np.std(losing_slice.l_ace)
avg_ace_overall_per_game = (np.sum(winning_slice.w_ace) + np.sum(losing_slice.l_ace)) / (np.sum(winning_slice['w_SvGms']) +  np.sum(losing_slice['l_SvGms']))
# std_ace_overall_per_game = np.std(pd.concat([winning_slice.w_ace, losing_slice.l_ace]))

In [32]:
print("Aces per service game when winning: %f" % avg_ace_winning_per_game)
print("Aces per service game when losing: %f" % avg_ace_losing_per_game)
print("Aces per service game overall: %f" % avg_ace_overall_per_game)

Aces per service game when winning: 0.501650
Aces per service game when losing: 0.300654
Aces per service game overall: 0.472693


In [33]:
ace_rate_winning = np.sum(winning_slice.w_ace) / np.sum(winning_slice['w_svpt'])
ace_rate_losing = np.sum(losing_slice.l_ace) / np.sum(losing_slice['l_svpt'])
ace_rate_overall = (np.sum(winning_slice.w_ace) + np.sum(losing_slice.l_ace)) / (np.sum(winning_slice['w_svpt']) + np.sum(losing_slice['l_svpt']))

In [34]:
print("Aces rate when winning: %f" % ace_rate_winning)
print("Aces rate when losing: %f" % ace_rate_losing)
print("Aces rate overall: %f" % ace_rate_overall)

Aces rate when winning: 0.082654
Aces rate when losing: 0.047867
Aces rate overall: 0.077493


In [35]:
# winning_slice[['w_ace', 'w_svpt']].sort_values(by=['w_ace'])
winning_slice

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced
82,2012-410,Monte Carlo Masters,Clay,56,M,20120415,25,104925,1.0,,...,3.0,0.0,0.0,66.0,40.0,22.0,13.0,8.0,6.0,9.0
98,2012-410,Monte Carlo Masters,Clay,56,M,20120415,41,104925,1.0,,...,5.0,5.0,0.0,87.0,49.0,32.0,20.0,12.0,6.0,9.0
106,2012-410,Monte Carlo Masters,Clay,56,M,20120415,49,104925,1.0,,...,8.0,0.0,0.0,61.0,37.0,16.0,7.0,9.0,5.0,12.0
110,2012-410,Monte Carlo Masters,Clay,56,M,20120415,53,104925,1.0,,...,10.0,3.0,8.0,89.0,41.0,31.0,21.0,13.0,3.0,7.0
137,2012-416,Rome Masters,Clay,56,M,20120513,25,104925,1.0,,...,2.0,2.0,0.0,63.0,43.0,25.0,8.0,9.0,4.0,8.0
153,2012-416,Rome Masters,Clay,56,M,20120513,41,104925,1.0,,...,4.0,0.0,2.0,79.0,61.0,37.0,9.0,13.0,2.0,6.0
161,2012-416,Rome Masters,Clay,56,M,20120513,49,104925,1.0,,...,4.0,3.0,2.0,52.0,38.0,21.0,4.0,9.0,5.0,10.0
165,2012-416,Rome Masters,Clay,56,M,20120513,53,104925,1.0,,...,1.0,6.0,0.0,65.0,32.0,25.0,16.0,10.0,3.0,6.0
545,2012-422,Cincinnati Masters,Hard,56,M,20120812,40,104925,2.0,,...,0.0,3.0,3.0,78.0,41.0,26.0,19.0,10.0,8.0,10.0
553,2012-422,Cincinnati Masters,Hard,56,M,20120812,48,104925,2.0,,...,0.0,0.0,8.0,27.0,13.0,6.0,4.0,3.0,3.0,6.0


In [36]:
import flask