In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
from collections import Counter

In [7]:
data_path = '../tennis_atp-master/'
with_davis_cup = True

In [8]:
year = 2012
df = pd.read_csv(data_path + "atp_matches_%s.csv" % year)
if not with_davis_cup:
    df = df[df.tourney_name.str.contains("Davis") == False]

In [9]:
print(year, "has %d matches." % len(df))

2012 has 3025 matches.


In [10]:
assert(len(df['tourney_id'].unique()) == len(df['tourney_name'].unique()))

In [11]:
print("There are %d tournaments in %s." % (len(df['tourney_id'].unique()), year))

There are 148 tournaments in 2012.


In [13]:
tournaments = df['tourney_name'].unique()
# print(sorted(tournaments))

In [12]:
winner_names = set(df['winner_name'])
loser_names = set(df['loser_name'])
number_players = len(winner_names | loser_names)
print("%d players play in %s" % (number_players, year))
print("%d players never won a match." % len([l for l in loser_names if l not in winner_names]))

457 players play in 2012
152 players never won a match.


## Generate winning and losing slice of a player
The winning slice contains information when the player eventually won the match, and the losing slice vice versa.

In [15]:
query_player = "Novak Djokovic"

In [16]:
winning_slice = df[df["winner_name"] == query_player]
losing_slice = df[df["loser_name"] == query_player]

In [35]:
def function(string):
    return len(string.split()) == 3

In [41]:
best_of_3 = winning_slice['best_of'] == 3
best_of_5 = winning_slice['best_of'] == 5
sum(map(lambda s: len(s.split()) == 3, winning_slice[best_of_3].score))
sum(map(lambda s: len(s.split()) == 5, winning_slice[best_of_5].score))

4

In [24]:
sum(map(lambda s: sum(map(lambda x:'7-6' in x, s.split())), winning_slice.score))

17

In [None]:
# wins_count = Counter(df['winner_name'])
# wins_count_sorted = sorted(wins_count.items(), key=lambda x:-x[1])
# loses_count = Counter(df['loser_name'])
# loses_count_sorted = sorted(loses_count.items(), key=lambda x:x[1])

In [None]:
n_winning, n_losing = len(winning_slice), len(losing_slice)

In [None]:
print("%s in %d: %d wins, %d losses." % (query_player, year, n_winning, n_losing))

In [None]:
p1, p2= "Roger Federer", "Rafael Nadal"

In [None]:
df = pd.read_csv(data_path + "atp_matches_%s.csv" % year)
df = df[((df['winner_name'] == p1) & (df['loser_name'] == p2)) | ((df['loser_name'] == p1) & (df['winner_name'] == p2))]


In [None]:
df

In [None]:
df1 = df[(df['winner_name'] == 'Roger Federer') & (df['loser_name'] == 'Rafael Nadal')]
df2 = df[(df['loser_name'] == 'Roger Federer') & (df['winner_name'] == 'Rafael Nadal')]



In [None]:
dict(df1.iloc[0])

## Head-to-head 

In [None]:
head_2_heads = Counter(zip(df["winner_name"], df["loser_name"]))

In [None]:
query_head2head = ("Novak Djokovic", "Roger Federer")
print("H2H - %s %d:%d %s" % (query_head2head[0], head_2_heads[query_head2head], head_2_heads[query_head2head[::-1]], query_head2head[1]))

## Aces

In [None]:
df

In [None]:
df.columns

计算胜场场均Ace，负场场均Ace，总场均Ace，以及上述各自的方差。

同时也计算每个发球局平均发出的Ace

In [None]:
df.w_svpt

In [None]:
avg_ace_winning = np.sum(winning_slice.w_ace) / n_winning
std_ace_winning = np.std(winning_slice.w_ace) 
avg_ace_losing = np.sum(losing_slice.l_ace) / n_losing
std_ace_losing = np.std(losing_slice.l_ace)
avg_ace_overall = (np.sum(winning_slice.w_ace) + np.sum(losing_slice.l_ace)) / (n_winning + n_losing)
std_ace_overall = np.std(pd.concat([winning_slice.w_ace, losing_slice.l_ace]))

In [None]:
np.sum(winning_slice.w_ace), n_winning

In [None]:
print("Aces per match when winning: %f" % avg_ace_winning)
print("Aces per match when losing: %f" % avg_ace_losing)
print("Aces per match overall: %f" % avg_ace_overall)

In [None]:
avg_ace_winning_per_game = np.sum(winning_slice.w_ace) / np.sum(winning_slice['w_SvGms'])
# std_ace_winning_per_game = np.std(winning_slice.w_ace) 
avg_ace_losing_per_game = np.sum(losing_slice.l_ace) / np.sum(losing_slice['l_SvGms'])
# std_ace_losing_per_game = np.std(losing_slice.l_ace)
avg_ace_overall_per_game = (np.sum(winning_slice.w_ace) + np.sum(losing_slice.l_ace)) / (np.sum(winning_slice['w_SvGms']) +  np.sum(losing_slice['l_SvGms']))
# std_ace_overall_per_game = np.std(pd.concat([winning_slice.w_ace, losing_slice.l_ace]))

In [None]:
print("Aces per service game when winning: %f" % avg_ace_winning_per_game)
print("Aces per service game when losing: %f" % avg_ace_losing_per_game)
print("Aces per service game overall: %f" % avg_ace_overall_per_game)

In [None]:
ace_rate_winning = np.sum(winning_slice.w_ace) / np.sum(winning_slice['w_svpt'])
ace_rate_losing = np.sum(losing_slice.l_ace) / np.sum(losing_slice['l_svpt'])
ace_rate_overall = (np.sum(winning_slice.w_ace) + np.sum(losing_slice.l_ace)) / (np.sum(winning_slice['w_svpt']) + np.sum(losing_slice['l_svpt']))

In [None]:
print("Aces rate when winning: %f" % ace_rate_winning)
print("Aces rate when losing: %f" % ace_rate_losing)
print("Aces rate overall: %f" % ace_rate_overall)

In [None]:
# winning_slice[['w_ace', 'w_svpt']].sort_values(by=['w_ace'])
winning_slice

In [None]:
import flask