In [200]:
import pandas as pd
from matplotlib import pyplot as plt

### Read in the match points from 2011-2021 for grand-slam tournaments

In [None]:
# Load every season's point-by-point file, attach the match-level columns to
# each point via an inner join on match_id, and stack all seasons into one frame.
# The frames are collected first and concatenated once: calling pd.concat inside
# the loop re-copies everything loaded so far on every iteration.
data_root = "../data/grand-slam-point-data"
yearly_frames = []
for year in range(2011, 2022):
    year_points = pd.read_csv(f"{data_root}/combined-points/{year}-combined-points.csv", encoding='unicode_escape', low_memory=False)
    year_matches = pd.read_csv(f"{data_root}/combined-matches/{year}-combined-matches.csv", encoding='unicode_escape', low_memory=False)
    yearly_frames.append(pd.merge(year_points, year_matches, on="match_id", how="inner"))

# ignore_index gives the stacked frame a unique 0..n-1 index; without it each
# season restarts at 0 and row labels repeat, which makes label-based alignment
# ambiguous further down.
combined = pd.concat(yearly_frames, sort=True, ignore_index=True)

points = combined

`P1BreakPoint == 1` when P1 is returning and is one point away from winning the game (i.e., "breaking" the opponent's serve).

In [None]:
# List the merged frame's column names to see which fields are available.
points.columns

In [None]:
# Keep only rows whose PointWinner is at least 1 (rows with 0 or a missing value
# are dropped by the comparison). The explicit copy makes real_points an
# independent frame, so adding columns to it later neither raises
# SettingWithCopyWarning nor depends on view-vs-copy behaviour.
real_points = points[points['PointWinner'].ge(1)].copy()

In [None]:
# How many distinct matches contain at least one point with no recorded server?
missing_server = real_points["PointServer"].isna()
real_points.loc[missing_server, "match_id"].nunique()

In [None]:
# Display the PointServer column of the filtered points (inspection only).
real_points["PointServer"]

In [None]:
# P1 has a breakpoint when P2 is serving, and P1 is up 0-40, 15-40, 30-40, or 40-AD
p1_breakpoint = (
    (real_points["PointServer"] == 2)
    & (
        (
            (real_points["P1Score"] == "40")
            & (real_points["P2Score"] != "40")
            & (real_points["P2Score"] != "AD")
        )
        | (real_points["P1Score"] == "AD")
    )
)
# Mirror image: P2 has a breakpoint when P1 is serving and P2 is one point from the game.
p2_breakpoint = (
    (real_points["PointServer"] == 1)
    & (
        (
            (real_points["P2Score"] == "40")
            & (real_points["P1Score"] != "40")
            & (real_points["P1Score"] != "AD")
        )
        | (real_points["P2Score"] == "AD")
    )
)

# P1Score / P2Score hold the score *after* the point is played, so the state a
# point was played under is the one recorded on the previous row. The shift is
# done per match so that the first point of a match is always filled with False
# instead of inheriting whatever the previous match's last row evaluated to.
# assign() builds a new frame, so this never writes into a slice of `points`.
match_ids = real_points["match_id"]
real_points = real_points.assign(
    P1BreakPoint=p1_breakpoint.groupby(match_ids).shift(1, fill_value=False),
    P2BreakPoint=p2_breakpoint.groupby(match_ids).shift(1, fill_value=False),
)

In [None]:
# Preview the shifted P1 flag: each row takes the value computed on the previous
# row, and the first row is filled with False.
p1_breakpoint.shift(1, fill_value=False)

In [None]:
# Eyeball the first six points: score, server, winner and the derived P1 flag side by side.
preview_cols = ["P1Score", "P2Score", "PointServer", "PointWinner", "P1BreakPoint"]
real_points[preview_cols].head(6)

In [None]:
# Bucket the points by match-up, break-point flags, winner and server, then
# count how many points fall in each bucket (one row per bucket, column "count").
bp_groups = real_points.groupby(
    ["player1_new", "player2_new", "P1BreakPoint", "P2BreakPoint", "PointWinner", "PointServer"]
)
counts_by_bp = bp_groups.size().rename("count").reset_index()

In [None]:
# Every name that appears on either side of a match-up gets its own tally.
p1_names = counts_by_bp["player1_new"].unique().tolist()
p2_names = counts_by_bp["player2_new"].unique().tolist()
unique_players = set(p1_names + p2_names)

# Counter names follow <serve|noserve>_<break|nobreak>_<won|faced>; all start at zero.
stat_names = (
    'noserve_nobreak_won',
    'noserve_nobreak_faced',
    'serve_nobreak_won',
    'serve_nobreak_faced',
    'noserve_break_won',
    'noserve_break_faced',
    'serve_break_won',
    'serve_break_faced',
)
player_bps_dict = {player: dict.fromkeys(stat_names, 0) for player in unique_players}

In [None]:
# Fold each bucket of counts_by_bp into the per-player tallies.
# For every bucket:
#   * the point winner gets "<role>_<situation>_won"
#   * the server gets "serve_<situation>_faced"
#   * the returner gets "noserve_<situation>_faced"
# where situation is "break" when either break-point flag is set, else "nobreak".
for row in counts_by_bp.to_dict("records"):
    p1, p2, n = row["player1_new"], row["player2_new"], row["count"]

    # On a break point the flag itself says who serves (the player facing it);
    # otherwise fall back to PointServer, treating anything other than 1 as player 2.
    if row["P1BreakPoint"]:
        situation, p1_serving = "break", False
    elif row["P2BreakPoint"]:
        situation, p1_serving = "break", True
    else:
        situation, p1_serving = "nobreak", row["PointServer"] == 1

    server, returner = (p1, p2) if p1_serving else (p2, p1)

    # Any PointWinner other than 1 is credited to player 2.
    p1_won = row["PointWinner"] == 1
    winner = p1 if p1_won else p2
    winner_role = "serve" if p1_won == p1_serving else "noserve"

    player_bps_dict[winner][f"{winner_role}_{situation}_won"] += n
    player_bps_dict[server][f"serve_{situation}_faced"] += n
    player_bps_dict[returner][f"noserve_{situation}_faced"] += n

In [None]:
# Turn the per-player tally dict into a DataFrame: one row per player, one column per counter.
player_bps = pd.DataFrame.from_dict(player_bps_dict, orient="index")

In [None]:
# Scratch ratio of two hard-coded counts.
# NOTE(review): 85995 equals the break-point total reported by the aggregate sum
# in this notebook; where 18991 comes from is not shown — consider deriving both
# from player_bps instead of typing them in.
18991 / 85995

In [None]:
# Column totals across all players.
player_bps.aggregate("sum")

In [None]:
# Show the tallies ordered by noserve_break_faced (smallest first).
player_bps.sort_values("noserve_break_faced")

In [None]:
# Additive (Laplace-style) smoothing: each rate is computed as if the player had
# also played 100 "ghost" points, so small samples are pulled toward a prior.
# Ghost wins out of 100: serving 64 (ordinary) / 60 (break point),
# returning 36 (ordinary) / 40 (break point).
ghost_points = 100
smoothing_spec = [
    # (output column, wins column, opportunities column, ghost wins)
    ("serve_nobreak_pct", "serve_nobreak_won", "serve_nobreak_faced", 64),
    ("serve_break_pct", "serve_break_won", "serve_break_faced", 60),
    ("noserve_nobreak_pct", "noserve_nobreak_won", "noserve_nobreak_faced", 36),
    ("noserve_break_pct", "noserve_break_won", "noserve_break_faced", 40),
]
for pct_col, won_col, faced_col, ghost_wins in smoothing_spec:
    player_bps[pct_col] = (player_bps[won_col] + ghost_wins) / (player_bps[faced_col] + ghost_points)

# Choke score = ordinary-point rate minus break-point rate, plus a constant offset.
# NOTE(review): the priors above differ by 0.04 on both the serving and the
# returning side, yet the serving offset is 0.02 while the returning one is 0.04
# — confirm the asymmetry is intentional.
serve_gap = player_bps["serve_nobreak_pct"] - player_bps["serve_break_pct"]
return_gap = player_bps["noserve_nobreak_pct"] - player_bps["noserve_break_pct"]
player_bps["serve_choke_pct"] = serve_gap - 0.02
player_bps["noserve_choke_pct"] = return_gap + 0.04
# Overall score is the plain average of the two sides.
player_bps["choke_pct"] = (player_bps["serve_choke_pct"] + player_bps["noserve_choke_pct"]) / 2

In [None]:
# Column means across players (each player row counts once, regardless of how many points it covers).
player_bps.agg("mean")

In [None]:
# Restrict to players with at least 100 noserve_break_faced, then order by
# choke_pct ascending (the largest choke scores end up at the bottom).
enough_chances = player_bps["noserve_break_faced"] >= 100
chokiest = player_bps.loc[enough_chances].sort_values(by="choke_pct")

In [None]:
# Write the filtered, sorted table to disk.
chokiest.to_csv("../data/chokiest.csv")

In [213]:
# Scratch ratio of two hard-coded counts.
# NOTE(review): 85995 equals the break-point total reported by the aggregate sum
# in this notebook; where 18991 comes from is not shown — consider deriving both
# from player_bps instead of typing them in.
18991 / 85995

0.22083842083842084

In [214]:
# Column totals across all players.
player_bps.aggregate("sum")

noserve_nobreak_won      334318
noserve_nobreak_faced    931416
serve_nobreak_won        597098
serve_nobreak_faced      931416
noserve_break_won         34029
noserve_break_faced       85995
serve_break_won           51966
serve_break_faced         85995
dtype: int64

In [215]:
# Show the tallies ordered by noserve_break_faced (smallest first).
player_bps.sort_values("noserve_break_faced")

Unnamed: 0,noserve_nobreak_won,noserve_nobreak_faced,serve_nobreak_won,serve_nobreak_faced,noserve_break_won,noserve_break_faced,serve_break_won,serve_break_faced
C Niland,6,30,7,24,0,0,1,6
C Altamirano,15,62,30,62,0,0,5,12
C Hemery,16,65,25,56,0,1,5,11
S Riffice,18,81,48,86,1,1,6,10
B Mott,23,81,43,70,1,1,3,8
...,...,...,...,...,...,...,...,...
K Nishikori,5210,13142,8392,13032,652,1507,677,1127
A Murray,6711,16068,10578,15950,871,1966,745,1203
R Federer,7720,19267,13106,18309,897,2189,644,963
R Nadal,7359,17174,11693,16995,1079,2308,819,1213


In [216]:
# Additive (Laplace-style) smoothing: each rate is computed as if the player had
# also played 100 "ghost" points, so small samples are pulled toward a prior.
# Ghost wins out of 100: serving 64 (ordinary) / 60 (break point),
# returning 36 (ordinary) / 40 (break point).
ghost_points = 100
smoothing_spec = [
    # (output column, wins column, opportunities column, ghost wins)
    ("serve_nobreak_pct", "serve_nobreak_won", "serve_nobreak_faced", 64),
    ("serve_break_pct", "serve_break_won", "serve_break_faced", 60),
    ("noserve_nobreak_pct", "noserve_nobreak_won", "noserve_nobreak_faced", 36),
    ("noserve_break_pct", "noserve_break_won", "noserve_break_faced", 40),
]
for pct_col, won_col, faced_col, ghost_wins in smoothing_spec:
    player_bps[pct_col] = (player_bps[won_col] + ghost_wins) / (player_bps[faced_col] + ghost_points)

# Choke score = ordinary-point rate minus break-point rate, plus a constant offset.
# NOTE(review): the priors above differ by 0.04 on both the serving and the
# returning side, yet the serving offset is 0.02 while the returning one is 0.04
# — confirm the asymmetry is intentional.
serve_gap = player_bps["serve_nobreak_pct"] - player_bps["serve_break_pct"]
return_gap = player_bps["noserve_nobreak_pct"] - player_bps["noserve_break_pct"]
player_bps["serve_choke_pct"] = serve_gap - 0.02
player_bps["noserve_choke_pct"] = return_gap + 0.04
# Overall score is the plain average of the two sides.
player_bps["choke_pct"] = (player_bps["serve_choke_pct"] + player_bps["noserve_choke_pct"]) / 2

In [217]:
# Column means across players (each player row counts once, regardless of how many points it covers).
player_bps.agg("mean")

noserve_nobreak_won       744.583519
noserve_nobreak_faced    2074.423163
serve_nobreak_won        1329.839644
serve_nobreak_faced      2074.423163
noserve_break_won          75.788419
noserve_break_faced       191.525612
serve_break_won           115.737194
serve_break_faced         191.525612
serve_nobreak_pct           0.617104
serve_break_pct             0.594438
noserve_nobreak_pct         0.342963
noserve_break_pct           0.392057
serve_choke_pct             0.002666
noserve_choke_pct          -0.009094
choke_pct                  -0.003214
dtype: float64

In [218]:
# Restrict to players with at least 100 noserve_break_faced, then order by
# choke_pct ascending (the largest choke scores end up at the bottom).
enough_chances = player_bps["noserve_break_faced"] >= 100
chokiest = player_bps.loc[enough_chances].sort_values(by="choke_pct")

In [219]:
# Write the filtered, sorted table to disk.
chokiest.to_csv("../data/chokiest.csv")