In [141]:
import pandas as pd
from matplotlib import pyplot as plt

### Read in the match points from 2011-2021 for grand-slam tournaments

In [142]:
points_2011 = pd.read_csv(f"../data/grand-slam-point-data/combined-points/2011-combined-points.csv", encoding='unicode_escape', low_memory=False)
match_2011 = pd.read_csv(f"../data/grand-slam-point-data/combined-matches/2011-combined-matches.csv", encoding='unicode_escape', low_memory=False)
combined = pd.merge(points_2011, match_2011, on="match_id", how="inner")
for i in range(2012, 2022):
    points = pd.read_csv(f"../data/grand-slam-point-data/combined-points/{i}-combined-points.csv", encoding='unicode_escape', low_memory=False)
    match = pd.read_csv(f"../data/grand-slam-point-data/combined-matches/{i}-combined-matches.csv", encoding='unicode_escape', low_memory=False)
    cur_year = pd.merge(points, match, on="match_id", how="inner")
    combined = pd.concat([combined, cur_year], sort=True)

points = combined

P1BreakPoint == 1 if P1 is returning and is one point from winning the game ("breaking" the opponent's serve point)

In [143]:
points.columns

Index(['ElapsedTime', 'GameNo', 'GameWinner', 'History', 'P1Ace',
       'P1BreakPoint', 'P1BreakPointMissed', 'P1BreakPointWon',
       'P1DistanceRun', 'P1DoubleFault', 'P1FirstSrvIn', 'P1FirstSrvWon',
       'P1ForcedError', 'P1GamesWon', 'P1Momentum', 'P1NetPoint',
       'P1NetPointWon', 'P1PointsWon', 'P1Score', 'P1SecondSrvIn',
       'P1SecondSrvWon', 'P1TurningPoint', 'P1UnfErr', 'P1Winner', 'P2Ace',
       'P2BreakPoint', 'P2BreakPointMissed', 'P2BreakPointWon',
       'P2DistanceRun', 'P2DoubleFault', 'P2FirstSrvIn', 'P2FirstSrvWon',
       'P2ForcedError', 'P2GamesWon', 'P2Momentum', 'P2NetPoint',
       'P2NetPointWon', 'P2PointsWon', 'P2Score', 'P2SecondSrvIn',
       'P2SecondSrvWon', 'P2TurningPoint', 'P2UnfErr', 'P2Winner',
       'PointNumber', 'PointServer', 'PointWinner', 'Rally', 'RallyCount',
       'ReturnDepth', 'ServeDepth', 'ServeIndicator', 'ServeNumber',
       'ServeWidth', 'Serve_Direction', 'ServingTo', 'SetNo', 'SetWinner',
       'Speed_KMH', 'Speed_MPH

In [144]:
real_points = points[points['PointWinner'].ge(1)]

In [145]:
real_points[real_points["PointServer"].isna()]["match_id"].nunique()

0

In [146]:
real_points["PointServer"]

1        2.0
2        2.0
3        2.0
4        2.0
5        1.0
        ... 
87477    2.0
87478    2.0
87479    2.0
87480    2.0
87481    2.0
Name: PointServer, Length: 1017411, dtype: float64

In [147]:
# P1 has a breakpoint when P2 is serving, and P1 is up 0-40, 15-40, 30-40, or 40-AD
p1_breakpoint =\
  (real_points["PointServer"] == 2) &\
  (((real_points["P1Score"] == "40") &
    (real_points["P2Score"] != "40") &
    (real_points["P2Score"] != "AD")) |
  (real_points["P1Score"] == "AD"))
p2_breakpoint =\
  (real_points["PointServer"] == 1) &\
  (((real_points["P2Score"] == "40") &
    (real_points["P1Score"] != "40") &
    (real_points["P1Score"] != "AD")) |
  (real_points["P2Score"] == "AD"))
real_points.loc[:, "P1BreakPoint"] = p1_breakpoint.shift(1, fill_value=False) # P1Score is P1's score after the point is played
real_points.loc[:, "P2BreakPoint"] = p2_breakpoint.shift(1, fill_value=False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [148]:
p1_breakpoint.shift(1, fill_value=False)

1        False
2        False
3        False
4         True
5        False
         ...  
87477     True
87478    False
87479     True
87480    False
87481     True
Length: 1017411, dtype: bool

In [149]:
real_points[["P1Score", "P2Score", "PointServer", "PointWinner", "P1BreakPoint"]][:6]

Unnamed: 0,P1Score,P2Score,PointServer,PointWinner,P1BreakPoint
1,15,0,2.0,1.0,False
2,30,0,2.0,1.0,False
3,40,0,2.0,1.0,False
4,0,0,2.0,1.0,True
5,15,0,1.0,1.0,False
6,30,0,1.0,1.0,False


In [150]:
bp_groups = real_points.groupby(by=["player1_new", "player2_new", "P1BreakPoint", "P2BreakPoint", "PointWinner", "PointServer"])
counts_by_bp = bp_groups.size().reset_index(name="count")

In [151]:
unique_players = set(counts_by_bp["player1_new"].unique().tolist() + counts_by_bp["player2_new"].unique().tolist())
player_bps_dict = dict()
for p in unique_players:
  player_bps_dict[p] = {
    'noserve_nobreak_won': 0,
    'noserve_nobreak_faced': 0,
    'serve_nobreak_won': 0,
    'serve_nobreak_faced': 0,
    'noserve_break_won': 0,
    'noserve_break_faced': 0,
    'serve_break_won': 0,
    'serve_break_faced': 0,
  }

In [152]:
for row in counts_by_bp.to_dict("records"):
  if row["P1BreakPoint"]:
    if row["PointWinner"] == 1:
      player_bps_dict[row["player1_new"]]["noserve_break_won"] += row["count"]
      player_bps_dict[row["player1_new"]]["noserve_break_faced"] += row["count"]
      player_bps_dict[row["player2_new"]]["serve_break_faced"] += row["count"]
    else:
      player_bps_dict[row["player2_new"]]["serve_break_won"] += row["count"]
      player_bps_dict[row["player1_new"]]["noserve_break_faced"] += row["count"]
      player_bps_dict[row["player2_new"]]["serve_break_faced"] += row["count"]
  elif row["P2BreakPoint"]:
    if row["PointWinner"] == 1:
      player_bps_dict[row["player1_new"]]["serve_break_won"] += row["count"]
      player_bps_dict[row["player1_new"]]["serve_break_faced"] += row["count"]
      player_bps_dict[row["player2_new"]]["noserve_break_faced"] += row["count"]
    else:
      player_bps_dict[row["player2_new"]]["noserve_break_won"] += row["count"]
      player_bps_dict[row["player1_new"]]["serve_break_faced"] += row["count"]
      player_bps_dict[row["player2_new"]]["noserve_break_faced"] += row["count"]
  else:
    if row["PointWinner"] == 1:
      if row["PointServer"] == 1:
        player_bps_dict[row["player1_new"]]["serve_nobreak_won"] += row["count"]
        player_bps_dict[row["player1_new"]]["serve_nobreak_faced"] += row["count"]
        player_bps_dict[row["player2_new"]]["noserve_nobreak_faced"] += row["count"]
      else:
        player_bps_dict[row["player1_new"]]["noserve_nobreak_won"] += row["count"]
        player_bps_dict[row["player1_new"]]["noserve_nobreak_faced"] += row["count"]
        player_bps_dict[row["player2_new"]]["serve_nobreak_faced"] += row["count"]
    else:
      if row["PointServer"] == 1:
        player_bps_dict[row["player2_new"]]["noserve_nobreak_won"] += row["count"]
        player_bps_dict[row["player1_new"]]["serve_nobreak_faced"] += row["count"]
        player_bps_dict[row["player2_new"]]["noserve_nobreak_faced"] += row["count"]
      else:
        player_bps_dict[row["player2_new"]]["serve_nobreak_won"] += row["count"]
        player_bps_dict[row["player1_new"]]["noserve_nobreak_faced"] += row["count"]
        player_bps_dict[row["player2_new"]]["serve_nobreak_faced"] += row["count"]

In [153]:
player_bps = pd.DataFrame.from_dict(player_bps_dict, orient="index")

In [154]:
18991 / 85995

0.22083842083842084

In [155]:
player_bps.aggregate("sum")

noserve_nobreak_won      334318
noserve_nobreak_faced    931416
serve_nobreak_won        597098
serve_nobreak_faced      931416
noserve_break_won         34029
noserve_break_faced       85995
serve_break_won           51966
serve_break_faced         85995
dtype: int64

In [156]:
player_bps.sort_values("noserve_break_faced")

Unnamed: 0,noserve_nobreak_won,noserve_nobreak_faced,serve_nobreak_won,serve_nobreak_faced,noserve_break_won,noserve_break_faced,serve_break_won,serve_break_faced
C Niland,6,30,7,24,0,0,1,6
C Altamirano,15,62,30,62,0,0,5,12
B Mott,23,81,43,70,1,1,3,8
M Crugnola,18,64,35,70,1,1,4,11
C Hemery,16,65,25,56,0,1,5,11
...,...,...,...,...,...,...,...,...
K Nishikori,5210,13142,8392,13032,652,1507,677,1127
A Murray,6711,16068,10578,15950,871,1966,745,1203
R Federer,7720,19267,13106,18309,897,2189,644,963
R Nadal,7359,17174,11693,16995,1079,2308,819,1213


In [157]:
# laplace smoothing assuming 64% serve win pct over 100 ghost points. assume 60% for break points
player_bps["serve_nobreak_pct"] = (player_bps["serve_nobreak_won"] + 64) / (player_bps["serve_nobreak_faced"] + 100)
player_bps["serve_break_pct"] = (player_bps["serve_break_won"] + 60) / (player_bps["serve_break_faced"] + 100)
player_bps["noserve_nobreak_pct"] = (player_bps["noserve_nobreak_won"] + 36) / (player_bps["noserve_nobreak_faced"] + 100)
player_bps["noserve_break_pct"] = (player_bps["noserve_break_won"] + 40) / (player_bps["noserve_break_faced"] + 100)

player_bps["serve_choke_pct"] = player_bps["serve_nobreak_pct"] - player_bps["serve_break_pct"] - 0.02 # correcting for 4% difference in averages
player_bps["noserve_choke_pct"] = player_bps["noserve_nobreak_pct"] - player_bps["noserve_break_pct"] + 0.04
player_bps["choke_pct"] = (player_bps["serve_choke_pct"] + player_bps["noserve_choke_pct"]) / 2

In [158]:
player_bps.agg("mean")

noserve_nobreak_won       746.245536
noserve_nobreak_faced    2079.053571
serve_nobreak_won        1332.808036
serve_nobreak_faced      2079.053571
noserve_break_won          75.957589
noserve_break_faced       191.953125
serve_break_won           115.995536
serve_break_faced         191.953125
serve_nobreak_pct           0.616997
serve_break_pct             0.594404
noserve_nobreak_pct         0.342857
noserve_break_pct           0.392082
serve_choke_pct             0.002593
noserve_choke_pct          -0.009225
choke_pct                  -0.003316
dtype: float64

In [159]:
chokiest = player_bps[player_bps["noserve_break_faced"].ge(100)].sort_values("choke_pct")

In [160]:
chokiest.to_csv("../data/chokiest.csv")

In [161]:
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

In [162]:
px.scatter(data_frame=chokiest, x="noserve_choke_pct", y="serve_choke_pct", hover_name="name")

ValueError: Value of 'hover_name' is not the name of a column in 'data_frame'. Expected one of ['noserve_nobreak_won', 'noserve_nobreak_faced', 'serve_nobreak_won', 'serve_nobreak_faced', 'noserve_break_won', 'noserve_break_faced', 'serve_break_won', 'serve_break_faced', 'serve_nobreak_pct', 'serve_break_pct', 'noserve_nobreak_pct', 'noserve_break_pct', 'serve_choke_pct', 'noserve_choke_pct', 'choke_pct'] but received: name

In [163]:
df = pd.read_csv(f"../data/chokiest - Copy.csv")

In [164]:
df

Unnamed: 0,name,noserve_nobreak_won,noserve_nobreak_faced,serve_nobreak_won,serve_nobreak_faced,noserve_break_won,noserve_break_faced,serve_break_won,serve_break_faced,serve_nobreak_pct,serve_break_pct,noserve_nobreak_pct,noserve_break_pct,serve_choke_pct,noserve_choke_pct,choke_pct
0,D Brown,622,2052,1359,2176,45,110,129,204,0.625220,0.621711,0.305762,0.404762,-0.016491,-0.059000,-0.037745
1,M Jaziri,767,2273,1348,2346,65,171,205,326,0.577269,0.622066,0.338390,0.387454,-0.064797,-0.009064,-0.036930
2,D Koepfer,754,2043,1351,2230,86,194,164,255,0.607296,0.630986,0.368642,0.428571,-0.043690,-0.019929,-0.031810
3,M Giron,512,1564,999,1658,62,139,117,202,0.604664,0.586093,0.329327,0.426778,-0.001428,-0.057451,-0.029440
4,J Vesely,1462,4563,3058,4667,127,317,264,396,0.654919,0.653226,0.321252,0.400480,-0.018307,-0.039227,-0.028767
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,R Albot,1226,3279,1876,3132,118,328,179,346,0.600248,0.535874,0.373483,0.369159,0.044373,0.044324,0.044349
190,K Majchrzak,387,1151,681,1088,27,105,71,131,0.627104,0.567100,0.338129,0.326829,0.040005,0.051300,0.045653
191,I Ivashka,457,1221,781,1190,43,118,53,97,0.655039,0.573604,0.373202,0.380734,0.061435,0.032468,0.046951
192,M Fish,986,2519,1622,2386,98,265,98,157,0.678198,0.614786,0.390225,0.378082,0.043412,0.052143,0.047778


In [165]:
fig = px.scatter(data_frame=df, x="noserve_choke_pct", y="choke_pct", hover_name='name')
inds = [62, 77, 85, 149, 154, 156, 158, 168, 171, 177, 185]
fig.data[0].update(selectedpoints=inds,
                   selected=dict(marker=dict(color='red')),
                   unselected=dict(marker=dict(color='blue')))
fig.show()

In [166]:
fig = px.scatter(data_frame=df, x="serve_choke_pct", y="choke_pct", hover_name='name')
inds = [62, 77, 85, 149, 154, 156, 158, 168, 171, 177, 185]
fig.data[0].update(selectedpoints=inds,
                   selected=dict(marker=dict(color='red')),
                   unselected=dict(marker=dict(color='blue')))
fig.show()

In [167]:
[df.iloc[:, 0][i] for i in range(180, 190)]

['D Goffin',
 'D Ferrer',
 'F Auger Aliassime',
 'M Ebden',
 'S Korda',
 'J Martin Del Potro',
 'J Monaco',
 'M Fucsovics',
 'P Petzschner',
 'R Albot']

In [170]:
fig = px.scatter(data_frame=df, x="serve_choke_pct", y="noserve_choke_pct", hover_name='name')
inds = [62, 77, 85, 149, 154, 156, 158, 168, 171, 177, 185]
fig.data[0].update(selectedpoints=inds,
                   selected=dict(marker=dict(color='red')),
                   unselected=dict(marker=dict(color='blue')))
fig.show()
pio.write_image(fig, "../plots/choking.png", engine="auto")