# Serve & Height

This notebook analyzes the relationship between player height and serve effectiveness in professional tennis. Using historical match data, we investigate:

- How does player height affect ace percentage?
- Does height influence 1st and 2nd serve win percentages?
- Do left-handed players have a serve advantage after controlling for height?

The analysis requires `tennis_db.sqlite` in the `examples/` folder. If not already present, download it from the [Kaggle Tennis Dataset](https://www.kaggle.com/datasets/guillemservera/tennis).

#### Reference
This notebook accompanies the blog post: https://medium.com/p/281b663d5cee

In [None]:
import numpy  as np
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
# LOAD DATA FROM SQLITE INTO PANDAS
# dataset downloaded from https://www.kaggle.com/datasets/guillemservera/tennis

print("Loading data from the database...")
file_path = "tennis_db.sqlite"

# A plain sqlite3.connect(path) silently creates an empty database when the file
# is missing, which only surfaces later as a confusing "no such table" error.
# Opening through a read-only URI makes a missing file fail right here instead,
# and guarantees the analysis can never modify the dataset.
conn = sqlite3.connect(f"file:{file_path}?mode=ro", uri=True)
try:
    # load data into Pandas
    df_players = pd.read_sql_query("SELECT * FROM players", conn)   # data about players
    df_matches = pd.read_sql_query("SELECT * FROM matches", conn)   # match stats
finally:
    # everything is in memory now; release the file handle even if a query failed
    conn.close()

# filter data:
# - keep only the players with a known height and a known playing hand ('L' or 'R')
# - keep only the matches where the winner's serve-point and ace counts are present
has_height     = df_players["height"].notnull()
has_known_hand = df_players["hand"].isin(['L', 'R'])
df_playersOK   = df_players[has_height & has_known_hand]

has_serve_stats = df_matches["w_svpt"].notnull() & df_matches["w_ace"].notnull()
df_matchesOK    = df_matches[has_serve_stats]

print(f"Loaded   data about {len(df_players)} players and {len(df_matches)} matches.")
print(f"Filtered data about  {len(df_playersOK)} players and {len(df_matchesOK)} matches.")

In [None]:
# CALCULATE STATS
# Build one row of serve statistics per player. All reported percentages are
# conditioned on the player having WON the match (only the winner-side "w_*"
# counters are used); the loser-side serve points are used solely for the
# minimum-data filter below.

# minimum number of serve points a player must have played BOTH in matches won
# and in matches lost; players with less data on either side are dropped
NUM_SERVES_MIN = 500

# Sum the serve counters once per player with a group-by, instead of re-scanning
# the whole match table for every single player.
# (sum() skips missing values, and rows whose winner/loser id is missing are ignored)
WON_COLS = ["w_svpt", "w_ace", "w_1stIn", "w_1stWon", "w_2ndWon"]
totals_won  = df_matchesOK.groupby("winner_id")[WON_COLS].sum()    # counters over matches won
totals_lost = df_matchesOK.groupby("loser_id")["l_svpt"].sum()     # serve points over matches lost

# Attach the totals to every player. A left join keeps the players in their
# original order; a player without any match on one side gets 0, which is what
# summing an empty selection would give.
player_totals = (
    df_playersOK[["player_id", "height", "hand"]]
    .join(totals_won, on="player_id")
    .join(totals_lost, on="player_id")
    .fillna({col: 0 for col in WON_COLS + ["l_svpt"]})
)

# filter out players for whom we don't have sufficient data
enough_data = (player_totals["w_svpt"] >= NUM_SERVES_MIN) & \
              (player_totals["l_svpt"] >= NUM_SERVES_MIN)
qualified = player_totals[enough_data]

svpt     = qualified["w_svpt"]       # serve points played
aces     = qualified["w_ace"]        # aces
frst_in  = qualified["w_1stIn"]      # 1st serves in
frst_won = qualified["w_1stWon"]     # points won on 1st serve
scnd_won = qualified["w_2ndWon"]     # points won on 2nd serve

# load the stats in a dataframe and sort by 'height'
df_stats = (
    pd.DataFrame({
        "player_id"   : qualified["player_id"],
        "ace_pct"     : aces / svpt,
        "frst_in_pct" : frst_in / svpt,
        # 1st serve win rate excluding aces: aces are removed from both the points
        # won and the serves in (this treats every ace as a 1st-serve ace)
        "frst_won_pct": (frst_won - aces) / (frst_in - aces),
        # every serve point that did not start with a 1st serve in is a 2nd serve point
        "scnd_won_pct": scnd_won / (svpt - frst_in),
        "height"      : qualified["height"],
        "hand"        : qualified["hand"],
    })
    .reset_index(drop=True)
    .sort_values(by="height", ascending=True)
)

num_players = len(df_stats)
print(f"Calculated stats for {num_players} players.")

In [None]:
# PLOT ACE PERCENTAGE AS A FUNCTION OF PLAYER HEIGHT
# scatter of every player's ace rate, overlaid with a cubic polynomial fit

heights   = df_stats["height"]
ace_rates = df_stats["ace_pct"]

# degree-3 least-squares polynomial, evaluated at the observed heights
# (df_stats is sorted by height, so the curve is drawn left to right)
coeffs    = np.polyfit(heights, ace_rates, 3)
ace_curve = np.poly1d(coeffs)(heights)

fig, ax = plt.subplots()
ax.scatter(heights, ace_rates, color='grey', alpha=0.15)
ax.plot(heights, ace_curve, color='black', label='cubic fit', linewidth=1)
ax.set_xlabel("player height (cm)")
ax.set_ylabel("average ace percentage")
ax.set_title("Dependency of Ace Rate on Player Height")
ax.grid(linewidth=0.2)
ax.legend()
plt.show()

In [None]:
# PLOT 1st SERVE WIN PERCENTAGE AS A FUNCTION OF PLAYER HEIGHT
# scatter of every player's 1st serve win rate, overlaid with a straight-line fit

heights       = df_stats["height"]
frst_won_rate = df_stats["frst_won_pct"]

# degree-1 least-squares fit, evaluated at the observed heights
coeffs    = np.polyfit(heights, frst_won_rate, 1)
frst_line = np.poly1d(coeffs)(heights)

fig, ax = plt.subplots()
ax.scatter(heights, frst_won_rate, color='forestgreen', alpha=0.15)
ax.plot(heights, frst_line, color='green', label='linear fit', linewidth=1)
ax.set_xlabel("player height (cm)")
ax.set_ylabel("average 1st serve win percentage")
ax.set_title("Dependency of 1st Serve Win Percentage on Player Height")
ax.grid(linewidth=0.2)
ax.legend()
plt.show()

In [None]:
# PLOT 2nd SERVE WIN PERCENTAGE AS A FUNCTION OF PLAYER HEIGHT
# scatter of every player's 2nd serve win rate, overlaid with a straight-line fit

heights       = df_stats["height"]
scnd_won_rate = df_stats["scnd_won_pct"]

# degree-1 least-squares fit, evaluated at the observed heights
coeffs    = np.polyfit(heights, scnd_won_rate, 1)
scnd_line = np.poly1d(coeffs)(heights)

fig, ax = plt.subplots()
ax.scatter(heights, scnd_won_rate, color='skyblue', alpha=0.15)
ax.plot(heights, scnd_line, color='darkblue', label='linear fit', linewidth=1)
ax.set_xlabel("player height (cm)")
ax.set_ylabel("average 2nd serve win percentage")
ax.set_title("Dependency of 2nd Serve Win Percentage on Player Height")
ax.grid(linewidth=0.2)
ax.legend()
plt.show()

In [None]:
# HANDEDNESS AS AN ACE ADVANTAGE
# study whether being left handed gives one an ace advantage

# extract the residuals, after controlling 'ace percentage' for height
coeffs_ace = np.polyfit(df_stats["height"], df_stats["ace_pct"], 3)           # same cubic fit as in the ace-rate plot
resd_ace   = df_stats["ace_pct"] - np.poly1d(coeffs_ace)(df_stats['height'])  # regression residuals
df_stats   = df_stats.assign(residuals_ace=resd_ace)

# plot residuals
df_statsL = df_stats[df_stats["hand"] == 'L']
df_statsR = df_stats[df_stats["hand"] == 'R']
plt.scatter(df_statsL["height"], df_statsL["residuals_ace"], color='red' , alpha=0.7, label="lefties")  # show left hand players in red
plt.scatter(df_statsR["height"], df_statsR["residuals_ace"], color='grey', alpha=0.2, label="righties")
plt.grid(linewidth=0.2)
plt.xlabel("player height (cm)")
plt.ylabel("residual average ace percentage")
plt.title("Residual Ace Rate after Controlling for Height")

# fit a line through the residuals (degree 1, hence a *linear* fit)
coeffs_resd_ace = np.polyfit(df_stats["height"], resd_ace, 1)
plt.plot(df_stats['height'], np.poly1d(coeffs_resd_ace)(df_stats['height']), color='black', label='linear fit', linewidth=1)

# build the legend only after everything is drawn, so the fit line gets an entry too
plt.legend()
plt.show()

# Calculate sample means and run a t-test to decide whether they are different
# (inner quotes are single quotes: reusing the outer quote type inside an
#  f-string is a SyntaxError before Python 3.12)
print(f"lefties  mean: {df_statsL['residuals_ace'].mean():+.4f}")
print(f"righties mean: {df_statsR['residuals_ace'].mean():+.4f}")

# NOTE(review): ttest_ind defaults to equal_var=True (Student's t-test); if the two
# groups differ a lot in size or variance, Welch's test (equal_var=False) may be the
# safer choice -- left unchanged here so the reported numbers stay the same
t_stat, p_val = stats.ttest_ind(df_statsL["residuals_ace"], df_statsR["residuals_ace"])
print(f"t-statistics:   {t_stat:.2f}")
print(f"p-value:        {p_val:.2f}")

In [None]:
# HANDEDNESS AS A 1st SERVE ADVANTAGE
# study whether being left handed gives one a 1st serve advantage

# extract the residuals, after controlling '1st serve win percentage' for height
coeffs_1st = np.polyfit(df_stats["height"], df_stats["frst_won_pct"], 1)           # same linear fit as in the 1st-serve plot
resd_1st   = df_stats["frst_won_pct"] - np.poly1d(coeffs_1st)(df_stats['height'])  # regression residuals
df_stats   = df_stats.assign(residuals_1st=resd_1st)

# plot residuals
df_statsL = df_stats[df_stats["hand"] == 'L']
df_statsR = df_stats[df_stats["hand"] == 'R']
plt.scatter(df_statsL["height"], df_statsL["residuals_1st"], color='red' , alpha=0.7, label="lefties")  # show left hand players in red
plt.scatter(df_statsR["height"], df_statsR["residuals_1st"], color='grey', alpha=0.2, label="righties")
plt.grid(linewidth=0.2)
plt.xlabel("player height (cm)")
plt.ylabel("residual 1st serve win percentage")
plt.title("Residual 1st Serve Win Percentage after Controlling for Height")

# fit a line through the residuals (degree 1, hence a *linear* fit)
coeffs_resd_1st = np.polyfit(df_stats["height"], resd_1st, 1)
plt.plot(df_stats['height'], np.poly1d(coeffs_resd_1st)(df_stats['height']), color='black', label='linear fit', linewidth=1)

# build the legend only after everything is drawn, so the fit line gets an entry too
plt.legend()
plt.show()

# Calculate sample means and run a t-test to decide whether they are different
# (inner quotes are single quotes: reusing the outer quote type inside an
#  f-string is a SyntaxError before Python 3.12)
print(f"lefties  mean: {df_statsL['residuals_1st'].mean():+.4f}")
print(f"righties mean: {df_statsR['residuals_1st'].mean():+.4f}")

# NOTE(review): ttest_ind defaults to equal_var=True (Student's t-test); if the two
# groups differ a lot in size or variance, Welch's test (equal_var=False) may be the
# safer choice -- left unchanged here so the reported numbers stay the same
t_stat, p_val = stats.ttest_ind(df_statsL["residuals_1st"], df_statsR["residuals_1st"])
print(f"t-statistics:   {t_stat:.2f}")
print(f"p-value:        {p_val:.2f}")

In [None]:
# HANDEDNESS AS A 2nd SERVE ADVANTAGE
# study whether being left handed gives one a 2nd serve advantage

# extract the residuals, after controlling '2nd serve win percentage' for height
coeffs_2nd = np.polyfit(df_stats["height"], df_stats["scnd_won_pct"], 1)           # same linear fit as in the 2nd-serve plot
resd_2nd   = df_stats["scnd_won_pct"] - np.poly1d(coeffs_2nd)(df_stats['height'])  # regression residuals
df_stats   = df_stats.assign(residuals_2nd=resd_2nd)

# plot residuals
df_statsL = df_stats[df_stats["hand"] == 'L']
df_statsR = df_stats[df_stats["hand"] == 'R']
plt.scatter(df_statsL["height"], df_statsL["residuals_2nd"], color='red' , alpha=0.7, label="lefties")  # show left hand players in red
plt.scatter(df_statsR["height"], df_statsR["residuals_2nd"], color='grey', alpha=0.2, label="righties")
plt.grid(linewidth=0.2)
plt.xlabel("player height (cm)")
plt.ylabel("residual 2nd serve win percentage")
plt.title("Residual 2nd Serve Win Percentage after Controlling for Height")

# fit a line through the residuals (degree 1, hence a *linear* fit)
coeffs_resd_2nd = np.polyfit(df_stats["height"], resd_2nd, 1)
plt.plot(df_stats['height'], np.poly1d(coeffs_resd_2nd)(df_stats['height']), color='black', label='linear fit', linewidth=1)

# build the legend only after everything is drawn, so the fit line gets an entry too
plt.legend()
plt.show()

# Calculate sample means and run a t-test to decide whether they are different
# (inner quotes are single quotes: reusing the outer quote type inside an
#  f-string is a SyntaxError before Python 3.12)
print(f"lefties  mean: {df_statsL['residuals_2nd'].mean():+.4f}")
print(f"righties mean: {df_statsR['residuals_2nd'].mean():+.4f}")

# NOTE(review): ttest_ind defaults to equal_var=True (Student's t-test); if the two
# groups differ a lot in size or variance, Welch's test (equal_var=False) may be the
# safer choice -- left unchanged here so the reported numbers stay the same
t_stat, p_val = stats.ttest_ind(df_statsL["residuals_2nd"], df_statsR["residuals_2nd"])
print(f"t-statistics:   {t_stat:.2f}")
print(f"p-value:        {p_val:.2f}")

In [None]:
# COMBINE ALL DATA
# instead of running 3 separate t-tests (on ace rate, 1st serve win, 2nd serve win), combine *all*
# serve statistics in one dataset and test whether the averages are different for lefties vs righties
#
# NOTE(review): every player contributes three values to the pooled sample (one per
# indicator), so the pooled observations are not independent of each other; read the
# resulting p-value with that in mind.

# the three residual columns that were added to df_stats by the handedness cells
RESIDUAL_COLS = ['residuals_ace', 'residuals_1st', 'residuals_2nd']

# re-derive the two groups from df_stats rather than reusing df_statsL / df_statsR
# left over from another cell: this guarantees all three residual columns are present
df_statsL = df_stats[df_stats["hand"] == 'L']
df_statsR = df_stats[df_stats["hand"] == 'R']

# stack the three residual columns on top of each other under one common name
df_statsL_all = pd.concat(
    [df_statsL[['hand', col]].rename(columns={col: 'residuals'}) for col in RESIDUAL_COLS],
    axis=0,
)
df_statsR_all = pd.concat(
    [df_statsR[['hand', col]].rename(columns={col: 'residuals'}) for col in RESIDUAL_COLS],
    axis=0,
)

# Calculate sample means and run a t-test to decide whether they are different
# (inner quotes are single quotes: reusing the outer quote type inside an
#  f-string is a SyntaxError before Python 3.12)
print(f"lefties  mean: {df_statsL_all['residuals'].mean():+.4f}")
print(f"righties mean: {df_statsR_all['residuals'].mean():+.4f}")

t_stat, p_val = stats.ttest_ind(df_statsL_all["residuals"], df_statsR_all["residuals"])
print(f"t-statistics:   {t_stat:.2f}")
print(f"p-value:        {p_val:.2f}")

In [None]:
# THIS IS A REPEAT OF THE THREE RESIDUAL PLOTS ABOVE
# we plot them all together in one figure for the purpose of publishing

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(10, 10))

df_statsL = df_stats[df_stats["hand"] == 'L']
df_statsR = df_stats[df_stats["hand"] == 'R']

# one entry per panel: (axes, residual column of df_stats, title, y-axis label)
# the right-hand panel carries no y label of its own (None)
panels = [
    (ax1, "residuals_ace", "Residual Ace Rate",                 "residual percentage"),
    (ax2, "residuals_1st", "Residual 1st Serve Win Percentage", None),
    (ax3, "residuals_2nd", "Residual 2nd Serve Win Percentage", "residual percentage"),
]

for ax, column, title, ylabel in panels:
    # plot residuals
    ax.scatter(df_statsL["height"], df_statsL[column], color='red' , alpha=0.7, label="lefties")  # show left hand players in red
    ax.scatter(df_statsR["height"], df_statsR[column], color='grey', alpha=0.2, label="righties")

    # fit a line through the residuals (degree 1, hence a *linear* fit); the residuals
    # are read from df_stats so the cell does not depend on loose variables of other cells
    coeffs_resd = np.polyfit(df_stats["height"], df_stats[column], 1)
    ax.plot(df_stats['height'], np.poly1d(coeffs_resd)(df_stats['height']), color='black', label='linear fit', linewidth=1)

    ax.grid(linewidth=0.2)
    ax.set_xlabel("player height (cm)")
    if ylabel is not None:
        ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()    # built after the fit line is drawn, so the line gets an entry too

# only three of the four grid slots are used: hide the empty one
ax4.set_visible(False)

plt.show()

In [None]:
# Summary table for publishing.
# The figures below were copied by hand from the printed output of the t-test
# cells; they are NOT recomputed here and must be updated manually whenever
# the data or the analysis changes.

summary_columns = ["serve quality indicator", "lefties avg", "righties avg", "t-statistics", "p-value"]

# one tuple per serve quality indicator, in the column order given above
summary_rows = [
    ("ace rate",        0.0015, -0.0002, 0.72, 0.47),
    ("1st serve won %", 0.0011, -0.0001, 0.67, 0.50),
    ("2nd serve won %", 0.0026, -0.0004, 1.60, 0.11),
    ("all data",        0.0017, -0.0003, 1.68, 0.09),
]

# pivot the rows into a column-name -> list-of-values mapping
d = {
    column: [row[position] for row in summary_rows]
    for position, column in enumerate(summary_columns)
}
pd.DataFrame(d)