# Tennis Point-by-Point Analysis: Serve & Return Stats

This notebook parses the point-by-point data to calculate serve and return winning percentages for players. These stats form the foundation for our match simulation model.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from collections import defaultdict
from dataclasses import dataclass
import warnings
warnings.filterwarnings('ignore')

## 1. Load the Data

In [2]:
data_dir = Path('../data/raw/tennis_pointbypoint')

# Load ATP main draw matches (archive + current)
atp_archive = pd.read_csv(data_dir / 'pbp_matches_atp_main_archive.csv')
atp_current = pd.read_csv(data_dir / 'pbp_matches_atp_main_current.csv')
atp = pd.concat([atp_archive, atp_current], ignore_index=True)

# Load WTA main draw matches
wta_archive = pd.read_csv(data_dir / 'pbp_matches_wta_main_archive.csv')
wta_current = pd.read_csv(data_dir / 'pbp_matches_wta_main_current.csv')
wta = pd.concat([wta_archive, wta_current], ignore_index=True)

print(f"ATP matches: {len(atp):,}")
print(f"WTA matches: {len(wta):,}")
print(f"Total: {len(atp) + len(wta):,}")

ATP matches: 13,050
WTA matches: 12,695
Total: 25,745


In [3]:
# Preview the data
atp.head(3)

Unnamed: 0,pbp_id,date,tny_name,tour,draw,server1,server2,winner,pbp,score,adf_flag,wh_minutes
0,2231275,28 Jul 11,ATPStudenaCroatiaOpen-ATPUmag2011,ATP,Main,Olivier Rochus,Fabio Fognini,2,SSSS;RRRR;SSRRSS;SSRRSS;RSRSRSRR;SSRSS;RSRRSR;...,6-4 6-1,0,66
1,2231276,28 Jul 11,ATPStudenaCroatiaOpen-ATPUmag2011,ATP,Main,Robin Haase,Marin Cilic,2,SSRSS;RRSSRSSS;SSSS;RSSSS;SRSRSS;RSRSRSSS;RSRS...,4-6 6-4 6-3,0,141
2,2236280,29 Jul 11,ATPStudenaCroatiaOpen-ATPUmag2011,ATP,Main,Marin Cilic,Andreas Seppi,1,SSSS;SRRRR;SSRRRSSS;RSRRSSSS;RSRSSS;SRRRR;SSRS...,6-1 6-3,0,71


## 2. Parse Point-by-Point Sequences

The `pbp` field encodes each point:
- `S` = server won the point
- `R` = returner won the point  
- `A` = ace (server won)
- `D` = double fault (returner won)
- `;` = game delimiter
- `.` = set delimiter
- `/` = tiebreak serve change

In [4]:
@dataclass
class MatchStats:
    """Stats extracted from a single match for both players."""
    server1: str
    server2: str
    s1_serve_points: int = 0
    s1_serve_won: int = 0
    s2_serve_points: int = 0
    s2_serve_won: int = 0
    s1_aces: int = 0
    s1_dfs: int = 0
    s2_aces: int = 0
    s2_dfs: int = 0


def parse_pbp(pbp: str, server1: str, server2: str) -> MatchStats:
    """
    Parse a point-by-point string and return serve/return stats.
    
    Server alternates each game. In tiebreaks, '/' indicates serve changes
    (every 2 points after the first).
    """
    stats = MatchStats(server1=server1, server2=server2)
    
    # server1 always serves first
    current_server = 1  # 1 or 2
    in_tiebreak = False
    tiebreak_points = 0
    
    i = 0
    while i < len(pbp):
        char = pbp[i]
        
        if char in 'SRAD':
            # It's a point
            server_won = char in 'SA'
            is_ace = char == 'A'
            is_df = char == 'D'
            
            if current_server == 1:
                stats.s1_serve_points += 1
                if server_won:
                    stats.s1_serve_won += 1
                if is_ace:
                    stats.s1_aces += 1
                if is_df:
                    stats.s1_dfs += 1
            else:
                stats.s2_serve_points += 1
                if server_won:
                    stats.s2_serve_won += 1
                if is_ace:
                    stats.s2_aces += 1
                if is_df:
                    stats.s2_dfs += 1
            
            if in_tiebreak:
                tiebreak_points += 1
        
        elif char == ';':
            # Game over - switch server (unless in tiebreak)
            if not in_tiebreak:
                current_server = 2 if current_server == 1 else 1
        
        elif char == '.':
            # Set over - switch server, reset tiebreak
            current_server = 2 if current_server == 1 else 1
            in_tiebreak = False
            tiebreak_points = 0
        
        elif char == '/':
            # Tiebreak serve change
            in_tiebreak = True
            current_server = 2 if current_server == 1 else 1
        
        i += 1
    
    return stats


# Test with a sample match
sample = atp.iloc[0]
stats = parse_pbp(sample['pbp'], sample['server1'], sample['server2'])
print(f"Match: {stats.server1} vs {stats.server2}")
print(f"{stats.server1}: {stats.s1_serve_won}/{stats.s1_serve_points} serve points won ({100*stats.s1_serve_won/stats.s1_serve_points:.1f}%)")
print(f"{stats.server2}: {stats.s2_serve_won}/{stats.s2_serve_points} serve points won ({100*stats.s2_serve_won/stats.s2_serve_points:.1f}%)")

Match: Olivier Rochus vs Fabio Fognini
Olivier Rochus: 27/58 serve points won (46.6%)
Fabio Fognini: 32/56 serve points won (57.1%)


## 3. Calculate Stats for All Matches

In [5]:
def process_matches(df: pd.DataFrame, tour_name: str) -> list[MatchStats]:
    """Process all matches in a dataframe."""
    all_stats = []
    errors = 0
    
    for _, row in df.iterrows():
        try:
            stats = parse_pbp(row['pbp'], row['server1'], row['server2'])
            all_stats.append(stats)
        except Exception as e:
            errors += 1
    
    print(f"{tour_name}: Processed {len(all_stats):,} matches ({errors} errors)")
    return all_stats

atp_stats = process_matches(atp, "ATP")
wta_stats = process_matches(wta, "WTA")

ATP: Processed 13,050 matches (0 errors)


WTA: Processed 12,695 matches (0 errors)


## 4. Aggregate Player-Level Statistics

In [6]:
def aggregate_player_stats(match_stats: list[MatchStats]) -> pd.DataFrame:
    """
    Aggregate match-level stats into player-level serve/return percentages.
    """
    player_data = defaultdict(lambda: {
        'serve_points': 0,
        'serve_won': 0,
        'return_points': 0,
        'return_won': 0,
        'aces': 0,
        'dfs': 0,
        'matches': 0
    })
    
    for stats in match_stats:
        # Player 1 (server1)
        p1 = player_data[stats.server1]
        p1['serve_points'] += stats.s1_serve_points
        p1['serve_won'] += stats.s1_serve_won
        p1['return_points'] += stats.s2_serve_points  # opponent's serve = their return
        p1['return_won'] += stats.s2_serve_points - stats.s2_serve_won
        p1['aces'] += stats.s1_aces
        p1['dfs'] += stats.s1_dfs
        p1['matches'] += 1
        
        # Player 2 (server2)
        p2 = player_data[stats.server2]
        p2['serve_points'] += stats.s2_serve_points
        p2['serve_won'] += stats.s2_serve_won
        p2['return_points'] += stats.s1_serve_points
        p2['return_won'] += stats.s1_serve_points - stats.s1_serve_won
        p2['aces'] += stats.s2_aces
        p2['dfs'] += stats.s2_dfs
        p2['matches'] += 1
    
    # Convert to DataFrame
    rows = []
    for player, data in player_data.items():
        serve_pct = data['serve_won'] / data['serve_points'] if data['serve_points'] > 0 else 0
        return_pct = data['return_won'] / data['return_points'] if data['return_points'] > 0 else 0
        ace_pct = data['aces'] / data['serve_points'] if data['serve_points'] > 0 else 0
        df_pct = data['dfs'] / data['serve_points'] if data['serve_points'] > 0 else 0
        
        rows.append({
            'player': player,
            'matches': data['matches'],
            'serve_points': data['serve_points'],
            'serve_won': data['serve_won'],
            'serve_pct': serve_pct,
            'return_points': data['return_points'],
            'return_won': data['return_won'],
            'return_pct': return_pct,
            'aces': data['aces'],
            'ace_pct': ace_pct,
            'dfs': data['dfs'],
            'df_pct': df_pct
        })
    
    return pd.DataFrame(rows)

atp_players = aggregate_player_stats(atp_stats)
wta_players = aggregate_player_stats(wta_stats)

print(f"ATP: {len(atp_players)} unique players")
print(f"WTA: {len(wta_players)} unique players")

ATP: 1049 unique players
WTA: 979 unique players


## 5. Explore ATP Player Stats

In [7]:
# Filter to players with at least 20 matches for reliable stats
atp_reliable = atp_players[atp_players['matches'] >= 20].copy()
print(f"ATP players with 20+ matches: {len(atp_reliable)}")

# Top 10 by serve percentage
print("\n=== Top 10 ATP Servers (by serve points won %) ===")
atp_reliable.nlargest(10, 'serve_pct')[['player', 'matches', 'serve_pct', 'ace_pct']].round(3)

ATP players with 20+ matches: 218

=== Top 10 ATP Servers (by serve points won %) ===


Unnamed: 0,player,matches,serve_pct,ace_pct
62,Ivo Karlovic,201,0.732,0.175
68,John Isner,291,0.717,0.131
175,Milos Raonic,275,0.709,0.108
91,Roger Federer,360,0.703,0.06
219,Samuel Groth,63,0.7,0.152
27,Gilles Muller,166,0.683,0.113
82,Jo-Wilfried Tsonga,282,0.682,0.062
92,Novak Djokovic,365,0.682,0.042
712,Nick Kyrgios,94,0.681,0.118
75,Kevin Anderson,278,0.68,0.1


In [8]:
# Top 10 by return percentage
print("=== Top 10 ATP Returners (by return points won %) ===")
atp_reliable.nlargest(10, 'return_pct')[['player', 'matches', 'return_pct', 'serve_pct']].round(3)

=== Top 10 ATP Returners (by return points won %) ===


Unnamed: 0,player,matches,return_pct,serve_pct
840,Diego Schwartzman,55,0.441,0.589
249,Matteo Viola,26,0.424,0.573
101,Rafael Nadal,340,0.421,0.67
92,Novak Djokovic,365,0.42,0.682
102,David Ferrer,354,0.418,0.639
87,Andy Murray,333,0.417,0.655
80,Flavio Cipolla,46,0.41,0.563
126,Alessandro Giannessi,22,0.409,0.583
77,Juan Monaco,184,0.406,0.608
770,Damir Dzumhur,57,0.399,0.596


In [9]:
# Distribution summary
print("=== ATP Serve/Return Distribution (20+ matches) ===")
atp_reliable[['serve_pct', 'return_pct', 'ace_pct', 'df_pct']].describe().round(3)

=== ATP Serve/Return Distribution (20+ matches) ===


Unnamed: 0,serve_pct,return_pct,ace_pct,df_pct
count,218.0,218.0,218.0,218.0
mean,0.628,0.362,0.048,0.025
std,0.028,0.024,0.027,0.01
min,0.563,0.275,0.0,0.0
25%,0.609,0.349,0.032,0.019
50%,0.627,0.363,0.044,0.026
75%,0.645,0.374,0.06,0.032
max,0.732,0.441,0.175,0.054


## 6. Explore WTA Player Stats

In [10]:
wta_reliable = wta_players[wta_players['matches'] >= 20].copy()
print(f"WTA players with 20+ matches: {len(wta_reliable)}")

print("\n=== Top 10 WTA Servers ===")
wta_reliable.nlargest(10, 'serve_pct')[['player', 'matches', 'serve_pct', 'ace_pct']].round(3)

WTA players with 20+ matches: 243

=== Top 10 WTA Servers ===


Unnamed: 0,player,matches,serve_pct,ace_pct
11,Serena Williams,271,0.658,0.07
189,Kim Clijsters,20,0.621,0.0
356,Naomi Broady,28,0.612,0.101
172,Karolina Pliskova,232,0.612,0.078
33,Coco Vandeweghe,142,0.61,0.073
38,Samantha Stosur,277,0.607,0.037
41,Lucie Safarova,250,0.607,0.044
97,Madison Keys,162,0.606,0.059
55,Maria Sharapova,230,0.604,0.031
71,Akgul Amanmuradova,30,0.604,0.019


In [11]:
print("=== Top 10 WTA Returners ===")
wta_reliable.nlargest(10, 'return_pct')[['player', 'matches', 'return_pct', 'serve_pct']].round(3)

=== Top 10 WTA Returners ===


Unnamed: 0,player,matches,return_pct,serve_pct
29,Sara Errani,296,0.496,0.536
1,Victoria Azarenka,212,0.489,0.589
11,Serena Williams,271,0.485,0.658
55,Maria Sharapova,230,0.483,0.604
40,Klara Zakopalova,110,0.482,0.524
3,Agnieszka Radwanska,339,0.479,0.587
271,Yvonne Meusburger,63,0.479,0.515
70,Na Li,150,0.478,0.587
35,Simona Halep,275,0.477,0.576
529,Jelena Ostapenko,60,0.475,0.54


In [12]:
print("=== WTA Serve/Return Distribution (20+ matches) ===")
wta_reliable[['serve_pct', 'return_pct', 'ace_pct', 'df_pct']].describe().round(3)

=== WTA Serve/Return Distribution (20+ matches) ===


Unnamed: 0,serve_pct,return_pct,ace_pct,df_pct
count,243.0,243.0,243.0,243.0
mean,0.555,0.436,0.024,0.037
std,0.025,0.021,0.016,0.016
min,0.492,0.343,0.0,0.0
25%,0.537,0.421,0.012,0.028
50%,0.555,0.434,0.019,0.036
75%,0.571,0.449,0.031,0.045
max,0.658,0.496,0.101,0.108


## 7. ATP vs WTA Comparison

In [13]:
print("=== Tour Comparison (players with 20+ matches) ===")
comparison = pd.DataFrame({
    'ATP': atp_reliable[['serve_pct', 'return_pct', 'ace_pct', 'df_pct']].mean(),
    'WTA': wta_reliable[['serve_pct', 'return_pct', 'ace_pct', 'df_pct']].mean()
}).round(3)
comparison

=== Tour Comparison (players with 20+ matches) ===


Unnamed: 0,ATP,WTA
serve_pct,0.628,0.555
return_pct,0.362,0.436
ace_pct,0.048,0.024
df_pct,0.025,0.037


## 8. Key Insight: Serve + Return = Dominance

A player's combined serve% + return% is a strong indicator of overall ability.

In [14]:
atp_reliable['combined'] = atp_reliable['serve_pct'] + atp_reliable['return_pct']

print("=== Top 15 ATP by Combined Serve + Return % ===")
atp_reliable.nlargest(15, 'combined')[['player', 'matches', 'serve_pct', 'return_pct', 'combined']].round(3)

=== Top 15 ATP by Combined Serve + Return % ===


Unnamed: 0,player,matches,serve_pct,return_pct,combined
92,Novak Djokovic,365,0.682,0.42,1.102
91,Roger Federer,360,0.703,0.391,1.094
101,Rafael Nadal,340,0.67,0.421,1.091
87,Andy Murray,333,0.655,0.417,1.072
102,David Ferrer,354,0.639,0.418,1.057
90,Tomas Berdych,365,0.669,0.379,1.048
266,Roberto Bautista Agut,67,0.647,0.398,1.045
81,Juan Martin del Potro,185,0.671,0.373,1.045
82,Jo-Wilfried Tsonga,282,0.682,0.361,1.043
97,Kei Nishikori,268,0.646,0.396,1.042


In [15]:
wta_reliable['combined'] = wta_reliable['serve_pct'] + wta_reliable['return_pct']

print("=== Top 15 WTA by Combined Serve + Return % ===")
wta_reliable.nlargest(15, 'combined')[['player', 'matches', 'serve_pct', 'return_pct', 'combined']].round(3)

=== Top 15 WTA by Combined Serve + Return % ===


Unnamed: 0,player,matches,serve_pct,return_pct,combined
11,Serena Williams,271,0.658,0.485,1.143
189,Kim Clijsters,20,0.621,0.466,1.087
55,Maria Sharapova,230,0.604,0.483,1.087
1,Victoria Azarenka,212,0.589,0.489,1.079
3,Agnieszka Radwanska,339,0.587,0.479,1.066
70,Na Li,150,0.587,0.478,1.064
30,Ana Ivanovic,238,0.59,0.466,1.056
53,Caroline Wozniacki,324,0.587,0.469,1.055
35,Simona Halep,275,0.576,0.477,1.053
102,Venus Williams,193,0.591,0.457,1.048


## 9. Save Processed Stats

In [16]:
# Save to processed data folder
output_dir = Path('../data/processed')
output_dir.mkdir(exist_ok=True)

atp_players.to_csv(output_dir / 'atp_player_stats.csv', index=False)
wta_players.to_csv(output_dir / 'wta_player_stats.csv', index=False)

print(f"Saved ATP stats: {len(atp_players)} players")
print(f"Saved WTA stats: {len(wta_players)} players")

Saved ATP stats: 1049 players
Saved WTA stats: 979 players


## Next Steps

With these serve/return percentages, we can now:

1. **Build a point probability model**: Given player A serving vs player B, estimate P(A wins point)
2. **Create a match simulator**: Use the point model to simulate matches point-by-point
3. **Add surface adjustments**: Calculate surface-specific serve/return stats
4. **Compare to betting lines**: Convert model probabilities to implied odds