# Tennis Game Style Trends: Year-over-Year Analysis

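Analyzing whether serve/return dynamics have shifted over time, using ATP and WTA main-draw point-by-point data.

**Caveats:** the match counts below contain no 2016 matches, and 2011 has far fewer matches than later years. Ace and double-fault rates start near zero in 2011 and climb steeply over the next few years, which may reflect how points were coded in the source data rather than a real change in play. Treat the ace/DF trends with caution until that is confirmed.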

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from collections import defaultdict
from dataclasses import dataclass
import warnings
warnings.filterwarnings('ignore')

## 1. Load and Parse Data

In [2]:
data_dir = Path('../data/raw/tennis_pointbypoint')

# Each tour's main-draw matches are split across an archive file and a
# current file; read both and stack them into a single frame per tour.
atp_files = ['pbp_matches_atp_main_archive.csv', 'pbp_matches_atp_main_current.csv']
wta_files = ['pbp_matches_wta_main_archive.csv', 'pbp_matches_wta_main_current.csv']

atp = pd.concat(
    [pd.read_csv(data_dir / file_name) for file_name in atp_files],
    ignore_index=True,
)
wta = pd.concat(
    [pd.read_csv(data_dir / file_name) for file_name in wta_files],
    ignore_index=True,
)

# Parse year from date (format: "28 Jul 11" -> 2011)
def parse_year(date_str):
    """Extract the calendar year from a match date such as "28 Jul 11".

    The last whitespace-separated token is read as the year. Two-digit
    values below 50 map to 20xx and the remaining two-digit values to 19xx.
    A value of 100 or more is taken to already be a full year and is
    returned unchanged.

    Returns None when the input is not a string (for example a missing
    value), is empty, or does not end in an integer.
    """
    try:
        year = int(date_str.split()[-1])
    except (AttributeError, IndexError, ValueError):
        # Swallow only the expected parse failures: no .split() on
        # non-strings, no tokens in an empty string, non-numeric last token.
        return None

    if year >= 100:
        # Already a four-digit year; applying the century pivot would
        # produce nonsense such as 3911.
        return year
    return 2000 + year if year < 50 else 1900 + year

# Derive a 'year' column on both tours from the textual match date.
atp['year'], wta['year'] = (
    tour_matches['date'].apply(parse_year) for tour_matches in (atp, wta)
)

# Report the span of years per tour, then the ATP match count for each year.
atp_years = atp['year']
wta_years = wta['year']
print("ATP year range:", atp_years.min(), "-", atp_years.max())
print("WTA year range:", wta_years.min(), "-", wta_years.max())
print("\nMatches per year (ATP):")
print(atp_years.value_counts().sort_index())

ATP year range: 2011 - 2017
WTA year range: 2011 - 2017

Matches per year (ATP):
year
2011     659
2012    2611
2013    2633
2014    2429
2015    2570
2017    2148
Name: count, dtype: int64


In [3]:
@dataclass
class MatchStats:
    """Aggregate point counters for one match; every counter defaults to zero."""
    total_points: int = 0  # all points counted in the match
    serve_won: int = 0     # points won by the server (aces included by the parser in this file)
    aces: int = 0          # points recorded as aces
    dfs: int = 0           # points recorded as double faults

def parse_pbp_aggregate(pbp: str) -> MatchStats:
    """
    Count point outcomes in a point-by-point string, ignoring who served.

    Only the characters 'S', 'R', 'A' and 'D' are counted as points; every
    other character is ignored (presumably game/set/tiebreak separators —
    confirm against the dataset documentation). 'S' and 'A' are tallied as
    points won by the server, 'A' additionally as an ace and 'D' as a
    double fault.

    A non-string value (for example the NaN pandas yields for an empty CSV
    cell) produces an all-zero MatchStats instead of raising TypeError.
    """
    if not isinstance(pbp, str):
        return MatchStats()

    # str.count scans the string in C for each code, replacing the
    # per-character Python loop with the same totals.
    aces = pbp.count('A')
    dfs = pbp.count('D')
    serve_won = pbp.count('S') + aces  # an ace is also a point won on serve

    return MatchStats(
        total_points=serve_won + pbp.count('R') + dfs,
        serve_won=serve_won,
        aces=aces,
        dfs=dfs,
    )

# Smoke test: parse the point-by-point string of the first ATP match
test_stats = parse_pbp_aggregate(atp['pbp'].iloc[0])
print(f"Test match: {test_stats.total_points} points, {test_stats.serve_won} serve wins, {test_stats.aces} aces, {test_stats.dfs} DFs")

Test match: 114 points, 59 serve wins, 0 aces, 0 DFs


## 2. Calculate Year-by-Year Stats

In [4]:
def calculate_yearly_stats(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate aggregate serve/return stats by year.

    Parameters
    ----------
    df : DataFrame with a 'year' column and a 'pbp' column holding the
        point-by-point string of each match.

    Returns
    -------
    DataFrame with one row per year, sorted ascending by year, with columns
    'year', 'matches', 'total_points', 'serve_pct', 'return_pct', 'ace_pct',
    'df_pct' and 'points_per_match'. The four '*_pct' columns are fractions
    of all counted points (0-1, not percentages). These columns are present
    even when no row of ``df`` qualifies, so callers can always filter on
    them.

    Rows with a missing year or a non-string 'pbp' are skipped and do not
    count as matches.
    """
    output_columns = [
        'year', 'matches', 'total_points', 'serve_pct', 'return_pct',
        'ace_pct', 'df_pct', 'points_per_match',
    ]
    counter_names = ('total_points', 'serve_won', 'aces', 'dfs')
    yearly_data = defaultdict(
        lambda: dict.fromkeys(('matches',) + counter_names, 0)
    )

    # Only two columns are needed, so iterate over them directly instead of
    # paying for a full Series per row with iterrows().
    for year, pbp in zip(df['year'], df['pbp']):
        if pd.isna(year) or not isinstance(pbp, str):
            continue

        stats = parse_pbp_aggregate(pbp)
        totals = yearly_data[int(year)]
        totals['matches'] += 1
        for name in counter_names:
            totals[name] += getattr(stats, name)

    rows = []
    for year, data in sorted(yearly_data.items()):
        points = data['total_points']
        if points > 0:
            serve_pct = data['serve_won'] / points
            return_pct = 1 - serve_pct  # return won = points not won by server
            ace_pct = data['aces'] / points
            df_pct = data['dfs'] / points
        else:
            # No counted points: report 0 for every ratio, rather than
            # letting return_pct come out as 1 - 0 = 100%.
            serve_pct = return_pct = ace_pct = df_pct = 0

        rows.append({
            'year': year,
            'matches': data['matches'],
            'total_points': points,
            'serve_pct': serve_pct,
            'return_pct': return_pct,
            'ace_pct': ace_pct,
            'df_pct': df_pct,
            # A year only has an entry once a match was counted, so
            # 'matches' is at least 1 here.
            'points_per_match': points / data['matches'],
        })

    return pd.DataFrame(rows, columns=output_columns)

# Aggregate each tour's matches into one row of statistics per year.
atp_yearly, wta_yearly = map(calculate_yearly_stats, (atp, wta))

print("ATP yearly stats calculated")
print("WTA yearly stats calculated")

ATP yearly stats calculated
WTA yearly stats calculated


## 3. ATP Year-over-Year Trends

In [5]:
print("=== ATP Year-by-Year Stats ===")
print()

# Work on a copy so atp_yearly keeps its unrounded fractions for later cells.
atp_display = atp_yearly.copy()

# Fractions -> percentages with two decimals; points per match to one decimal.
percent_cols = ['serve_pct', 'return_pct', 'ace_pct', 'df_pct']
atp_display[percent_cols] = atp_display[percent_cols].mul(100).round(2)
atp_display['points_per_match'] = atp_display['points_per_match'].round(1)

atp_display[['year', 'matches', *percent_cols, 'points_per_match']]

=== ATP Year-by-Year Stats ===



Unnamed: 0,year,matches,serve_pct,return_pct,ace_pct,df_pct,points_per_match
0,2011,659,62.74,37.26,0.01,0.0,147.3
1,2012,2611,63.12,36.88,1.57,0.9,154.5
2,2013,2633,63.36,36.64,4.69,2.4,159.8
3,2014,2429,63.84,36.16,6.28,2.98,161.8
4,2015,2570,64.15,35.85,7.45,3.24,163.0
5,2017,2148,63.56,36.44,7.11,3.68,164.3


In [6]:
# Compare the earliest and latest "full" years, where a year counts as full
# when it has at least 500 matches.
# NOTE(review): the ATP table above lists 2011 with 659 matches (later years
# have 2,100+) and an ace rate of 0.01%, so 2011 passes this threshold yet
# looks like a partial / differently coded year — confirm it is a sound baseline.
atp_full_years = atp_yearly.loc[atp_yearly['matches'] >= 500]
if len(atp_full_years) >= 2:
    first_year, last_year = atp_full_years.iloc[0], atp_full_years.iloc[-1]

    # Rates are stored as fractions, hence the *100; the bracketed deltas are
    # differences in percentage points.
    print(f"=== ATP Change: {int(first_year['year'])} → {int(last_year['year'])} ===")
    print(f"Serve %:   {first_year['serve_pct']*100:.2f}% → {last_year['serve_pct']*100:.2f}% ({(last_year['serve_pct'] - first_year['serve_pct'])*100:+.2f}%)")
    print(f"Return %:  {first_year['return_pct']*100:.2f}% → {last_year['return_pct']*100:.2f}% ({(last_year['return_pct'] - first_year['return_pct'])*100:+.2f}%)")
    print(f"Ace %:     {first_year['ace_pct']*100:.2f}% → {last_year['ace_pct']*100:.2f}% ({(last_year['ace_pct'] - first_year['ace_pct'])*100:+.2f}%)")
    print(f"DF %:      {first_year['df_pct']*100:.2f}% → {last_year['df_pct']*100:.2f}% ({(last_year['df_pct'] - first_year['df_pct'])*100:+.2f}%)")
    print(f"Pts/Match: {first_year['points_per_match']:.1f} → {last_year['points_per_match']:.1f} ({last_year['points_per_match'] - first_year['points_per_match']:+.1f})")

=== ATP Change: 2011 → 2017 ===
Serve %:   62.74% → 63.56% (+0.82%)
Return %:  37.26% → 36.44% (-0.82%)
Ace %:     0.01% → 7.11% (+7.10%)
DF %:      0.00% → 3.68% (+3.68%)
Pts/Match: 147.3 → 164.3 (+17.1)


## 4. WTA Year-over-Year Trends

In [7]:
print("=== WTA Year-by-Year Stats ===")
print()

# Work on a copy so wta_yearly keeps its unrounded fractions for later cells.
wta_display = wta_yearly.copy()

# Fractions -> percentages with two decimals; points per match to one decimal.
percent_cols = ['serve_pct', 'return_pct', 'ace_pct', 'df_pct']
wta_display[percent_cols] = wta_display[percent_cols].mul(100).round(2)
wta_display['points_per_match'] = wta_display['points_per_match'].round(1)

wta_display[['year', 'matches', *percent_cols, 'points_per_match']]

=== WTA Year-by-Year Stats ===



Unnamed: 0,year,matches,serve_pct,return_pct,ace_pct,df_pct,points_per_match
0,2011,456,56.17,43.83,0.0,0.0,133.1
1,2012,2475,55.78,44.22,0.83,1.46,138.6
2,2013,2616,55.66,44.34,2.3,3.75,141.2
3,2014,2499,56.06,43.94,3.04,4.36,143.0
4,2015,2588,56.24,43.76,3.48,4.65,144.4
5,2017,2061,56.53,43.47,3.43,4.63,144.4


In [8]:
# Compare the earliest and latest "full" years, where a year counts as full
# when it has at least 500 matches.
# NOTE(review): the WTA table above lists 2011 with 456 matches, so this
# threshold makes 2012 the WTA baseline while ATP 2011 (659 matches) passes
# the same threshold — confirm the two tours are meant to use different baselines.
wta_full_years = wta_yearly.loc[wta_yearly['matches'] >= 500]
if len(wta_full_years) >= 2:
    first_year, last_year = wta_full_years.iloc[0], wta_full_years.iloc[-1]

    # Rates are stored as fractions, hence the *100; the bracketed deltas are
    # differences in percentage points.
    print(f"=== WTA Change: {int(first_year['year'])} → {int(last_year['year'])} ===")
    print(f"Serve %:   {first_year['serve_pct']*100:.2f}% → {last_year['serve_pct']*100:.2f}% ({(last_year['serve_pct'] - first_year['serve_pct'])*100:+.2f}%)")
    print(f"Return %:  {first_year['return_pct']*100:.2f}% → {last_year['return_pct']*100:.2f}% ({(last_year['return_pct'] - first_year['return_pct'])*100:+.2f}%)")
    print(f"Ace %:     {first_year['ace_pct']*100:.2f}% → {last_year['ace_pct']*100:.2f}% ({(last_year['ace_pct'] - first_year['ace_pct'])*100:+.2f}%)")
    print(f"DF %:      {first_year['df_pct']*100:.2f}% → {last_year['df_pct']*100:.2f}% ({(last_year['df_pct'] - first_year['df_pct'])*100:+.2f}%)")
    print(f"Pts/Match: {first_year['points_per_match']:.1f} → {last_year['points_per_match']:.1f} ({last_year['points_per_match'] - first_year['points_per_match']:+.1f})")

=== WTA Change: 2012 → 2017 ===
Serve %:   55.78% → 56.53% (+0.75%)
Return %:  44.22% → 43.47% (-0.75%)
Ace %:     0.83% → 3.43% (+2.60%)
DF %:      1.46% → 4.63% (+3.17%)
Pts/Match: 138.6 → 144.4 (+5.8)


## 5. Side-by-Side Comparison

In [9]:
# Put each tour's yearly metrics under tour-prefixed column names, then join
# the two tables on year (inner join: only years present for both tours).
atp_renames = {
    'serve_pct': 'atp_serve_pct',
    'ace_pct': 'atp_ace_pct',
    'df_pct': 'atp_df_pct',
    'points_per_match': 'atp_pts_per_match',
}
wta_renames = {
    'serve_pct': 'wta_serve_pct',
    'ace_pct': 'wta_ace_pct',
    'df_pct': 'wta_df_pct',
    'points_per_match': 'wta_pts_per_match',
}

comparison = atp_yearly[['year', *atp_renames]].rename(columns=atp_renames)
wta_subset = wta_yearly[['year', *wta_renames]].rename(columns=wta_renames)
comparison = pd.merge(comparison, wta_subset, on='year', how='inner')

# One line per shared year; the gap is ATP minus WTA in percentage points.
print("=== ATP vs WTA Serve % by Year ===")
for _, row in comparison.iterrows():
    print(f"{int(row['year'])}: ATP {row['atp_serve_pct']*100:5.2f}%  |  WTA {row['wta_serve_pct']*100:5.2f}%  |  Gap: {(row['atp_serve_pct'] - row['wta_serve_pct'])*100:+.2f}%")

=== ATP vs WTA Serve % by Year ===
2011: ATP 62.74%  |  WTA 56.17%  |  Gap: +6.57%
2012: ATP 63.12%  |  WTA 55.78%  |  Gap: +7.34%
2013: ATP 63.36%  |  WTA 55.66%  |  Gap: +7.70%
2014: ATP 63.84%  |  WTA 56.06%  |  Gap: +7.78%
2015: ATP 64.15%  |  WTA 56.24%  |  Gap: +7.91%
2017: ATP 63.56%  |  WTA 56.53%  |  Gap: +7.03%


## 6. Trend Analysis

In [10]:
# Simple linear regression to detect trends
from scipy import stats as scipy_stats

def analyze_trend(yearly_df, metric, tour_name, min_matches=300, alpha=0.05):
    """Fit a straight line of ``metric`` against year and summarise the trend.

    Parameters
    ----------
    yearly_df : DataFrame with 'year', 'matches' and ``metric`` columns.
    metric : name of the column to regress on year; expected to hold
        fractions (0-1), since the slope is scaled by 100 for reporting.
    tour_name : label copied into the result (e.g. 'ATP').
    min_matches : years with fewer matches than this are left out of the fit.
    alpha : p-value threshold below which the slope is called significant.

    Returns
    -------
    dict with keys 'tour', 'metric', 'slope_per_year' (percentage points per
    year), 'r_squared', 'p_value', 'trend' ('increasing', 'decreasing' or
    'flat') and 'significance' ('significant' / 'not significant'), or
    None when fewer than three years meet ``min_matches``.
    """
    # Filter to years with enough data; the slice is only read, so no copy.
    eligible = yearly_df[yearly_df['matches'] >= min_matches]
    if len(eligible) < 3:
        return None

    fit = scipy_stats.linregress(eligible['year'], eligible[metric])

    # An exactly zero slope is neither direction, so label it separately
    # instead of reporting it as "decreasing".
    if fit.slope > 0:
        trend = "increasing"
    elif fit.slope < 0:
        trend = "decreasing"
    else:
        trend = "flat"

    significance = "significant" if fit.pvalue < alpha else "not significant"

    return {
        'tour': tour_name,
        'metric': metric,
        'slope_per_year': fit.slope * 100,  # fraction/year -> percentage points/year
        'r_squared': fit.rvalue**2,
        'p_value': fit.pvalue,
        'trend': trend,
        'significance': significance
    }

print("=== Trend Analysis (Linear Regression) ===")
print()

metrics = ['serve_pct', 'ace_pct', 'df_pct']

# For each metric, fit ATP first and then WTA; a fit that returned nothing
# (too few qualifying years) is left out of the table.
results = [
    trend
    for metric_name in metrics
    for trend in (
        analyze_trend(atp_yearly, metric_name, 'ATP'),
        analyze_trend(wta_yearly, metric_name, 'WTA'),
    )
    if trend
]

# Round only the numeric summary columns for display.
trend_df = pd.DataFrame(results).round(
    {'slope_per_year': 3, 'r_squared': 3, 'p_value': 4}
)
trend_df

=== Trend Analysis (Linear Regression) ===



Unnamed: 0,tour,metric,slope_per_year,r_squared,p_value,trend,significance
0,ATP,serve_pct,0.168,0.517,0.1074,increasing,not significant
1,WTA,serve_pct,0.095,0.425,0.1608,increasing,not significant
2,ATP,ace_pct,1.285,0.81,0.0144,increasing,significant
3,WTA,ace_pct,0.607,0.813,0.0141,increasing,significant
4,ATP,df_pct,0.62,0.86,0.0077,increasing,significant
5,WTA,df_pct,0.778,0.741,0.0276,increasing,significant


## 7. Summary

In [11]:
banner = "=" * 60
print(banner)
print("SUMMARY: Game Style Trends (2011-2017)")
print(banner)
print()

# A year counts as "full" when it has at least 500 matches; each tour is
# summarised by comparing its earliest and latest full year.
atp_full = atp_yearly[atp_yearly['matches'] >= 500]
wta_full = wta_yearly[wta_yearly['matches'] >= 500]

for tour_heading, full_years in (("ATP Tour:", atp_full), ("WTA Tour:", wta_full)):
    if len(full_years) >= 2:
        print(tour_heading)
        earliest, latest = full_years.iloc[0], full_years.iloc[-1]

        # Changes are expressed in percentage points (stored rates are fractions).
        serve_change = (latest['serve_pct'] - earliest['serve_pct']) * 100
        ace_change = (latest['ace_pct'] - earliest['ace_pct']) * 100

        # Serve share: moves within +/-0.5 points are reported as stable.
        if serve_change > 0.5:
            print(f"  • Serve dominance INCREASED (+{serve_change:.1f}% serve points won)")
        elif serve_change < -0.5:
            print(f"  • Serve dominance DECREASED ({serve_change:.1f}% serve points won)")
        else:
            print(f"  • Serve dominance relatively STABLE ({serve_change:+.1f}%)")

        # Ace share: moves within +/-0.2 points are reported as stable.
        if ace_change > 0.2:
            print(f"  • Aces INCREASED (+{ace_change:.2f}% of points)")
        elif ace_change < -0.2:
            print(f"  • Aces DECREASED ({ace_change:.2f}% of points)")
        else:
            print(f"  • Aces relatively stable ({ace_change:+.2f}%)")

    # Blank line after each tour, printed whether or not the tour was summarised.
    print()

print("Note: Data covers 2011-2017. More recent data would be needed")
print("to assess current trends (e.g., impact of new racket tech).")

SUMMARY: Game Style Trends (2011-2017)

ATP Tour:
  • Serve dominance INCREASED (+0.8% serve points won)
  • Aces INCREASED (+7.10% of points)

WTA Tour:
  • Serve dominance INCREASED (+0.7% serve points won)
  • Aces INCREASED (+2.60% of points)

Note: Data covers 2011-2017. More recent data would be needed
to assess current trends (e.g., impact of new racket tech).
