# Convert Match Charting Data to Point-by-Point Format

The Match Charting Project contains shot-by-shot data. We'll convert this to point-by-point format compatible with our simulator, giving us recent data (through 2025).

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from collections import defaultdict
from datetime import datetime
import warnings
# Silence every warning category so cell output stays compact.
warnings.filterwarnings('ignore')

# Cap how many rows/columns pandas renders when a frame is displayed.
for _option, _limit in (('display.max_rows', 50), ('display.max_columns', 20)):
    pd.set_option(_option, _limit)

## 1. Load Match Charting Data

In [None]:
# Directory holding the raw Match Charting Project CSV exports.
data_dir = Path('../data/raw/tennis_charting')

# Load matches metadata: one CSV per tour, read in men's/women's order.
matches_m, matches_w = (
    pd.read_csv(data_dir / file_name)
    for file_name in ('charting-m-matches.csv', 'charting-w-matches.csv')
)

print(f"Men's matches: {len(matches_m):,}")
print(f"Women's matches: {len(matches_w):,}")

# Preview the identifying columns of the first three men's matches.
matches_m.head(3)[['match_id', 'Player 1', 'Player 2', 'Date', 'Tournament', 'Surface']]

In [None]:
# Load points data (all eras)
print("Loading points data...")

# Each tour's points are split across three era files; read them in
# chronological order and stack them under a fresh 0..n-1 index.
points_m = pd.concat(
    [
        pd.read_csv(data_dir / era_file, low_memory=False)
        for era_file in (
            'charting-m-points-to-2009.csv',
            'charting-m-points-2010s.csv',
            'charting-m-points-2020s.csv',
        )
    ],
    ignore_index=True,
)

points_w = pd.concat(
    [
        pd.read_csv(data_dir / era_file, low_memory=False)
        for era_file in (
            'charting-w-points-to-2009.csv',
            'charting-w-points-2010s.csv',
            'charting-w-points-2020s.csv',
        )
    ],
    ignore_index=True,
)

print(f"Men's points: {len(points_m):,}")
print(f"Women's points: {len(points_w):,}")
print(f"Total points: {len(points_m) + len(points_w):,}")

In [None]:
# Preview points data: score state, server and winner for the first ten rows.
points_m.head(10)[['match_id', 'Pt', 'Set1', 'Set2', 'Gm1', 'Gm2', 'Pts', 'Svr', 'PtWinner']]

## 2. Parse Dates and Check Coverage

In [None]:
def parse_match_year(match_id):
    """Extract the year from a charting ``match_id`` (format: YYYYMMDD-...).

    Args:
        match_id: Match identifier; normally a string whose first
            ``-``-separated field is an eight-character date.

    Returns:
        The first four characters of the date field as an ``int``, or
        ``None`` when ``match_id`` is not a string, the date field is not
        exactly eight characters long, or those characters are not an
        integer.
    """
    # Missing ids typically arrive as NaN floats; treat any non-string as
    # "no year" instead of relying on a catch-all exception handler.
    if not isinstance(match_id, str):
        return None

    date_str = match_id.split('-')[0]
    if len(date_str) != 8:
        return None

    try:
        return int(date_str[:4])
    except ValueError:
        return None

def _encode_match_points(match_points):
    """Encode one match's points as a pbp string plus per-player serve tallies.

    Args:
        match_points: Rows of a single match, already ordered by ``Pt``.
            Reads ``Svr``, ``PtWinner``, ``Set1``/``Set2``, ``Gm1``/``Gm2``
            and, when present, ``isAce``/``isDouble``.

    Returns:
        ``(pbp_string, tallies)``. ``pbp_string`` holds one of ``A`` (ace),
        ``D`` (double fault), ``S`` (server won) or ``R`` (returner won) per
        point, with ``;`` inserted when the game score changes and ``.`` when
        the set score changes. ``tallies`` maps ``p1_*``/``p2_*`` stat names
        to integer counts.
    """
    tallies = dict.fromkeys(
        (
            'p1_serve_pts', 'p1_serve_won', 'p2_serve_pts', 'p2_serve_won',
            'p1_aces', 'p2_aces', 'p1_dfs', 'p2_dfs',
        ),
        0,
    )
    pbp_chars = []
    # None means "no point seen yet". A (0, 0) sentinel would be
    # indistinguishable from the real 0-0 score that opens every set and
    # would suppress the delimiter after the first game of each set.
    last_set = None
    last_game = None

    for _, pt in match_points.iterrows():
        svr = pt['Svr']
        pt_winner = pt['PtWinner']

        # Rows without a server or a winner cannot be encoded; skip them.
        if pd.isna(svr) or pd.isna(pt_winner):
            continue

        svr = int(svr)
        pt_winner = int(pt_winner)

        # Emit a delimiter when the score state differs from the previous
        # encoded point; a set change takes precedence over a game change.
        current_set = (pt['Set1'], pt['Set2'])
        current_game = (pt['Gm1'], pt['Gm2'])
        if last_set is not None:
            if current_set != last_set:
                pbp_chars.append('.')
            elif current_game != last_game:
                pbp_chars.append(';')
        last_set = current_set
        last_game = current_game

        # Flags are compared as upper-cased text so both booleans and
        # 'TRUE'/'True' strings count as set.
        is_ace = str(pt.get('isAce', '')).upper() == 'TRUE'
        is_double = str(pt.get('isDouble', '')).upper() == 'TRUE'
        server_won = svr == pt_winner

        # Any server value other than 1 is attributed to player 2.
        side = 'p1' if svr == 1 else 'p2'
        tallies[f'{side}_serve_pts'] += 1
        if server_won:
            tallies[f'{side}_serve_won'] += 1
        if is_ace:
            tallies[f'{side}_aces'] += 1
        if is_double:
            tallies[f'{side}_dfs'] += 1

        if is_ace:
            pbp_chars.append('A')
        elif is_double:
            pbp_chars.append('D')
        elif server_won:
            pbp_chars.append('S')
        else:
            pbp_chars.append('R')

    return ''.join(pbp_chars), tallies


def convert_to_pbp(points_df: pd.DataFrame, matches_df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert shot-by-shot charting data to point-by-point format.

    Args:
        points_df: One row per point, keyed by ``match_id`` and ordered
            within a match by ``Pt``.
        matches_df: Match metadata with ``match_id``, ``Player 1``,
            ``Player 2``, ``Tournament`` and ``Surface``. An existing
            ``year`` column is used as-is; otherwise the year is derived
            from ``match_id`` with ``parse_match_year``. Duplicate
            ``match_id`` rows are collapsed to the first occurrence.

    Returns:
        A DataFrame with one row per match containing:
        - match_id, player1, player2, year, tournament, surface
        - pbp string (S/R/A/D format, ``;`` between games, ``.`` between sets)
        - Point counts and outcomes
        Matches absent from ``matches_df`` or with fewer than 10 encoded
        points are omitted. The columns are present even when no match
        qualifies.
    """
    columns = [
        'match_id', 'player1', 'player2', 'year', 'tournament', 'surface',
        'pbp', 'total_points',
        'p1_serve_pts', 'p1_serve_won', 'p2_serve_pts', 'p2_serve_won',
        'p1_aces', 'p2_aces', 'p1_dfs', 'p2_dfs',
    ]

    # Work on a copy so the caller's frame is never modified.
    matches_df = matches_df.copy()
    if 'year' not in matches_df.columns:
        matches_df['year'] = matches_df['match_id'].apply(parse_match_year)
    matches_unique = matches_df.drop_duplicates(subset='match_id', keep='first')
    matches_lookup = matches_unique.set_index('match_id')[
        ['Player 1', 'Player 2', 'Tournament', 'Surface', 'year']
    ].to_dict('index')

    results = []
    # Group once instead of re-filtering the whole points frame per match;
    # sort=False keeps matches in order of first appearance.
    grouped = points_df.groupby('match_id', sort=False)
    total = grouped.ngroups

    for i, (match_id, match_points) in enumerate(grouped):
        if i % 500 == 0:
            print(f"  Processing {i}/{total}...")

        match_info = matches_lookup.get(match_id)
        if match_info is None:
            continue

        pbp_string, tallies = _encode_match_points(match_points.sort_values('Pt'))
        # Every encoded point is a serve point for exactly one player.
        total_points = tallies['p1_serve_pts'] + tallies['p2_serve_pts']

        if total_points < 10:  # Skip incomplete matches
            continue

        results.append({
            'match_id': match_id,
            'player1': match_info['Player 1'],
            'player2': match_info['Player 2'],
            'year': match_info['year'],
            'tournament': match_info['Tournament'],
            'surface': match_info['Surface'],
            'pbp': pbp_string,
            'total_points': total_points,
            **tallies,
        })

    return pd.DataFrame(results, columns=columns)

# Run the converter once per tour and report how many matches it returned
# (convert_to_pbp omits matches without metadata or with fewer than 10 points).
print("Converting men's matches...")
pbp_m = convert_to_pbp(points_m, matches_m)
print(f"Converted {len(pbp_m):,} men's matches")

print("\nConverting women's matches...")
pbp_w = convert_to_pbp(points_w, matches_w)
print(f"Converted {len(pbp_w):,} women's matches")

In [None]:
# Year distribution summary
# Nothing earlier in the notebook adds a 'year' column to matches_m/matches_w
# (convert_to_pbp only annotates its own copy), so derive it here when it is
# missing; otherwise the lookups below raise KeyError.
for _tour_matches in (matches_m, matches_w):
    if 'year' not in _tour_matches.columns:
        # to_numeric turns unparseable ids (None) into NaN so the column
        # stays numeric and the >= comparison below is well defined.
        _tour_matches['year'] = pd.to_numeric(
            _tour_matches['match_id'].apply(parse_match_year), errors='coerce'
        )

print("=== Coverage Summary ===")
print(f"Men's: {matches_m['year'].min()} - {matches_m['year'].max()} ({len(matches_m):,} matches)")
print(f"Women's: {matches_w['year'].min()} - {matches_w['year'].max()} ({len(matches_w):,} matches)")

# Recent years
recent_m = matches_m[matches_m['year'] >= 2020]
recent_w = matches_w[matches_w['year'] >= 2020]
print(f"\nRecent (2020+): {len(recent_m):,} men's, {len(recent_w):,} women's")

## 3. Convert to Point-by-Point Format

In [None]:
def _encode_match_points(match_points):
    """Encode one match's points as a pbp string plus per-player serve tallies.

    Args:
        match_points: Rows of a single match, already ordered by ``Pt``.
            Reads ``Svr``, ``PtWinner``, ``Set1``/``Set2``, ``Gm1``/``Gm2``
            and, when present, ``isAce``/``isDouble``.

    Returns:
        ``(pbp_string, tallies)``. ``pbp_string`` holds one of ``A`` (ace),
        ``D`` (double fault), ``S`` (server won) or ``R`` (returner won) per
        point, with ``;`` inserted when the game score changes and ``.`` when
        the set score changes. ``tallies`` maps ``p1_*``/``p2_*`` stat names
        to integer counts.
    """
    tallies = dict.fromkeys(
        (
            'p1_serve_pts', 'p1_serve_won', 'p2_serve_pts', 'p2_serve_won',
            'p1_aces', 'p2_aces', 'p1_dfs', 'p2_dfs',
        ),
        0,
    )
    pbp_chars = []
    # None means "no point seen yet". A (0, 0) sentinel would be
    # indistinguishable from the real 0-0 score that opens every set and
    # would suppress the delimiter after the first game of each set.
    last_set = None
    last_game = None

    for _, pt in match_points.iterrows():
        svr = pt['Svr']
        pt_winner = pt['PtWinner']

        # Rows without a server or a winner cannot be encoded; skip them.
        if pd.isna(svr) or pd.isna(pt_winner):
            continue

        svr = int(svr)
        pt_winner = int(pt_winner)

        # Emit a delimiter when the score state differs from the previous
        # encoded point; a set change takes precedence over a game change.
        current_set = (pt['Set1'], pt['Set2'])
        current_game = (pt['Gm1'], pt['Gm2'])
        if last_set is not None:
            if current_set != last_set:
                pbp_chars.append('.')
            elif current_game != last_game:
                pbp_chars.append(';')
        last_set = current_set
        last_game = current_game

        # Flags are compared as upper-cased text so both booleans and
        # 'TRUE'/'True' strings count as set.
        is_ace = str(pt.get('isAce', '')).upper() == 'TRUE'
        is_double = str(pt.get('isDouble', '')).upper() == 'TRUE'
        server_won = svr == pt_winner

        # Any server value other than 1 is attributed to player 2.
        side = 'p1' if svr == 1 else 'p2'
        tallies[f'{side}_serve_pts'] += 1
        if server_won:
            tallies[f'{side}_serve_won'] += 1
        if is_ace:
            tallies[f'{side}_aces'] += 1
        if is_double:
            tallies[f'{side}_dfs'] += 1

        if is_ace:
            pbp_chars.append('A')
        elif is_double:
            pbp_chars.append('D')
        elif server_won:
            pbp_chars.append('S')
        else:
            pbp_chars.append('R')

    return ''.join(pbp_chars), tallies


def convert_to_pbp(points_df: pd.DataFrame, matches_df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert shot-by-shot charting data to point-by-point format.

    Args:
        points_df: One row per point, keyed by ``match_id`` and ordered
            within a match by ``Pt``.
        matches_df: Match metadata with ``match_id``, ``Player 1``,
            ``Player 2``, ``Tournament`` and ``Surface``. An existing
            ``year`` column is used as-is; otherwise the year is derived
            from ``match_id`` with ``parse_match_year``. Duplicate
            ``match_id`` rows are collapsed to the first occurrence, since
            ``to_dict('index')`` rejects a non-unique index.

    Returns:
        A DataFrame with one row per match (in sorted ``match_id`` order)
        containing:
        - match_id, player1, player2, year, tournament, surface
        - pbp string (S/R/A/D format, ``;`` between games, ``.`` between sets)
        - Point counts and outcomes
        Matches absent from ``matches_df`` or with fewer than 10 encoded
        points are omitted. The columns are present even when no match
        qualifies.
    """
    columns = [
        'match_id', 'player1', 'player2', 'year', 'tournament', 'surface',
        'pbp', 'total_points',
        'p1_serve_pts', 'p1_serve_won', 'p2_serve_pts', 'p2_serve_won',
        'p1_aces', 'p2_aces', 'p1_dfs', 'p2_dfs',
    ]

    # Build the metadata lookup on a copy so the caller's frame is untouched.
    matches_df = matches_df.copy()
    if 'year' not in matches_df.columns:
        matches_df['year'] = matches_df['match_id'].apply(parse_match_year)
    matches_unique = matches_df.drop_duplicates(subset='match_id', keep='first')
    matches_lookup = matches_unique.set_index('match_id')[
        ['Player 1', 'Player 2', 'Tournament', 'Surface', 'year']
    ].to_dict('index')

    results = []

    for match_id, match_points in points_df.groupby('match_id'):
        match_info = matches_lookup.get(match_id)
        if match_info is None:
            continue

        pbp_string, tallies = _encode_match_points(match_points.sort_values('Pt'))
        # Every encoded point is a serve point for exactly one player.
        total_points = tallies['p1_serve_pts'] + tallies['p2_serve_pts']

        if total_points < 10:  # Skip incomplete matches
            continue

        results.append({
            'match_id': match_id,
            'player1': match_info['Player 1'],
            'player2': match_info['Player 2'],
            'year': match_info['year'],
            'tournament': match_info['Tournament'],
            'surface': match_info['Surface'],
            'pbp': pbp_string,
            'total_points': total_points,
            **tallies,
        })

    return pd.DataFrame(results, columns=columns)

# Run the converter once per tour and report how many matches it returned.
# NOTE(review): the same conversion is also run right after the earlier
# convert_to_pbp definition in this notebook — confirm both runs are needed.
print("Converting men's matches...")
pbp_m = convert_to_pbp(points_m, matches_m)
print(f"Converted {len(pbp_m):,} men's matches")

print("\nConverting women's matches...")
pbp_w = convert_to_pbp(points_w, matches_w)
print(f"Converted {len(pbp_w):,} women's matches")

In [None]:
# Preview converted data
# Uses the first row of the men's result; `sample` is reused by the
# verification cell that follows.
print("=== Sample Converted Match ===")
sample = pbp_m.iloc[0]
print(f"Match: {sample['player1']} vs {sample['player2']}")
print(f"Tournament: {sample['tournament']} ({sample['year']})")
print(f"Surface: {sample['surface']}")
print(f"Total points: {sample['total_points']}")
print(f"\nPBP string (first 200 chars):")
# Only the first 200 characters are shown to keep the output short.
print(sample['pbp'][:200])

In [None]:
# Verify conversion looks correct
# Prints the per-player serve tallies of `sample` for a manual sanity check.
# NOTE(review): the "(served first)" label assumes Player 1 is the first
# server in the charting data — nothing in this notebook checks that.
# NOTE(review): the win percentages divide by the serve-point counts without
# guarding against zero — confirm a player can never have 0 serve points here.
print("=== Conversion Verification ===")
print(f"\n{sample['player1']} (served first):")
print(f"  Serve points: {sample['p1_serve_pts']}")
print(f"  Serve won: {sample['p1_serve_won']} ({100*sample['p1_serve_won']/sample['p1_serve_pts']:.1f}%)")
print(f"  Aces: {sample['p1_aces']}")
print(f"  DFs: {sample['p1_dfs']}")

print(f"\n{sample['player2']}:")
print(f"  Serve points: {sample['p2_serve_pts']}")
print(f"  Serve won: {sample['p2_serve_won']} ({100*sample['p2_serve_won']/sample['p2_serve_pts']:.1f}%)")
print(f"  Aces: {sample['p2_aces']}")
print(f"  DFs: {sample['p2_dfs']}")

## 4. Coverage Analysis

In [None]:
# Count converted men's matches per year, listed in ascending year order.
print("=== Men's Matches by Year (Converted) ===")
print(pbp_m['year'].value_counts().sort_index())

In [None]:
# Count converted women's matches per year, listed in ascending year order.
print("=== Women's Matches by Year (Converted) ===")
print(pbp_w['year'].value_counts().sort_index())

In [None]:
# Recent years focus: keep only converted matches whose year is 2020 or later.
print("=== Recent Coverage (2020-2025) ===")
recent_m = pbp_m.loc[pbp_m['year'].ge(2020)]
recent_w = pbp_w.loc[pbp_w['year'].ge(2020)]

# Per-year breakdown of the recent subset, men first and then women.
print(f"\nMen's 2020-2025: {len(recent_m):,} matches")
print(recent_m['year'].value_counts().sort_index())

print(f"\nWomen's 2020-2025: {len(recent_w):,} matches")
print(recent_w['year'].value_counts().sort_index())

In [None]:
# Top players in recent data
def get_player_match_counts(df):
    """Return how many matches each player appears in, most frequent first.

    A player is counted once for every row in which they are listed as
    either ``player1`` or ``player2``.
    """
    appearances = pd.concat([df[side] for side in ('player1', 'player2')])
    return appearances.value_counts()

# Show the 20 players with the most appearances in the recent men's matches.
print("=== Top 20 Men's Players (2020-2025 matches) ===")
get_player_match_counts(recent_m).head(20)

In [None]:
# Show the 20 players with the most appearances in the recent women's matches.
print("=== Top 20 Women's Players (2020-2025 matches) ===")
get_player_match_counts(recent_w).head(20)

In [None]:
# Tournament coverage
# The 15 tournament names with the most recent men's matches.
print("=== Top Tournaments in Recent Data (Men's 2020-2025) ===")
print(recent_m['tournament'].value_counts().head(15))

## 5. Calculate Player Stats from Converted Data

In [None]:
def calculate_player_stats(df: pd.DataFrame, min_matches: int = 5) -> pd.DataFrame:
    """
    Calculate serve/return stats for each player from converted pbp data.

    Args:
        df: One row per match with ``player1``/``player2`` and the
            ``p1_*``/``p2_*`` serve-point, serve-won, ace and double-fault
            counts.
        min_matches: Players appearing in fewer matches than this are
            left out of the result.

    Returns:
        One row per qualifying player with columns ``player``, ``matches``,
        ``serve_pts``, ``serve_pct``, ``return_pct``, ``combined``
        (``serve_pct + return_pct``), ``ace_pct`` and ``df_pct``, sorted by
        ``combined`` descending. Rates are fractions, not percentages, and
        are 0 when their denominator is 0. When no player qualifies the
        frame is empty but still has these columns.
    """
    columns = [
        'player', 'matches', 'serve_pts', 'serve_pct',
        'return_pct', 'combined', 'ace_pct', 'df_pct',
    ]
    counters = (
        'matches', 'serve_pts', 'serve_won',
        'return_pts', 'return_won', 'aces', 'dfs',
    )
    player_stats = defaultdict(lambda: dict.fromkeys(counters, 0))

    def rate(numerator, denominator):
        # Avoid dividing by zero for players with no points of a given kind.
        return numerator / denominator if denominator > 0 else 0

    for _, row in df.iterrows():
        # Credit each side in turn; a player's return points are the
        # opponent's serve points, and return wins are the opponent's
        # lost serve points.
        for own, opp in (('1', '2'), ('2', '1')):
            acc = player_stats[row[f'player{own}']]
            opp_serve_pts = row[f'p{opp}_serve_pts']
            acc['matches'] += 1
            acc['serve_pts'] += row[f'p{own}_serve_pts']
            acc['serve_won'] += row[f'p{own}_serve_won']
            acc['return_pts'] += opp_serve_pts
            acc['return_won'] += opp_serve_pts - row[f'p{opp}_serve_won']
            acc['aces'] += row[f'p{own}_aces']
            acc['dfs'] += row[f'p{own}_dfs']

    rows = []
    for player, stats in player_stats.items():
        if stats['matches'] < min_matches:
            continue

        serve_pct = rate(stats['serve_won'], stats['serve_pts'])
        return_pct = rate(stats['return_won'], stats['return_pts'])

        rows.append({
            'player': player,
            'matches': stats['matches'],
            'serve_pts': stats['serve_pts'],
            'serve_pct': serve_pct,
            'return_pct': return_pct,
            'combined': serve_pct + return_pct,
            'ace_pct': rate(stats['aces'], stats['serve_pts']),
            'df_pct': rate(stats['dfs'], stats['serve_pts']),
        })

    # Passing the columns explicitly keeps sort_values from raising
    # KeyError when no player met the threshold.
    return pd.DataFrame(rows, columns=columns).sort_values('combined', ascending=False)

# Calculate for recent data
# Only players with at least 10 matches in the recent subset are kept.
recent_stats_m = calculate_player_stats(recent_m, min_matches=10)
recent_stats_w = calculate_player_stats(recent_w, min_matches=10)

print(f"Men's players with 10+ matches (2020-2025): {len(recent_stats_m)}")
print(f"Women's players with 10+ matches (2020-2025): {len(recent_stats_w)}")

In [None]:
print("=== Top 20 Men by Combined Serve+Return % (2020-2025) ===")
# Work on a copy of the top 20 rows so the stored stats keep their fractions.
display_m = recent_stats_m.head(20).copy()
# Turn the four rate columns into percentages rounded to one decimal place.
display_m[['serve_pct', 'return_pct', 'combined', 'ace_pct']] = (
    display_m[['serve_pct', 'return_pct', 'combined', 'ace_pct']].mul(100).round(1)
)
display_m[['player', 'matches', 'serve_pct', 'return_pct', 'combined', 'ace_pct']]

In [None]:
print("=== Top 20 Women by Combined Serve+Return % (2020-2025) ===")
# Work on a copy of the top 20 rows so the stored stats keep their fractions.
display_w = recent_stats_w.head(20).copy()
# Turn the four rate columns into percentages rounded to one decimal place.
display_w[['serve_pct', 'return_pct', 'combined', 'ace_pct']] = (
    display_w[['serve_pct', 'return_pct', 'combined', 'ace_pct']].mul(100).round(1)
)
display_w[['player', 'matches', 'serve_pct', 'return_pct', 'combined', 'ace_pct']]

## 6. Compare to Old Point-by-Point Data

In [None]:
# Load old data for comparison
old_stats_m = pd.read_csv('../data/processed/atp_player_stats.csv')

# Find players in both datasets
common_players = set(recent_stats_m['player']) & set(old_stats_m['player'])
print(f"Players in both old (2011-2017) and new (2020-2025) data: {len(common_players)}")

# Compare stats for common players
if common_players:
    print("\n=== Stat Comparison for Common Players ===")
    # A set has no stable iteration order, so sort before taking five;
    # this way the same players are shown on every run.
    for player in sorted(common_players, key=str)[:5]:
        old = old_stats_m[old_stats_m['player'] == player].iloc[0]
        new = recent_stats_m[recent_stats_m['player'] == player].iloc[0]

        print(f"\n{player}:")
        print(f"  Serve %: {old['serve_pct']*100:.1f}% (old) → {new['serve_pct']*100:.1f}% (new)")
        print(f"  Return %: {old['return_pct']*100:.1f}% (old) → {new['return_pct']*100:.1f}% (new)")

## 7. Save Converted Data

In [None]:
output_dir = Path('../data/processed')
# to_csv does not create missing directories, so make sure the target exists.
output_dir.mkdir(parents=True, exist_ok=True)

# Save full converted pbp data
pbp_m.to_csv(output_dir / 'charting_pbp_mens.csv', index=False)
pbp_w.to_csv(output_dir / 'charting_pbp_womens.csv', index=False)

# Save recent player stats
recent_stats_m.to_csv(output_dir / 'charting_player_stats_mens_2020_2025.csv', index=False)
recent_stats_w.to_csv(output_dir / 'charting_player_stats_womens_2020_2025.csv', index=False)

print("Saved files:")
print(f"  - charting_pbp_mens.csv ({len(pbp_m):,} matches)")
print(f"  - charting_pbp_womens.csv ({len(pbp_w):,} matches)")
print(f"  - charting_player_stats_mens_2020_2025.csv ({len(recent_stats_m)} players)")
print(f"  - charting_player_stats_womens_2020_2025.csv ({len(recent_stats_w)} players)")

## 8. Summary

In [None]:
# Final report: restates the converted volumes and the recent-subset sizes
# computed above, followed by fixed notes on the data's strengths and limits.
print("=" * 60)
print("CONVERSION SUMMARY")
print("=" * 60)
print()
print("Successfully converted Match Charting Project shot-by-shot")
print("data to point-by-point format.")
print()
# Match counts and year span of the full converted frames.
print("DATA VOLUME:")
print(f"  Men's matches: {len(pbp_m):,} (years: {pbp_m['year'].min()}-{pbp_m['year'].max()})")
print(f"  Women's matches: {len(pbp_w):,} (years: {pbp_w['year'].min()}-{pbp_w['year'].max()})")
print(f"  Total: {len(pbp_m) + len(pbp_w):,} matches")
print()
# Sizes of the 2020+ subsets and of the player tables built from them.
print("RECENT DATA (2020-2025):")
print(f"  Men's: {len(recent_m):,} matches, {len(recent_stats_m)} players with 10+ matches")
print(f"  Women's: {len(recent_w):,} matches, {len(recent_stats_w)} players with 10+ matches")
print()
print("ADVANTAGES OF THIS DATA:")
print("  ✓ Recent matches through 2025")
print("  ✓ Point-by-point sequences")
print("  ✓ Score context for each point")
print("  ✓ Surface information")
print()
print("LIMITATIONS:")
print("  • Not comprehensive (crowd-sourced, ~select matches)")
print("  • Biased toward big matches/top players")
print("  • Some players have limited match counts")