In [None]:
import pandas as pd
import numpy as np
import glob
import pandas as pd
import re

In [None]:
# Collect every ATP match CSV under ./data. glob returns paths in arbitrary
# order, so sort them to process the files in the same order on every run.
csv_files = sorted(glob.glob('./data/atp_matches_*.csv'))

# Numeric columns to average, listed as winner/loser pairs. Columns that are
# missing from a given file are skipped when that file's means are computed.
numeric_cols = [
    'winner_rank', 'loser_rank',
    'winner_rank_points', 'loser_rank_points',
    'winner_age', 'loser_age',
    'winner_ht', 'loser_ht',
    'w_ace', 'l_ace',
    'w_df', 'l_df',
    'w_svpt', 'l_svpt',
    'w_1stIn', 'l_1stIn',
    'w_1stWon', 'l_1stWon',
    'w_2ndWon', 'l_2ndWon',
    'w_SvGms', 'l_SvGms',
    'w_bpSaved', 'l_bpSaved',
    'w_bpFaced', 'l_bpFaced',
]

# (prefix, counterpart prefix) pairs, checked in this order.
PREFIX_COUNTERPARTS = (
    ('w_', 'l_'),
    ('l_', 'w_'),
    ('winner_', 'loser_'),
    ('loser_', 'winner_'),
)

# Captures the four-digit year in names such as atp_matches_2020.csv.
YEAR_PATTERN = re.compile(r'atp_matches_(\d{4})')


def counterpart_column(col):
    """Return the opposite-side column name for *col*, or None.

    'w_ace' maps to 'l_ace', 'loser_rank' maps to 'winner_rank', and so on.
    Names that carry none of the known prefixes have no counterpart.
    """
    for prefix, other_prefix in PREFIX_COUNTERPARTS:
        if col.startswith(prefix):
            return other_prefix + col[len(prefix):]
    return None


def column_means(df, columns):
    """Return {column name: mean} for the requested columns present in *df*.

    For each requested column that exists in *df*, its winner/loser
    counterpart is included as well when that counterpart also exists in
    *df*, even if it was not requested. Each column's mean is computed once.
    """
    means = {}
    for col in columns:
        if col not in df.columns:
            # A missing column contributes nothing, not even its counterpart.
            continue
        for name in (col, counterpart_column(col)):
            if name is not None and name in df.columns and name not in means:
                means[name] = df[name].mean()
    return means


all_means = []     # one dict of column means per CSV file
yearly_means = {}  # year string -> list of per-file mean dicts

for file in csv_files:
    # low_memory=False makes pandas infer each column's dtype from the whole
    # file instead of chunk by chunk, which avoids mixed-type columns and the
    # DtypeWarning that comes with them.
    df = pd.read_csv(file, low_memory=False)

    # Extract year from filename (e.g., atp_matches_2020.csv)
    match = YEAR_PATTERN.search(file)
    year = match.group(1) if match else None

    means = column_means(df, numeric_cols)
    all_means.append(means)

    # Files whose name carries no year still count toward all_means but are
    # left out of the per-year grouping.
    if year:
        yearly_means.setdefault(year, []).append(means)

# Report the per-year averages, one section per year in ascending order.
print("\n--- Yearly Average Numeric Stats ---")
for year in sorted(yearly_means):
    # Average the per-file mean dicts recorded for this year.
    year_avg = pd.DataFrame(yearly_means[year]).mean()
    print(f"\nYear: {year}")
    for col in numeric_cols:
        if col not in year_avg:
            continue

        # Winner-side columns drive the output: print them next to their
        # loser-side partner when the partner has a value for this year.
        long_winner = col.startswith('winner_')
        if long_winner or col.startswith('w_'):
            own, other = ('winner_', 'loser_') if long_winner else ('w_', 'l_')
            partner = other + col[len(own):]
            if partner in year_avg:
                print(f"{col} mean: {year_avg[col]:.2f} | {partner} mean: {year_avg[partner]:.2f}")
            else:
                print(f"{col} mean: {year_avg[col]:.2f}")
            continue

        # Loser-side columns are printed alone, and only when numeric_cols
        # has no winner-side partner that would already have shown them.
        long_loser = col.startswith('loser_')
        if long_loser or col.startswith('l_'):
            own, other = ('loser_', 'winner_') if long_loser else ('l_', 'w_')
            partner = other + col[len(own):]
            if partner not in numeric_cols:
                print(f"{col} mean: {year_avg[col]:.2f}")

# Compute the average of all datasets. all_means holds one dict of column
# means per CSV file, so this is an unweighted mean of the per-file means:
# every file counts equally regardless of how many rows it contains.
means_df = pd.DataFrame(all_means)
average_means = means_df.mean()

print("\n--- Average Numeric Stats Across All Datasets ---")
for col in numeric_cols:
    # Skip columns that no file provided (or every column when no files were
    # found); indexing average_means with them would raise KeyError. The
    # yearly report applies the same guard.
    if col not in average_means:
        continue
    # Handle winner_/loser_ pairs
    if col.startswith('winner_'):
        loser_col = 'loser_' + col[len('winner_'):]
        if loser_col in average_means:
            print(f"{col} mean: {average_means[col]:.2f} | {loser_col} mean: {average_means[loser_col]:.2f}")
        else:
            print(f"{col} mean: {average_means[col]:.2f}")
    # Handle w_/l_ pairs
    elif col.startswith('w_'):
        loser_col = 'l_' + col[len('w_'):]
        if loser_col in average_means:
            print(f"{col} mean: {average_means[col]:.2f} | {loser_col} mean: {average_means[loser_col]:.2f}")
        else:
            print(f"{col} mean: {average_means[col]:.2f}")
    # Handle loser_/winner_ pairs: printed alone, and only when numeric_cols
    # has no winner-side partner that would already have shown this column
    elif col.startswith('loser_'):
        winner_col = 'winner_' + col[len('loser_'):]
        if winner_col not in numeric_cols:
            print(f"{col} mean: {average_means[col]:.2f}")
    # Handle l_/w_ pairs under the same rule
    elif col.startswith('l_'):
        winner_col = 'w_' + col[len('l_'):]
        if winner_col not in numeric_cols:
            print(f"{col} mean: {average_means[col]:.2f}")

  df = pd.read_csv(file)



--- Yearly Average Numeric Stats ---

Year: 1968
winner_rank mean: nan | loser_rank mean: nan
winner_rank_points mean: nan | loser_rank_points mean: nan
winner_age mean: 25.81 | loser_age mean: 25.44
winner_ht mean: 182.47 | loser_ht mean: 182.65
w_ace mean: nan | l_ace mean: nan
w_df mean: nan | l_df mean: nan
w_svpt mean: nan | l_svpt mean: nan
w_1stIn mean: nan | l_1stIn mean: nan
w_1stWon mean: nan | l_1stWon mean: nan
w_2ndWon mean: nan | l_2ndWon mean: nan
w_SvGms mean: nan | l_SvGms mean: nan
w_bpSaved mean: nan | l_bpSaved mean: nan
w_bpFaced mean: nan | l_bpFaced mean: nan

Year: 1969
winner_rank mean: nan | loser_rank mean: nan
winner_rank_points mean: nan | loser_rank_points mean: nan
winner_age mean: 26.12 | loser_age mean: 25.85
winner_ht mean: 182.43 | loser_ht mean: 182.61
w_ace mean: nan | l_ace mean: nan
w_df mean: nan | l_df mean: nan
w_svpt mean: nan | l_svpt mean: nan
w_1stIn mean: nan | l_1stIn mean: nan
w_1stWon mean: nan | l_1stWon mean: nan
w_2ndWon mean: nan |