### Load Dataset

In [119]:
import pandas as pd

# Read the pre-processed match table (path is relative to the notebook's
# working directory) and preview the first five rows.
matches = pd.read_csv(filepath_or_buffer="../data/processed/matches_base.csv")

matches.head(5)


Unnamed: 0,year,tourney_date,surface,tourney_level,winner_id,winner_name,winner_rank,winner_age,winner_ht,loser_id,...,loser_age,loser_ht,w_ace,w_df,w_bpSaved,w_bpFaced,l_ace,l_df,l_bpSaved,l_bpFaced
0,2018,20180101,Hard,A,105992,Ryan Harrison,47.0,25.6,185.0,104919,...,30.6,188.0,9.0,2.0,8.0,9.0,10.0,3.0,1.0,4.0
1,2018,20180101,Hard,A,111577,Jared Donaldson,54.0,21.2,188.0,111442,...,23.7,183.0,5.0,3.0,4.0,5.0,3.0,5.0,7.0,11.0
2,2018,20180101,Hard,A,104797,Denis Istomin,63.0,31.3,185.0,106000,...,25.6,175.0,7.0,0.0,9.0,11.0,8.0,6.0,10.0,16.0
3,2018,20180101,Hard,A,200282,Alex De Minaur,208.0,18.8,183.0,105449,...,28.0,188.0,9.0,3.0,2.0,3.0,6.0,2.0,4.0,6.0
4,2018,20180101,Hard,A,111581,Michael Mmoh,175.0,19.9,188.0,105643,...,27.2,193.0,5.0,4.0,3.0,3.0,4.0,0.0,0.0,2.0


### Filter Years for Clustering (2018–2023)

In [120]:
# Season window used for clustering, inclusive on both ends. Making the lower
# bound explicit keeps the filter in line with the 2018-2023 range stated in
# the section heading, even if earlier seasons are ever added to the input.
CLUSTER_YEAR_MIN, CLUSTER_YEAR_MAX = 2018, 2023

# Series.between is inclusive by default; rows with a missing year are excluded.
in_window = matches["year"].between(CLUSTER_YEAR_MIN, CLUSTER_YEAR_MAX)
cluster_matches = matches[in_window].copy()

print(cluster_matches.shape)

(15801, 22)


### Build Winner and Loser DataFrames

In [121]:
# Source column -> side-neutral feature name, from the winner's perspective.
# Dict order defines the output column order.
winner_column_map = {
    "winner_id": "player_id",
    "winner_name": "player_name",
    "surface": "surface",
    "winner_rank": "rank",
    "winner_age": "age",
    "winner_ht": "height",
    "w_ace": "aces",
    "w_df": "double_faults",
    "w_bpFaced": "bp_faced",
}

# Select the winner-side columns, rename them, and flag every row as a win.
winner_df = (
    cluster_matches[list(winner_column_map)]
    .rename(columns=winner_column_map)
    .assign(win=1)
)

winner_df.head()


Unnamed: 0,player_id,player_name,surface,rank,age,height,aces,double_faults,bp_faced,win
0,105992,Ryan Harrison,Hard,47.0,25.6,185.0,9.0,2.0,9.0,1
1,111577,Jared Donaldson,Hard,54.0,21.2,188.0,5.0,3.0,5.0,1
2,104797,Denis Istomin,Hard,63.0,31.3,185.0,7.0,0.0,11.0,1
3,200282,Alex De Minaur,Hard,208.0,18.8,183.0,9.0,3.0,3.0,1
4,111581,Michael Mmoh,Hard,175.0,19.9,188.0,5.0,4.0,3.0,1


In [122]:
# Source column -> side-neutral feature name, from the loser's perspective.
# Dict order defines the output column order.
loser_column_map = {
    "loser_id": "player_id",
    "loser_name": "player_name",
    "surface": "surface",
    "loser_rank": "rank",
    "loser_age": "age",
    "loser_ht": "height",
    "l_ace": "aces",
    "l_df": "double_faults",
    "l_bpFaced": "bp_faced",
}

# Select the loser-side columns, rename them, and flag every row as a loss.
loser_df = (
    cluster_matches[list(loser_column_map)]
    .rename(columns=loser_column_map)
    .assign(win=0)
)

loser_df.head(2)


Unnamed: 0,player_id,player_name,surface,rank,age,height,aces,double_faults,bp_faced,win
0,104919,Leonardo Mayer,Hard,52.0,30.6,188.0,10.0,3.0,4.0,0
1,111442,Jordan Thompson,Hard,94.0,23.7,183.0,3.0,5.0,11.0,0


In [123]:
# Stack the winner-side and loser-side rows into one long table with a fresh
# 0..n-1 index, so every match contributes one row per participant.
players_matches = pd.concat(objs=[winner_df, loser_df], axis=0, ignore_index=True)

### Aggregate Player Statistics

In [124]:
# Per-player aggregation spec: output column -> (source column, reducer).
# "count" and "mean" skip missing values, so a player with no recorded value
# for a stat ends up with NaN in that column.
player_aggregations = {
    "player_name": ("player_name", "first"),
    "matches_played": ("win", "count"),
    "win_rate": ("win", "mean"),
    "avg_aces": ("aces", "mean"),
    "avg_double_faults": ("double_faults", "mean"),
    "avg_bp_faced": ("bp_faced", "mean"),
    "avg_rank": ("rank", "mean"),
    "avg_age": ("age", "mean"),
    "avg_height": ("height", "mean"),
}

player_stats = (
    players_matches
    .groupby("player_id")
    .agg(**player_aggregations)
    .reset_index()
)

player_stats.head()

Unnamed: 0,player_id,player_name,matches_played,win_rate,avg_aces,avg_double_faults,avg_bp_faced,avg_rank,avg_age,avg_height
0,100644,Alexander Zverev,384,0.71875,9.059299,4.005391,5.420485,7.234375,23.5625,198.0
1,103333,Ivo Karlovic,64,0.390625,22.296875,4.96875,4.125,101.296875,39.921875,208.0
2,103499,Aqeel Khan,10,0.6,7.714286,5.428571,9.0,1764.333333,40.93,
3,103529,Aisam Ul Haq Qureshi,6,0.666667,12.2,6.8,4.2,,40.316667,183.0
4,103565,Stephane Robert,4,0.25,7.5,4.5,10.5,283.75,38.55,185.0


### Surface Specialization Features

In [125]:
# Surfaces for which a Winrate_<surface> column is produced.
EXPECTED_SURFACES = ["Clay", "Grass", "Hard"]

# Mean of the 0/1 `win` flag per (player, surface) = win rate on that surface.
# Column labels are derived from the actual surface values rather than
# assigned by position, so a missing or extra surface can no longer shift
# the labels or raise a length-mismatch error.
surface_winrate = (
    players_matches
    .pivot_table(index="player_id", columns="surface", values="win", aggfunc="mean")
    # Guarantee exactly the expected columns: an absent surface becomes an
    # all-NaN column; any surface outside EXPECTED_SURFACES is dropped.
    .reindex(columns=EXPECTED_SURFACES)
    .add_prefix("Winrate_")
    # Drop the "surface" label left on the columns axis by the pivot.
    .rename_axis(columns=None)
    .reset_index()
)

surface_winrate.head()

Unnamed: 0,player_id,Winrate_Clay,Winrate_Grass,Winrate_Hard
0,100644,0.758333,0.590909,0.710744
1,103333,0.428571,0.4,0.375
2,103499,,0.666667,
3,103529,,0.666667,
4,103565,,0.5,0.0


In [126]:
# Attach the per-surface win rates to the per-player aggregates.
# This cell reassigns `player_stats`, so any surface columns left over from a
# previous run are dropped first; otherwise re-running it would create
# duplicated `_x` / `_y` columns.
surface_cols = [c for c in surface_winrate.columns if c != "player_id"]

player_stats = (
    player_stats
    .drop(columns=surface_cols, errors="ignore")
    # validate raises MergeError if player_id is duplicated on either side,
    # which would otherwise silently multiply rows.
    .merge(surface_winrate, on="player_id", how="left", validate="one_to_one")
)

# Sanity check: number of distinct players after the merge.
player_stats["player_id"].nunique()


847

### Handle Missing Values
We replace missing surface win rates (NaN) with the player's overall win rate. For the other numeric features, such as height and rank, we fill missing values with the column median.

In [127]:
# Persist the un-imputed features before any missing values are filled.
player_stats.to_csv("../data/processed/player_features.csv", index=False)

import numpy as np

player_features = player_stats.copy()

# Surface win rates: a missing value falls back to that player's overall win_rate
# (Series.fillna with a Series aligns on the row index).
surface_cols = ["Winrate_Clay", "Winrate_Grass", "Winrate_Hard"]
overall_win_rate = player_features["win_rate"]
player_features = player_features.assign(
    **{name: player_features[name].fillna(overall_win_rate) for name in surface_cols}
)

# Remaining numeric features: a missing value falls back to the column median.
numeric_cols = [
    "avg_aces", "avg_double_faults", "avg_bp_faced",
    "avg_rank", "avg_age", "avg_height"
]

# DataFrame.fillna with a Series fills per column label, so only the columns
# listed in numeric_cols are touched.
column_medians = player_features[numeric_cols].median()
player_features = player_features.fillna(column_medians)

# Fraction of missing values per column.
player_features.isna().mean()


player_id            0.0
player_name          0.0
matches_played       0.0
win_rate             0.0
avg_aces             0.0
avg_double_faults    0.0
avg_bp_faced         0.0
avg_rank             0.0
avg_age              0.0
avg_height           0.0
Winrate_Clay         0.0
Winrate_Grass        0.0
Winrate_Hard         0.0
dtype: float64

In [128]:
# Persist the imputed feature table; the DataFrame index is not written.
player_features.to_csv(
    path_or_buf="../data/processed/player_features_clean.csv",
    index=False,
)


### Apply Reliability Filter
We filter players by the number of matches played. Some players have very few matches, so their statistics can be noisy and misleading. To reduce this noise, we keep only players who meet a minimum match threshold, ensuring robust feature estimation before clustering.

We experimented with different reliability thresholds (e.g., 20 and 30 matches) and ultimately selected a minimum of 15 matches per player, which balances statistical reliability against stylistic diversity in clustering.

In [131]:
# Minimum number of matches a player needs to be kept for clustering.
MIN_MATCHES = 15

print("Players before filter:", len(player_features))

# Boolean mask of players meeting the threshold (inclusive).
has_enough_matches = player_features["matches_played"].ge(MIN_MATCHES)
player_features_filtered = player_features.loc[has_enough_matches].copy()

print("Players after filter:", len(player_features_filtered))

Players before filter: 847
Players after filter: 272


In [130]:
# Persist the reliability-filtered feature table; the DataFrame index is not written.
player_features_filtered.to_csv(
    path_or_buf="../data/processed/player_features_final.csv",
    index=False,
)
