# Lexin Deang | INST414 | Similarity Metrics project

https://github.com/JeffSackmann/tennis_atp

## Importing Files

In [63]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
import os

Concatenating all files from 2020 - 2024

In [64]:
data_folder = "Data"

all_files = [os.path.join(data_folder, f) for f in os.listdir(data_folder) if f.endswith('.csv')]

df = pd.concat([pd.read_csv(file) for file in all_files], ignore_index=True)

In [65]:
df.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,2024-0339,Brisbane,Hard,32,A,20240101,300,105777,2.0,,...,58.0,44.0,16.0,11.0,8.0,9.0,14.0,2570.0,8.0,3660.0
1,2024-0339,Brisbane,Hard,32,A,20240101,299,208029,1.0,,...,35.0,31.0,10.0,11.0,5.0,7.0,8.0,3660.0,39.0,1122.0
2,2024-0339,Brisbane,Hard,32,A,20240101,298,105777,2.0,,...,39.0,24.0,14.0,10.0,5.0,7.0,14.0,2570.0,55.0,902.0
3,2024-0339,Brisbane,Hard,32,A,20240101,297,208029,1.0,,...,51.0,31.0,16.0,10.0,3.0,5.0,8.0,3660.0,116.0,573.0
4,2024-0339,Brisbane,Hard,32,A,20240101,296,126128,,,...,37.0,27.0,16.0,10.0,5.0,8.0,39.0,1122.0,44.0,1021.0


In [66]:
# --- Unified column map ---
common_cols = {
    'player_name': None,
    'age': None,
    'height': None,
    'rank_points': None,
    'ace': None,
    'df': None,
    'svpt': None,
    '1stIn': None,
    '1stWon': None,
    '2ndWon': None,
    'SvGms': None,
    'bpSaved': None,
    'bpFaced': None,
    'minutes': 'minutes'  # same for both
}

# Map winner and loser columns
winner_cols = {
    'player_name': 'winner_name',
    'age': 'winner_age',
    'height': 'winner_ht',
    'rank_points': 'winner_rank_points',
    'ace': 'w_ace',
    'df': 'w_df',
    'svpt': 'w_svpt',
    '1stIn': 'w_1stIn',
    '1stWon': 'w_1stWon',
    '2ndWon': 'w_2ndWon',
    'SvGms': 'w_SvGms',
    'bpSaved': 'w_bpSaved',
    'bpFaced': 'w_bpFaced',
    'minutes': 'minutes'
}

loser_cols = {
    'player_name': 'loser_name',
    'age': 'loser_age',
    'height': 'loser_ht',
    'rank_points': 'loser_rank_points',
    'ace': 'l_ace',
    'df': 'l_df',
    'svpt': 'l_svpt',
    '1stIn': 'l_1stIn',
    '1stWon': 'l_1stWon',
    '2ndWon': 'l_2ndWon',
    'SvGms': 'l_SvGms',
    'bpSaved': 'l_bpSaved',
    'bpFaced': 'l_bpFaced',
    'minutes': 'minutes'
}

# Extract and rename
winners_df = df[[v for v in winner_cols.values()]].rename(columns={v: k for k, v in winner_cols.items()})
losers_df = df[[v for v in loser_cols.values()]].rename(columns={v: k for k, v in loser_cols.items()})

# Combine
players_df = pd.concat([winners_df, losers_df], ignore_index=True)
players_df.dropna(inplace=True)  # Remove incomplete rows
players_df.head()

Unnamed: 0,player_name,age,height,rank_points,ace,df,svpt,1stIn,1stWon,2ndWon,SvGms,bpSaved,bpFaced,minutes
0,Grigor Dimitrov,32.6,191.0,2570.0,8.0,2.0,74.0,52.0,40.0,13.0,11.0,3.0,3.0,136.0
1,Holger Rune,20.6,188.0,3660.0,7.0,4.0,72.0,48.0,39.0,11.0,11.0,1.0,2.0,97.0
2,Grigor Dimitrov,32.6,191.0,2570.0,10.0,3.0,67.0,45.0,39.0,10.0,11.0,6.0,6.0,109.0
3,Holger Rune,20.6,188.0,3660.0,13.0,0.0,65.0,36.0,31.0,17.0,10.0,1.0,1.0,105.0
4,Roman Safiullin,26.4,185.0,1122.0,9.0,3.0,73.0,43.0,36.0,14.0,10.0,2.0,3.0,120.0


In [67]:
players_df["1st_serve_pct"] = players_df["1stIn"] / players_df["svpt"]
players_df["1st_serve_win_pct"] = players_df["1stWon"] / players_df["1stIn"]
players_df["2nd_serve_win_pct"] = players_df["2ndWon"] / (players_df["svpt"] - players_df["1stIn"])
players_df["bp_save_pct"] = players_df["bpSaved"] / players_df["bpFaced"]

# Clean up any division errors
players_df.replace([np.inf, -np.inf], np.nan, inplace=True) # Removes infinite values, replaces
players_df.dropna(inplace=True)

players_df.head()

Unnamed: 0,player_name,age,height,rank_points,ace,df,svpt,1stIn,1stWon,2ndWon,SvGms,bpSaved,bpFaced,minutes,1st_serve_pct,1st_serve_win_pct,2nd_serve_win_pct,bp_save_pct
0,Grigor Dimitrov,32.6,191.0,2570.0,8.0,2.0,74.0,52.0,40.0,13.0,11.0,3.0,3.0,136.0,0.702703,0.769231,0.590909,1.0
1,Holger Rune,20.6,188.0,3660.0,7.0,4.0,72.0,48.0,39.0,11.0,11.0,1.0,2.0,97.0,0.666667,0.8125,0.458333,0.5
2,Grigor Dimitrov,32.6,191.0,2570.0,10.0,3.0,67.0,45.0,39.0,10.0,11.0,6.0,6.0,109.0,0.671642,0.866667,0.454545,1.0
3,Holger Rune,20.6,188.0,3660.0,13.0,0.0,65.0,36.0,31.0,17.0,10.0,1.0,1.0,105.0,0.553846,0.861111,0.586207,1.0
4,Roman Safiullin,26.4,185.0,1122.0,9.0,3.0,73.0,43.0,36.0,14.0,10.0,2.0,3.0,120.0,0.589041,0.837209,0.466667,0.666667


In [68]:
agg_features = [
    "ace", "df", "1st_serve_pct",
    "1st_serve_win_pct", "2nd_serve_win_pct", "bp_save_pct",
    "SvGms"
]

# Grouping by player name under aggregate features means
player_profiles = players_df.groupby("player_name")[agg_features].mean()
player_profiles = player_profiles.dropna()
player_profiles.head()

Unnamed: 0_level_0,ace,df,1st_serve_pct,1st_serve_win_pct,2nd_serve_win_pct,bp_save_pct,SvGms
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Abedallah Shelbayh,4.142857,3.5,0.586241,0.6685,0.451588,0.567509,10.071429
Adam Moundir,1.0,4.0,0.513514,0.736842,0.5,0.6,11.0
Adam Neff,4.0,8.0,0.569444,0.585366,0.387097,0.636364,8.0
Adam Walton,5.692308,2.153846,0.64683,0.734747,0.50581,0.632613,12.846154
Adria Soriano Barrera,8.5,5.5,0.606285,0.757755,0.48538,0.708333,12.5


In [69]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(player_profiles)

similarity_matrix = cosine_similarity(X_scaled)
similarity_df = pd.DataFrame(similarity_matrix, index=player_profiles.index, columns=player_profiles.index)

similarity_df

player_name,Abedallah Shelbayh,Adam Moundir,Adam Neff,Adam Walton,Adria Soriano Barrera,Adrian Andreev,Adrian Mannarino,Ajeet Rai,Alan Fernando Rubio Fierros,Alastair Gray,...,Yuki Bhambri,Yunseong Chung,Yuta Shimizu,Zachary Svajda,Zdenek Kolar,Zhe Li,Zhizhen Zhang,Zizou Bergs,Zsombor Piros,Zura Tkemaladze
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abedallah Shelbayh,1.000000,0.562377,0.756626,-0.687643,-0.024227,0.122630,-0.272428,0.469377,0.516585,0.213086,...,0.670786,-0.442468,0.668598,-0.802842,0.294609,0.263136,-0.785527,-0.749917,-0.375603,0.232473
Adam Moundir,0.562377,1.000000,0.291296,-0.226063,0.175905,-0.127547,0.449654,0.373750,0.553499,0.233485,...,0.910587,-0.614363,0.285390,-0.407805,-0.049129,0.335000,-0.368371,-0.273567,-0.575098,-0.235500
Adam Neff,0.756626,0.291296,1.000000,-0.724011,0.294834,0.054241,-0.621209,0.483125,0.086673,0.600225,...,0.494377,-0.675797,0.431888,-0.770349,0.440618,-0.240633,-0.842834,-0.491219,-0.463522,0.303399
Adam Walton,-0.687643,-0.226063,-0.724011,1.000000,0.313387,-0.284516,0.393866,-0.694179,-0.133077,-0.101387,...,-0.305428,0.223929,-0.901788,0.382579,-0.485924,-0.363339,0.795111,0.819410,0.291949,-0.229437
Adria Soriano Barrera,-0.024227,0.175905,0.294834,0.313387,1.000000,-0.735469,-0.071853,0.153546,0.087514,0.709570,...,0.299254,-0.747973,-0.603584,-0.526690,-0.226504,-0.552733,0.165325,0.628295,-0.583621,-0.443801
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zhe Li,0.263136,0.335000,-0.240633,-0.363339,-0.552733,0.022961,0.514199,0.493485,0.514953,-0.600187,...,0.143695,0.313260,0.667233,0.096560,-0.155289,1.000000,0.048666,-0.459885,-0.084139,-0.356442
Zhizhen Zhang,-0.785527,-0.368371,-0.842834,0.795111,0.165325,-0.420794,0.530776,-0.320268,-0.077025,-0.396879,...,-0.540485,0.430883,-0.657566,0.512209,-0.646005,0.048666,1.000000,0.798245,0.243905,-0.547758
Zizou Bergs,-0.749917,-0.273567,-0.491219,0.819410,0.628295,-0.552219,0.316288,-0.324321,-0.297442,0.146314,...,-0.334831,-0.061396,-0.907741,0.253381,-0.560997,-0.459885,0.798245,1.000000,0.011072,-0.486897
Zsombor Piros,-0.375603,-0.575098,-0.463522,0.291949,-0.583621,0.717310,0.040717,-0.743398,-0.591425,-0.694046,...,-0.742452,0.823331,-0.090047,0.577185,-0.265042,-0.084139,0.243905,0.011072,1.000000,0.606327


In [70]:
def show_top_similar_players(player_name, top_n=10):
    if player_name not in similarity_df:
        print(f"⚠️ Player '{player_name}' not found.")
        return
    sims = similarity_df[player_name].sort_values(ascending=False)
    print(f"Top {top_n} most similar players to {player_name}:\n")
    print(sims[1:top_n+1])  # Skip self-match at index 0

In [71]:
show_top_similar_players('Novak Djokovic')

Top 10 most similar players to Novak Djokovic:

player_name
Stefanos Tsitsipas    0.977254
Ugo Humbert           0.962963
Roman Safiullin       0.952983
Zizou Bergs           0.943447
Dominic Thiem         0.930261
Karen Khachanov       0.927665
Ilya Ivashka          0.920699
Sebastian Korda       0.919478
Lukas Klein           0.905444
Andrey Rublev         0.893287
Name: Novak Djokovic, dtype: float64


In [72]:
show_top_similar_players('Carlos Alcaraz')

Top 10 most similar players to Carlos Alcaraz:

player_name
Rafael Nadal             0.953119
Casper Ruud              0.945037
Cameron Norrie           0.941398
Mitchell Krueger         0.938715
Pablo Carreno Busta      0.937473
Roberto Bautista Agut    0.934895
Lukas Lacko              0.910907
Alex Michelsen           0.885508
Juncheng Shang           0.883033
Filip Krajinovic         0.871522
Name: Carlos Alcaraz, dtype: float64


In [73]:
show_top_similar_players('Rafael Nadal')

Top 10 most similar players to Rafael Nadal:

player_name
Carlos Alcaraz      0.953119
Casper Ruud         0.916817
Cameron Norrie      0.910636
Emil Ruusuvuori     0.905496
Nuno Borges         0.897850
Mitchell Krueger    0.881985
Juncheng Shang      0.878852
Dominic Thiem       0.878346
Lukas Lacko         0.874037
Novak Djokovic      0.860885
Name: Rafael Nadal, dtype: float64
