# Lexin Deang | INST414 | Similarity Metrics project

https://github.com/JeffSackmann/tennis_atp

## Importing Files

In [51]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
import os

Concatenating all files from 2020 - 2024

In [52]:
data_folder = "Data"

all_files = [os.path.join(data_folder, f) for f in os.listdir(data_folder) if f.endswith('.csv')]

df = pd.concat([pd.read_csv(file) for file in all_files], ignore_index=True)

In [53]:
df.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,2024-0339,Brisbane,Hard,32,A,20240101,300,105777,2.0,,...,58.0,44.0,16.0,11.0,8.0,9.0,14.0,2570.0,8.0,3660.0
1,2024-0339,Brisbane,Hard,32,A,20240101,299,208029,1.0,,...,35.0,31.0,10.0,11.0,5.0,7.0,8.0,3660.0,39.0,1122.0
2,2024-0339,Brisbane,Hard,32,A,20240101,298,105777,2.0,,...,39.0,24.0,14.0,10.0,5.0,7.0,14.0,2570.0,55.0,902.0
3,2024-0339,Brisbane,Hard,32,A,20240101,297,208029,1.0,,...,51.0,31.0,16.0,10.0,3.0,5.0,8.0,3660.0,116.0,573.0
4,2024-0339,Brisbane,Hard,32,A,20240101,296,126128,,,...,37.0,27.0,16.0,10.0,5.0,8.0,39.0,1122.0,44.0,1021.0


In [54]:
# --- Unified column map ---
common_cols = {
    'player_name': None,
    'age': None,
    'height': None,
    'rank_points': None,
    'ace': None,
    'df': None,
    'svpt': None,
    '1stIn': None,
    '1stWon': None,
    '2ndWon': None,
    'SvGms': None,
    'bpSaved': None,
    'bpFaced': None,
    'minutes': 'minutes'  # same for both
}

# Map winner and loser columns
winner_cols = {
    'player_name': 'winner_name',
    'age': 'winner_age',
    'height': 'winner_ht',
    'rank_points': 'winner_rank_points',
    'ace': 'w_ace',
    'df': 'w_df',
    'svpt': 'w_svpt',
    '1stIn': 'w_1stIn',
    '1stWon': 'w_1stWon',
    '2ndWon': 'w_2ndWon',
    'SvGms': 'w_SvGms',
    'bpSaved': 'w_bpSaved',
    'bpFaced': 'w_bpFaced',
    'minutes': 'minutes'
}

loser_cols = {
    'player_name': 'loser_name',
    'age': 'loser_age',
    'height': 'loser_ht',
    'rank_points': 'loser_rank_points',
    'ace': 'l_ace',
    'df': 'l_df',
    'svpt': 'l_svpt',
    '1stIn': 'l_1stIn',
    '1stWon': 'l_1stWon',
    '2ndWon': 'l_2ndWon',
    'SvGms': 'l_SvGms',
    'bpSaved': 'l_bpSaved',
    'bpFaced': 'l_bpFaced',
    'minutes': 'minutes'
}

# Extract and rename
winners_df = df[[v for v in winner_cols.values()]].rename(columns={v: k for k, v in winner_cols.items()})
losers_df = df[[v for v in loser_cols.values()]].rename(columns={v: k for k, v in loser_cols.items()})

# Combine
players_df = pd.concat([winners_df, losers_df], ignore_index=True)
players_df.dropna(inplace=True)  # Remove incomplete rows
players_df.head()

Unnamed: 0,player_name,age,height,rank_points,ace,df,svpt,1stIn,1stWon,2ndWon,SvGms,bpSaved,bpFaced,minutes
0,Grigor Dimitrov,32.6,191.0,2570.0,8.0,2.0,74.0,52.0,40.0,13.0,11.0,3.0,3.0,136.0
1,Holger Rune,20.6,188.0,3660.0,7.0,4.0,72.0,48.0,39.0,11.0,11.0,1.0,2.0,97.0
2,Grigor Dimitrov,32.6,191.0,2570.0,10.0,3.0,67.0,45.0,39.0,10.0,11.0,6.0,6.0,109.0
3,Holger Rune,20.6,188.0,3660.0,13.0,0.0,65.0,36.0,31.0,17.0,10.0,1.0,1.0,105.0
4,Roman Safiullin,26.4,185.0,1122.0,9.0,3.0,73.0,43.0,36.0,14.0,10.0,2.0,3.0,120.0


In [55]:
players_df["1st_serve_pct"] = players_df["1stIn"] / players_df["svpt"]
players_df["1st_serve_win_pct"] = players_df["1stWon"] / players_df["1stIn"]
players_df["2nd_serve_win_pct"] = players_df["2ndWon"] / (players_df["svpt"] - players_df["1stIn"])
players_df["bp_save_pct"] = players_df["bpSaved"] / players_df["bpFaced"]

# Clean up any division errors
players_df.replace([np.inf, -np.inf], np.nan, inplace=True) # Removes infinite values, replaces
players_df.dropna(inplace=True)

players_df.head()

Unnamed: 0,player_name,age,height,rank_points,ace,df,svpt,1stIn,1stWon,2ndWon,SvGms,bpSaved,bpFaced,minutes,1st_serve_pct,1st_serve_win_pct,2nd_serve_win_pct,bp_save_pct
0,Grigor Dimitrov,32.6,191.0,2570.0,8.0,2.0,74.0,52.0,40.0,13.0,11.0,3.0,3.0,136.0,0.702703,0.769231,0.590909,1.0
1,Holger Rune,20.6,188.0,3660.0,7.0,4.0,72.0,48.0,39.0,11.0,11.0,1.0,2.0,97.0,0.666667,0.8125,0.458333,0.5
2,Grigor Dimitrov,32.6,191.0,2570.0,10.0,3.0,67.0,45.0,39.0,10.0,11.0,6.0,6.0,109.0,0.671642,0.866667,0.454545,1.0
3,Holger Rune,20.6,188.0,3660.0,13.0,0.0,65.0,36.0,31.0,17.0,10.0,1.0,1.0,105.0,0.553846,0.861111,0.586207,1.0
4,Roman Safiullin,26.4,185.0,1122.0,9.0,3.0,73.0,43.0,36.0,14.0,10.0,2.0,3.0,120.0,0.589041,0.837209,0.466667,0.666667


In [56]:
agg_features = [
    "age", "height", "rank_points", "ace", "df", "1st_serve_pct",
    "1st_serve_win_pct", "2nd_serve_win_pct", "bp_save_pct",
    "SvGms"
]

# Grouping by player name under aggregate features means
player_profiles = players_df.groupby("player_name")[agg_features].mean()
player_profiles = player_profiles.dropna()
player_profiles.head()

Unnamed: 0_level_0,age,height,rank_points,ace,df,1st_serve_pct,1st_serve_win_pct,2nd_serve_win_pct,bp_save_pct,SvGms
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Abedallah Shelbayh,19.792857,180.0,262.357143,4.142857,3.5,0.586241,0.6685,0.451588,0.567509,10.071429
Adam Moundir,27.9,191.0,33.0,1.0,4.0,0.513514,0.736842,0.5,0.6,11.0
Adam Neff,22.6,183.0,23.0,4.0,8.0,0.569444,0.585366,0.387097,0.636364,8.0
Adam Walton,25.115385,183.0,563.230769,5.692308,2.153846,0.64683,0.734747,0.50581,0.632613,12.846154
Adria Soriano Barrera,24.6,191.0,59.0,8.5,5.5,0.606285,0.757755,0.48538,0.708333,12.5


In [62]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(player_profiles)

similarity_matrix = cosine_similarity(X_scaled)
similarity_df = pd.DataFrame(similarity_matrix, index=player_profiles.index, columns=player_profiles.index)

similarity_df

player_name,Abedallah Shelbayh,Adam Moundir,Adam Neff,Adam Walton,Adria Soriano Barrera,Adrian Andreev,Adrian Mannarino,Ajeet Rai,Alan Fernando Rubio Fierros,Alastair Gray,...,Yuki Bhambri,Yunseong Chung,Yuta Shimizu,Zachary Svajda,Zdenek Kolar,Zhe Li,Zhizhen Zhang,Zizou Bergs,Zsombor Piros,Zura Tkemaladze
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abedallah Shelbayh,1.000000,0.206209,0.608676,-0.243545,0.066380,0.565212,-0.660404,0.399972,0.496163,0.246258,...,0.131890,-0.037613,0.577337,0.304143,0.247124,-0.167112,-0.630244,-0.188523,0.073412,0.273805
Adam Moundir,0.206209,1.000000,0.265254,-0.264187,0.225667,-0.185761,0.134949,0.382262,0.372827,0.275122,...,0.866968,-0.570040,0.163218,-0.392007,-0.071162,0.353118,-0.245829,-0.294343,-0.570350,-0.182501
Adam Neff,0.608676,0.265254,1.000000,-0.653621,0.314843,0.178875,-0.424089,0.504868,0.150600,0.578903,...,0.397568,-0.542334,0.432926,-0.309477,0.443044,-0.272089,-0.765307,-0.397857,-0.346493,0.337881
Adam Walton,-0.243545,-0.264187,-0.653621,1.000000,0.278439,-0.079252,0.059763,-0.641722,-0.028310,-0.097415,...,-0.330132,0.258000,-0.708842,0.395448,-0.447370,-0.386445,0.579347,0.813720,0.340335,-0.202739
Adria Soriano Barrera,0.066380,0.225667,0.314843,0.278439,1.000000,-0.488085,-0.260321,0.223315,0.038568,0.739647,...,0.261588,-0.657272,-0.557040,-0.306231,-0.222784,-0.509526,0.156736,0.598841,-0.504854,-0.329070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zhe Li,-0.167112,0.353118,-0.272089,-0.386445,-0.509526,-0.182146,0.504165,0.362190,0.379805,-0.573996,...,0.312385,0.233395,0.497768,-0.188208,-0.158221,1.000000,0.030832,-0.530848,-0.182313,-0.367661
Zhizhen Zhang,-0.630244,-0.245829,-0.765307,0.579347,0.156736,-0.533155,0.248172,-0.284465,-0.376374,-0.230782,...,-0.450948,0.132654,-0.727240,-0.054287,-0.593901,0.030832,1.000000,0.605234,0.045425,-0.471348
Zizou Bergs,-0.188523,-0.294343,-0.397857,0.813720,0.598841,-0.230454,-0.122697,-0.249293,-0.198805,0.190989,...,-0.402477,-0.014650,-0.724151,0.342339,-0.507929,-0.530848,0.605234,1.000000,0.101912,-0.394778
Zsombor Piros,0.073412,-0.570350,-0.346493,0.340335,-0.504854,0.741511,-0.194793,-0.621885,-0.286615,-0.557674,...,-0.685900,0.823202,0.058487,0.615692,-0.205664,-0.182313,0.045425,0.101912,1.000000,0.591478


In [58]:
def show_top_similar_players(player_name, top_n=10):
    if player_name not in similarity_df:
        print(f"⚠️ Player '{player_name}' not found.")
        return
    sims = similarity_df[player_name].sort_values(ascending=False)
    print(f"Top {top_n} most similar players to {player_name}:\n")
    print(sims[1:top_n+1])  # Skip self-match at index 0

In [59]:
show_top_similar_players('Novak Djokovic')

Top 10 most similar players to Novak Djokovic:

player_name
Rafael Nadal          0.991941
Dominic Thiem         0.983162
Daniil Medvedev       0.969686
Stefanos Tsitsipas    0.951721
Andrey Rublev         0.939914
Casper Ruud           0.925721
Roger Federer         0.924248
Alexander Zverev      0.919610
Cameron Norrie        0.912438
Carlos Alcaraz        0.901002
Name: Novak Djokovic, dtype: float64


In [60]:
show_top_similar_players('Carlos Alcaraz')

Top 10 most similar players to Carlos Alcaraz:

player_name
Casper Ruud           0.985018
Jannik Sinner         0.956477
Stefanos Tsitsipas    0.951689
Andrey Rublev         0.925948
Holger Rune           0.925431
Daniil Medvedev       0.916803
Cameron Norrie        0.908105
Dominic Thiem         0.903624
Novak Djokovic        0.901002
Alex De Minaur        0.900661
Name: Carlos Alcaraz, dtype: float64


In [61]:
show_top_similar_players('Rafael Nadal')

Top 10 most similar players to Rafael Nadal:

player_name
Novak Djokovic        0.991941
Dominic Thiem         0.975772
Daniil Medvedev       0.937787
Roger Federer         0.924637
Stefanos Tsitsipas    0.917649
Cameron Norrie        0.908534
Casper Ruud           0.905980
Andrey Rublev         0.897834
Carlos Alcaraz        0.882698
Alexander Zverev      0.879999
Name: Rafael Nadal, dtype: float64
