The goal of this notebook is to compile the various feature files (stored as Parquet) into a single, comprehensible CSV file for a machine learning model.

In [None]:
import pandas as pd

# Cleaned match records first, then one table per engineered feature family.
df_original = pd.read_parquet('../../data/cleaned/atp_matches_cleaned.parquet')
df_skills = pd.read_parquet('../../data/features/base/player_performance.parquet')
df_elo = pd.read_parquet('../../data/features/base/glicko2_ratings.parquet')
df_exp = pd.read_parquet('../../data/features/base/experience.parquet')
df_fatigue = pd.read_parquet('../../data/features/base/fatigue.parquet')
df_hth = pd.read_parquet('../../data/features/base/head_to_head.parquet')
df_mom = pd.read_parquet('../../data/features/base/momentum.parquet')

# Stitch the tables together side by side. A column-wise concat aligns rows on the index.
# NOTE(review): presumably every feature table shares the cleaned matches' index — confirm.
dfs = [df_original, df_skills, df_elo, df_exp, df_fatigue, df_hth, df_mom]
df_full = pd.concat(dfs, axis='columns').pipe(
    # When a column name appears in more than one table, keep only its first occurrence.
    lambda merged: merged.loc[:, ~merged.columns.duplicated()]
)
df_full.tail()  # not the last expression of the cell, so Jupyter does not render this preview

# One line per column: name left-aligned in a 35-character field, then its dtype.
for col, dtype in zip(df_full.columns, df_full.dtypes):
    print(f"{col:<35} {dtype}")

surface                             object
draw_size                           int64
tourney_level                       object
tourney_date                        int64
id_a                                int64
name_a                              object
hand_a                              object
ht_a                                float64
age_a                               float64
id_b                                int64
name_b                              object
hand_b                              object
ht_b                                float64
age_b                               float64
score                               object
best_of                             int64
round                               object
minutes                             float64
ace_a                               float64
df_a                                float64
svpt_a                              float64
1stIn_a                             float64
1stWon_a                            float64
2ndWon

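Columns not used as features in the model can be dropped. Columns that encode information about the current match (its score, duration, and in-match statistics) are also dropped, since they would not be known before the match is played.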

In [2]:
cols_to_drop = [
    # Player names/ids and per-match bookkeeping that are not model inputs.
    'name_a', 'name_b', 'id_a', 'id_b', 'score', 'tourney_date', 'minutes',
    # In-match statistics: every stat for player a first, then the same stats for player b.
    *(
        f'{stat}_{side}'
        for side in ('a', 'b')
        for stat in (
            'ace', 'df', 'svpt', '1stIn', '1stWon', '2ndWon',
            'SvGms', 'bpSaved', 'bpFaced',
        )
    ),
]

# errors='ignore': labels that are absent from the frame are skipped instead of raising.
df_full = df_full.drop(cols_to_drop, axis='columns', errors='ignore')

One-hot encoding is used on categorical features.

In [3]:
# Expand the categorical (object-typed) columns into indicator columns.
# drop_first=False keeps one indicator per category level.
df_full = pd.get_dummies(data=df_full, drop_first=False)

# One line per column: name left-aligned in a 35-character field, then its dtype.
for col, dtype in zip(df_full.columns, df_full.dtypes):
    print(f"{col:<35} {dtype}")

draw_size                           int64
ht_a                                float64
age_a                               float64
ht_b                                float64
age_b                               float64
best_of                             int64
rank_a                              float64
rank_points_a                       float64
rank_b                              float64
rank_points_b                       float64
result                              int64
p_ace_a                             float64
p_ace_b                             float64
p_df_a                              float64
p_df_b                              float64
p_1stIn_a                           float64
p_1stIn_b                           float64
p_1stWon_a                          float64
p_1stWon_b                          float64
p_2ndWon_a                          float64
p_2ndWon_b                          float64
p_2ndWon_inPlay_a                   float64
p_2ndWon_inPlay_b                   fl

Models work with relative strength, not absolute strength. To reduce the dimensionality of this dataset, we collapse all paired player features (those ending in _a and _b) into comparative features. This way, the model learns the matchup directly rather than treating each player's raw stats as independent variables.

For most features, the comparative feature is simply the difference between the two players' values. For rank, however, the difference between the inverse ranks ($1/\text{rank}_a - 1/\text{rank}_b$) is used instead, because rank does not change linearly with "skill". Similarly, the difference of logarithms is taken for ATP ranking points (computed with `log1p`, i.e. $\log(1 + \text{points})$, so that zero points are handled), as ATP points scale exponentially.

In [4]:
import numpy as np
import pandas as pd

def build_matchup_features(df, extra_prefixes=()):
    """Collapse paired per-player columns (``*_a`` / ``*_b``) into matchup features.

    For every pair that is present in ``df``, both source columns are removed
    and one derived column is appended at the end of the frame, in this order:

    * ``height_diff`` = ``ht_a - ht_b`` and ``age_diff`` = ``age_a - age_b``
    * ``<prefix>_diff`` = ``<prefix>_a - <prefix>_b`` for each built-in prefix
      (``elo``, ``elo_surface``, the ``p_*`` rates, ``dominance_ratio``) and
      then for each prefix in ``extra_prefixes``
    * ``inv_rank_diff`` = ``1/rank_a - 1/rank_b``, where a rank of 0 is treated
      as missing (NaN)
    * ``log_rank_points_diff`` = ``log1p(rank_points_a) - log1p(rank_points_b)``

    A pair is skipped silently when either of its columns is absent, and every
    column that is not part of a handled pair passes through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the per-player columns. It is not modified.
    extra_prefixes : str or iterable of str, optional
        Additional column stems to collapse with a plain a-minus-b difference,
        e.g. ``['total_matches', 'recent_minutes']``. Defaults to none, in
        which case only the built-in pairs are collapsed.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the handled pairs replaced by comparative columns.
    """
    df = df.copy()

    # A lone string would otherwise be iterated character by character.
    if isinstance(extra_prefixes, str):
        extra_prefixes = [extra_prefixes]

    rate_prefixes = [
        'elo',
        'elo_surface',
        'p_ace',
        'p_df',
        'p_1stIn',
        'p_1stWon',
        'p_2ndWon',
        'p_2ndWon_inPlay',
        'p_bpSaved',
        'p_rpw',
        'p_retAceAgainst',
        'p_ret1stWon',
        'p_ret2ndWon',
        'p_ret2ndWon_inPlay',
        'p_bpConv',
        'p_totalPtsWon',
        'dominance_ratio',
    ]

    # Plain a-minus-b features as (column a, column b, output column), in output order.
    simple_diffs = [
        ('ht_a', 'ht_b', 'height_diff'),
        ('age_a', 'age_b', 'age_diff'),
    ]
    # dict.fromkeys de-duplicates while keeping first-seen order, so repeating a
    # built-in prefix in extra_prefixes is harmless.
    for prefix in dict.fromkeys([*rate_prefixes, *extra_prefixes]):
        simple_diffs.append((f'{prefix}_a', f'{prefix}_b', f'{prefix}_diff'))

    cols_to_drop = []

    for col_a, col_b, out_col in simple_diffs:
        if col_a in df.columns and col_b in df.columns:
            df[out_col] = df[col_a] - df[col_b]
            cols_to_drop += [col_a, col_b]

    # rank: use inverse rank difference; handle zeros as NaN
    if {'rank_a', 'rank_b'}.issubset(df.columns):
        rank_a = df['rank_a'].replace(0, np.nan)
        rank_b = df['rank_b'].replace(0, np.nan)
        df['inv_rank_diff'] = (1.0 / rank_a) - (1.0 / rank_b)
        cols_to_drop += ['rank_a', 'rank_b']

    # rank points: use log1p difference
    if {'rank_points_a', 'rank_points_b'}.issubset(df.columns):
        rp_a = df['rank_points_a'].astype(float)
        rp_b = df['rank_points_b'].astype(float)
        df['log_rank_points_diff'] = np.log1p(rp_a) - np.log1p(rp_b)
        cols_to_drop += ['rank_points_a', 'rank_points_b']

    # Drop the originals. A caller-supplied prefix may overlap a pair handled
    # above (e.g. 'rank'), so de-duplicate the labels first.
    if cols_to_drop:
        df = df.drop(columns=list(dict.fromkeys(cols_to_drop)))

    return df

In [6]:
df_balanced = build_matchup_features(df_full)

# Move the target column to the far right; every other column keeps its relative order.
df_balanced = df_balanced[
    [name for name in df_balanced.columns if name != 'result'] + ['result']
]

# Recast every boolean column to integers.
bool_cols = df_balanced.select_dtypes(include=['bool']).columns
df_balanced = df_balanced.astype(dict.fromkeys(bool_cols, int))

df_balanced.columns

Index(['draw_size', 'best_of', 'total_matches_a', 'total_matches_b',
       'total_surface_matches_a', 'total_surface_matches_b',
       'recent_matches_a', 'recent_matches_b', 'recent_minutes_a',
       'recent_minutes_b', 'hth_win_p_a', 'hth_matches', 'form_delta_a',
       'form_delta_b', 'elo_momentum_a', 'elo_momentum_b', 'surface_Carpet',
       'surface_Clay', 'surface_Grass', 'surface_Hard', 'tourney_level_A',
       'tourney_level_F', 'tourney_level_G', 'tourney_level_M', 'hand_a_A',
       'hand_a_L', 'hand_a_R', 'hand_a_U', 'hand_b_A', 'hand_b_L', 'hand_b_R',
       'hand_b_U', 'round_BR', 'round_ER', 'round_F', 'round_QF', 'round_R128',
       'round_R16', 'round_R32', 'round_R64', 'round_RR', 'round_SF',
       'height_diff', 'age_diff', 'elo_diff', 'elo_surface_diff', 'p_ace_diff',
       'p_df_diff', 'p_1stIn_diff', 'p_1stWon_diff', 'p_2ndWon_diff',
       'p_2ndWon_inPlay_diff', 'p_bpSaved_diff', 'p_rpw_diff',
       'p_retAceAgainst_diff', 'p_ret1stWon_diff', 'p_ret2nd