### 1. Importing the Data

Data extracted from Jeff Sackmann's tennis ATP database.

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# List of CSV file paths (example: if files are in the 'data' folder)
start_year = 1980
end_year = 2024
files = []

for year in range(start_year, end_year + 1):
    files.append('../../data/raw/tennis_atp-master/atp_matches_' + str(year) + '.csv')

# Initialize an empty list to store the DataFrames
dfs = []

# Loop through the files, read each one, and append to the list
for file in files:
    df = pd.read_csv(file)
    dfs.append(df)

# Concatenate all DataFrames in the list vertically (stacking rows)
df = pd.concat(dfs, axis=0, ignore_index=True)

In [8]:
print(df.head())

  tourney_id      tourney_name surface  draw_size tourney_level  tourney_date  \
0  1980-1725  Pepsi Grand Slam    Clay          4             A      19800208   
1  1980-1725  Pepsi Grand Slam    Clay          4             A      19800208   
2  1980-1725  Pepsi Grand Slam    Clay          4             A      19800208   
3   1980-205             Lagos    Clay         32             A      19800225   
4   1980-205             Lagos    Clay         32             A      19800225   

   match_num  winner_id  winner_seed winner_entry  ... l_1stIn l_1stWon  \
0          1     100437          NaN          NaN  ...     NaN      NaN   
1          2     100342          NaN          NaN  ...     NaN      NaN   
2          3     100437          NaN          NaN  ...     NaN      NaN   
3          1     100259          1.0          NaN  ...     NaN      NaN   
4          2     100135          NaN          NaN  ...     NaN      NaN   

   l_2ndWon l_SvGms  l_bpSaved  l_bpFaced  winner_rank winner_

### 2. Cleaning the Data
There are some rows with missing values:

In [9]:
print(df.isnull().sum())
print("\nTotal Rows: " + str(len(df)))

tourney_id                 0
tourney_name               0
surface                   80
draw_size                  0
tourney_level              0
tourney_date               0
match_num                  0
winner_id                  0
winner_seed            87260
winner_entry          131760
winner_name                0
winner_hand                0
winner_ht               3867
winner_ioc                 0
winner_age                90
loser_id                   0
loser_seed            115800
loser_entry           120717
loser_name                 0
loser_hand                 4
loser_ht                8112
loser_ioc                  0
loser_age                357
score                      4
best_of                    0
round                      0
minutes                53187
w_ace                  50239
w_df                   50239
w_svpt                 50239
w_1stIn                50239
w_1stWon               50239
w_2ndWon               50239
w_SvGms                50239
w_bpSaved     

Many features have missing values, notably the seed and entry features. Because of the inconsistent data, we will not include these features in our dataset.

Features that have no predictive power like match_num or tourney_id can also be dropped. Player id and name can be kept for identification purposes.

Also since I'm predicting ATP level matches, data below this level is just noise, and could affect elo calculations. ATP players mostly play ATP, and Challenger players mostly play Challenger players. For this reason, I will only include ATP level matches in my database.

In [10]:
# Not enough data
columns_to_drop = ['winner_seed', 'winner_entry', 'loser_seed', 'loser_entry']

# Randomised features with almost no predictive power
columns_to_drop += [
    'tourney_id', 'tourney_name', 'match_num', 'winner_ioc', 'loser_ioc'
]

for col in columns_to_drop:
    if col in df.columns:
            df.drop(col, axis=1, inplace=True)

# Keep ATP matches only
ATP_LEVELS = {"G", "M", "A", "F"}
df = df[df["tourney_level"].isin(ATP_LEVELS)].copy()

All remaining features can be found before a match, and do not introduce any look-ahead bias, except for score, which is the value we are attempting to predict. We can now remove rows with missing values.

In [11]:
df.dropna(axis=0, inplace=True)

This dataframe is still not very useful for us, because most features are categorised as winner/loser. To convert this to usable training data, we should instead look at arbitrary features of players A and B, where it is unclear from the start who is the winner/loser. To accomplish this, we can create a new dataframe, where player A is the winner 50% of the time, and player B is the winner the other 50% of the time.

To keep track of who is the winner, we should also add a result column. Result be 1 if player A wins, and 0 if player A loses.

In [12]:
# rename columns
df = df.rename(columns={
    # === Player A (Winner) ===
    'winner_id'          : 'id_a',
    'winner_name'        : 'name_a',
    'winner_hand'        : 'hand_a',
    'winner_ht'          : 'ht_a',
    'winner_age'         : 'age_a',
    'winner_rank'        : 'rank_a',
    'winner_rank_points' : 'rank_points_a',
    
    # Winner Match Stats → Player A
    'w_ace'      : 'ace_a',
    'w_df'       : 'df_a',
    'w_svpt'     : 'svpt_a',
    'w_1stIn'    : '1stIn_a',
    'w_1stWon'   : '1stWon_a',
    'w_2ndWon'   : '2ndWon_a',
    'w_SvGms'    : 'SvGms_a',
    'w_bpSaved'  : 'bpSaved_a',
    'w_bpFaced'  : 'bpFaced_a',

    # === Player B (Loser) ===
    'loser_id'          : 'id_b',
    'loser_name'        : 'name_b',
    'loser_hand'        : 'hand_b',
    'loser_ht'          : 'ht_b',
    'loser_age'         : 'age_b',
    'loser_rank'        : 'rank_b',
    'loser_rank_points' : 'rank_points_b',
    
    # Loser Match Stats → Player B
    'l_ace'      : 'ace_b',
    'l_df'       : 'df_b',
    'l_svpt'     : 'svpt_b',
    'l_1stIn'    : '1stIn_b',
    'l_1stWon'   : '1stWon_b',
    'l_2ndWon'   : '2ndWon_b',
    'l_SvGms'    : 'SvGms_b',
    'l_bpSaved'  : 'bpSaved_b',
    'l_bpFaced'  : 'bpFaced_b',
})

rng = np.random.default_rng(seed=123)

# p(swap) = 0.5
swap_mask = rng.random(len(df)) < 0.5 

a_cols = ['id_a', 'name_a', 'hand_a', 'ht_a', 'age_a', 'rank_a', 'rank_points_a']
b_cols = ['id_b', 'name_b', 'hand_b', 'ht_b', 'age_b', 'rank_b', 'rank_points_b']

# extract both sides
a_values = df.loc[swap_mask, a_cols].values  # original A
b_values = df.loc[swap_mask, b_cols].values  # original B

# swap
df.loc[swap_mask, a_cols] = b_values
df.loc[swap_mask, b_cols] = a_values

# no swap means player A is winner
df['result'] = 0
df.loc[~swap_mask, 'result'] = 1

print(df.head())

      surface  draw_size tourney_level  tourney_date    id_a  \
39976    Hard         32             A      19910107  101142   
39977    Hard         32             A      19910107  100587   
39978    Hard         32             A      19910107  101601   
39979    Hard         32             A      19910107  101332   
39980    Hard         32             A      19910107  101735   

                 name_a hand_a   ht_a  age_a    id_b  ... 1stWon_b 2ndWon_b  \
39976    Emilio Sanchez      R  180.0   25.6  101746  ...     17.0      7.0   
39977         Steve Guy      R  188.0   31.8  101613  ...     22.0      6.0   
39978      Brett Steven      R  185.0   21.6  101179  ...     24.0     14.0   
39979       Gilad Bloom      L  173.0   23.8  101117  ...     38.0     15.0   
39980  Richard Fromberg      R  196.0   20.6  101901  ...     21.0     12.0   

       SvGms_b  bpSaved_b bpFaced_b  rank_a rank_points_a  rank_b  \
39976      8.0        2.0       6.0     9.0        1487.0    78.0   
39

We can filter out incredibly short games which are statistically insignificant. This is achieved by counting the total serve points. If this is less than 10, most likely a retirement occurred.

We can also check for corrupted games, where reported statistics are nonsensical. 

In [13]:
def clean_matches(df: pd.DataFrame) -> pd.DataFrame:
    # 1. Basic svpt sanity: require at least 10 serve points for BOTH players
    mask_svpt = (df["svpt_a"] >= 10) & (df["svpt_b"] >= 10)

    # 2. Corruption checks for side a
    valid_a = (
        (df["svpt_a"] > 0) &
        (df["1stIn_a"] <= df["svpt_a"]) &
        (df["1stWon_a"] <= df["1stIn_a"]) &
        (df["2ndWon_a"] <= (df["svpt_a"] - df["1stIn_a"]))
    )

    # 3. Corruption checks for side b
    valid_b = (
        (df["svpt_b"] > 0) &
        (df["1stIn_b"] <= df["svpt_b"]) &
        (df["1stWon_b"] <= df["1stIn_b"]) &
        (df["2ndWon_b"] <= (df["svpt_b"] - df["1stIn_b"]))
    )

    # 4. Combine all conditions
    clean_mask = mask_svpt & valid_a & valid_b

    print(f"Dropping {len(df) - clean_mask.sum()} corrupted/short matches")

    return df[clean_mask].reset_index(drop=True)

df = clean_matches(df)

Dropping 134 corrupted/short matches
