## Load Data

In [8]:
import pandas as pd
import glob
import os

DATA_PATH = "../data/raw/"
FIRST_YEAR, LAST_YEAR = 2018, 2024  # inclusive range of seasons to load


def season_from_path(path):
    """Return the season year encoded in a match file name, or None.

    Only names of the exact form ``atp_matches_<YYYY>.csv`` are recognised, so
    a stray file that happens to match the glob below (for example
    ``atp_matches_2018_backup.csv``) is skipped instead of crashing the
    ``int`` conversion.
    """
    stem = os.path.splitext(os.path.basename(path))[0]   # 'atp_matches_2018'
    prefix, _sep, suffix = stem.rpartition("_")          # ('atp_matches', '_', '2018')
    if prefix == "atp_matches" and len(suffix) == 4 and suffix.isdecimal():
        return int(suffix)
    return None


# The glob is intentionally broad; the explicit year filter is what restricts
# the load to FIRST_YEAR..LAST_YEAR.
candidates = sorted(glob.glob(os.path.join(DATA_PATH, "atp_matches_20*.csv")))
match_files = [
    path for path in candidates
    if season_from_path(path) is not None
    and FIRST_YEAR <= season_from_path(path) <= LAST_YEAR
]

# Fail with an actionable message instead of pandas' "No objects to concatenate".
if not match_files:
    raise FileNotFoundError(
        f"No atp_matches_<year>.csv files for {FIRST_YEAR}-{LAST_YEAR} "
        f"found under {DATA_PATH!r}"
    )

print("Files loaded:")
for f in match_files:
    print(f)

# One frame per season, each tagged with the year taken from its file name.
dfs = [
    pd.read_csv(path).assign(year=season_from_path(path))
    for path in match_files
]

matches = pd.concat(dfs, ignore_index=True)

print("\nTotal matches:", matches.shape[0])
print("Total columns:", matches.shape[1])

matches.head()

Files loaded:
../data/raw\atp_matches_2018.csv
../data/raw\atp_matches_2019.csv
../data/raw\atp_matches_2020.csv
../data/raw\atp_matches_2021.csv
../data/raw\atp_matches_2022.csv
../data/raw\atp_matches_2023.csv
../data/raw\atp_matches_2024.csv

Total matches: 18877
Total columns: 50


Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,year
0,2018-M020,Brisbane,Hard,32,A,20180101,271,105992,,,...,33.0,19.0,14.0,1.0,4.0,47.0,1010.0,52.0,909.0,2018
1,2018-M020,Brisbane,Hard,32,A,20180101,272,111577,,,...,25.0,7.0,9.0,7.0,11.0,54.0,890.0,94.0,593.0,2018
2,2018-M020,Brisbane,Hard,32,A,20180101,273,104797,,,...,37.0,29.0,15.0,10.0,16.0,63.0,809.0,30.0,1391.0,2018
3,2018-M020,Brisbane,Hard,32,A,20180101,275,200282,,WC,...,33.0,17.0,11.0,4.0,6.0,208.0,245.0,44.0,1055.0,2018
4,2018-M020,Brisbane,Hard,32,A,20180101,276,111581,,Q,...,28.0,5.0,9.0,0.0,2.0,175.0,299.0,68.0,755.0,2018


## Quick Inspection

In [17]:
# Column dtypes and non-null counts (info() prints directly to stdout).
matches.info()

# A cell only renders its final expression, so the summary table has to be
# passed to display() explicitly; otherwise it is computed and thrown away.
display(matches.describe(include="all").T)

# Names of the columns from position 20 onwards.
matches.columns.tolist()[20:]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18877 entries, 0 to 18876
Data columns (total 50 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   tourney_id          18877 non-null  object 
 1   tourney_name        18877 non-null  object 
 2   surface             18824 non-null  object 
 3   draw_size           18877 non-null  int64  
 4   tourney_level       18877 non-null  object 
 5   tourney_date        18877 non-null  int64  
 6   match_num           18877 non-null  int64  
 7   winner_id           18877 non-null  int64  
 8   winner_seed         7773 non-null   float64
 9   winner_entry        2885 non-null   object 
 10  winner_name         18877 non-null  object 
 11  winner_hand         18877 non-null  object 
 12  winner_ht           18798 non-null  float64
 13  winner_ioc          18877 non-null  object 
 14  winner_age          18874 non-null  float64
 15  loser_id            18877 non-null  int64  
 16  lose

['loser_ht',
 'loser_ioc',
 'loser_age',
 'score',
 'best_of',
 'round',
 'minutes',
 'w_ace',
 'w_df',
 'w_svpt',
 'w_1stIn',
 'w_1stWon',
 'w_2ndWon',
 'w_SvGms',
 'w_bpSaved',
 'w_bpFaced',
 'l_ace',
 'l_df',
 'l_svpt',
 'l_1stIn',
 'l_1stWon',
 'l_2ndWon',
 'l_SvGms',
 'l_bpSaved',
 'l_bpFaced',
 'winner_rank',
 'winner_rank_points',
 'loser_rank',
 'loser_rank_points',
 'year']

In [29]:
# Columns kept for the base table, grouped by what they describe.
selected_cols = [
    # match context
    "year",
    "tourney_date",
    "surface",
    "tourney_level",

    # winner profile
    "winner_id",
    "winner_name",
    "winner_rank",
    "winner_age",
    "winner_ht",

    # loser profile
    "loser_id",
    "loser_name",
    "loser_rank",
    "loser_age",
    "loser_ht",

    # serve statistics for winner (w_) and loser (l_)
    "w_ace", "w_df", "w_bpSaved", "w_bpFaced",
    "l_ace", "l_df", "l_bpSaved", "l_bpFaced"
]

# .copy() makes the reduced frame independent of the full one, so later column
# assignments on it do not raise SettingWithCopyWarning.
matches = matches[selected_cols].copy()

# Only the last expression of a cell is rendered; display() is needed for the
# preview to actually show up.
display(matches.head())
matches.shape


(18877, 22)

In [26]:
# Fraction of missing values per column, ordered from most to least incomplete.
na_share = matches.isna().mean()
missing_ratio = na_share.sort_values(ascending=False)

# Show only the columns that have at least one missing value.
missing_ratio.loc[missing_ratio > 0]


w_bpSaved      0.036764
w_bpFaced      0.036764
l_ace          0.036764
l_df           0.036764
l_bpSaved      0.036764
l_bpFaced      0.036764
w_df           0.036764
w_ace          0.036764
loser_ht       0.012025
loser_rank     0.011760
winner_ht      0.004185
winner_rank    0.004185
surface        0.002808
winner_age     0.000159
loser_age      0.000159
dtype: float64

In [28]:
# Write the reduced match table to the processed-data folder, creating the
# folder first if it does not exist yet. The row index is not written.
processed_dir = "../data/processed"
os.makedirs(processed_dir, exist_ok=True)
matches.to_csv(f"{processed_dir}/matches_base.csv", index=False)