In [1]:
pip install pandas matplotlib seaborn


Note: you may need to restart the kernel to use updated packages.


# Here is the Data Raw

In [2]:
import pandas as pd

url = "https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2024.csv"
df_2024 = pd.read_csv(url)
print(df_2024.head())


  tourney_id tourney_name surface  draw_size tourney_level  tourney_date  \
0  2024-0339     Brisbane    Hard         32             A      20240101   
1  2024-0339     Brisbane    Hard         32             A      20240101   
2  2024-0339     Brisbane    Hard         32             A      20240101   
3  2024-0339     Brisbane    Hard         32             A      20240101   
4  2024-0339     Brisbane    Hard         32             A      20240101   

   match_num  winner_id  winner_seed winner_entry  ... l_1stIn l_1stWon  \
0        300     105777          2.0          NaN  ...    58.0     44.0   
1        299     208029          1.0          NaN  ...    35.0     31.0   
2        298     105777          2.0          NaN  ...    39.0     24.0   
3        297     208029          1.0          NaN  ...    51.0     31.0   
4        296     126128          NaN          NaN  ...    37.0     27.0   

   l_2ndWon l_SvGms  l_bpSaved  l_bpFaced  winner_rank winner_rank_points  \
0      16.0    

# This is the data all compiled into one.

In [3]:
years = range(2020, 2025)
all_matches = pd.concat([
    pd.read_csv(f"https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_{year}.csv")
    for year in years
], ignore_index=True)

print(all_matches.shape)


(13174, 49)


# This is the company data all listed out.

In [4]:
# 🔹 View column names
print("🔹 Columns:")
print(all_matches.columns)
print("-" * 50)

# 🔹 View data types of each column
print("🔹 Data Types:")
print(all_matches.dtypes)
print("-" * 50)

# 🔹 Count how many matches were played on each surface
print("🔹 Match Counts by Surface:")
print(all_matches['surface'].value_counts())
print("-" * 50)


🔹 Columns:
Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round',
       'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points'],
      dtype='object')
--------------------------------------------------
🔹 Data Types:
tourney_id             object
tourney_name           object
surface                object
draw_size               int64
tourney_level          object
tourney_date            int64
match_num               in

# I want to clean all data.

In [5]:
# View the top 5 rows
print("🔼 Top of the DataFrame:")
print(all_matches.head())

# View the bottom 5 rows
print("\n🔽 Bottom of the DataFrame:")
print(all_matches.tail())


🔼 Top of the DataFrame:
  tourney_id tourney_name surface  draw_size tourney_level  tourney_date  \
0  2020-8888      Atp Cup    Hard         24             A      20200106   
1  2020-8888      Atp Cup    Hard         24             A      20200106   
2  2020-8888      Atp Cup    Hard         24             A      20200106   
3  2020-8888      Atp Cup    Hard         24             A      20200106   
4  2020-8888      Atp Cup    Hard         24             A      20200106   

   match_num  winner_id  winner_seed winner_entry  ... l_1stIn l_1stWon  \
0        300     104925          NaN          NaN  ...    51.0     39.0   
1        299     105138          NaN          NaN  ...    35.0     21.0   
2        298     104925          NaN          NaN  ...    57.0     35.0   
3        297     105583          NaN          NaN  ...    54.0     39.0   
4        296     104745          NaN          NaN  ...    55.0     37.0   

   l_2ndWon l_SvGms  l_bpSaved  l_bpFaced  winner_rank winner_rank_p

# Drop Columns I think arent worthy

In [6]:
#cols_to_drop = [
    #'tourney_id', 'tourney_name', 'match_num', 'winner_id', 'loser_id',
   #'winner_name', 'loser_name', 'score', 'winner_entry', 'loser_entry',
    #'tourney_date', 'draw_size'  # optionally drop or transform
#]


In [7]:
#all_matches1= all_matches.drop(columns=cols_to_drop)


In [8]:
# View the top 5 rows
#print("🔼 Top of the DataFrame:")
#print(all_matches1.head())

# View the bottom 5 rows
#print("\n🔽 Bottom of the DataFrame:")
#print(all_matches1.tail())

# Use a dropna

In [9]:
columns_to_check = [
    'winner_ht', 'loser_ht',
    'winner_age', 'loser_age',
    'w_ace', 'l_ace', 'w_df', 'l_df',
    'w_svpt', 'l_svpt', 'w_1stIn', 'l_1stIn',
    'w_1stWon', 'l_1stWon', 'w_2ndWon', 'l_2ndWon',
    'w_SvGms', 'l_SvGms', 'w_bpSaved', 'l_bpSaved',
    'w_bpFaced', 'l_bpFaced',
    'winner_rank', 'loser_rank',
    'winner_rank_points', 'loser_rank_points',
    'surface'
]

# Drop rows that have NaNs in any of these important columns
all_matches_filtered = all_matches.dropna(subset=columns_to_check)

# Optional: Reset the index after dropping
all_matches_filtered = all_matches_filtered.reset_index(drop=True)

# Preview cleaned data
print("✅ Cleaned DataFrame shape:", all_matches_filtered.shape)
print(all_matches_filtered.head())


✅ Cleaned DataFrame shape: (12480, 49)
  tourney_id tourney_name surface  draw_size tourney_level  tourney_date  \
0  2020-8888      Atp Cup    Hard         24             A      20200106   
1  2020-8888      Atp Cup    Hard         24             A      20200106   
2  2020-8888      Atp Cup    Hard         24             A      20200106   
3  2020-8888      Atp Cup    Hard         24             A      20200106   
4  2020-8888      Atp Cup    Hard         24             A      20200106   

   match_num  winner_id  winner_seed winner_entry  ... l_1stIn l_1stWon  \
0        300     104925          NaN          NaN  ...    51.0     39.0   
1        299     105138          NaN          NaN  ...    35.0     21.0   
2        298     104925          NaN          NaN  ...    57.0     35.0   
3        297     105583          NaN          NaN  ...    54.0     39.0   
4        296     104745          NaN          NaN  ...    55.0     37.0   

   l_2ndWon l_SvGms  l_bpSaved  l_bpFaced  winner_ran

#### ✅ Features Created:

- **WINNER_ID / LOSER_ID**: Player identifiers (useful for grouping or further analysis).
- **ATP_POINT_DIFF**: Difference in ATP ranking points between winner and loser.
- **ATP_RANK_DIFF**: Difference in ATP rankings (lower rank = stronger player).
- **AGE_DIFF**: Age gap between winner and loser.
- **HEIGHT_DIFF**: Difference in height (can impact serve performance).
- **MATCH_MINUTES**: Duration of the match (may correlate with match competitiveness).
- **SURFACE**: Type of court surface (e.g., Hard, Clay, Grass) — a key contextual factor.
- **TOURNEY_LEVEL**: Tournament level (e.g., Grand Slam, ATP 250, Challenger).
- **DRAW_SIZE**: Size of the tournament draw (e.g., 32, 64, 128).


In [10]:
final_data = pd.DataFrame({
    "WINNER_ID": all_matches_filtered["winner_id"],
    "LOSER_ID": all_matches_filtered["loser_id"],
    
    "ATP_POINT_DIFF": all_matches_filtered["winner_rank_points"] - all_matches_filtered["loser_rank_points"],
    "ATP_RANK_DIFF":  all_matches_filtered["winner_rank"] - all_matches_filtered["loser_rank"],
    
    "AGE_DIFF":        all_matches_filtered["winner_age"] - all_matches_filtered["loser_age"],
    "HEIGHT_DIFF":     all_matches_filtered["winner_ht"] - all_matches_filtered["loser_ht"],
    
    "MATCH_MINUTES":   all_matches_filtered["minutes"],
    "SURFACE":         all_matches_filtered["surface"],
    "TOURNEY_LEVEL":   all_matches_filtered["tourney_level"],
    "DRAW_SIZE":       all_matches_filtered["draw_size"],
    "TOURNEY_ID":   all_matches_filtered["tourney_id"] 
})


In [11]:
final_data

Unnamed: 0,WINNER_ID,LOSER_ID,ATP_POINT_DIFF,ATP_RANK_DIFF,AGE_DIFF,HEIGHT_DIFF,MATCH_MINUTES,SURFACE,TOURNEY_LEVEL,DRAW_SIZE,TOURNEY_ID
0,104925,104745,-930.0,1.0,-0.9,3.0,115.0,Hard,A,24,2020-8888
1,105138,105583,1084.0,-24.0,2.2,0.0,97.0,Hard,A,24,2020-8888
2,104925,106421,3350.0,-3.0,8.7,-10.0,167.0,Hard,A,24,2020-8888
3,105583,111575,-589.0,17.0,5.9,-15.0,108.0,Hard,A,24,2020-8888
4,104745,200282,8210.0,-17.0,12.7,2.0,133.0,Hard,A,24,2020-8888
...,...,...,...,...,...,...,...,...,...,...,...
12475,207134,133933,40.0,-250.0,-3.2,-8.0,87.0,Hard,D,4,2024-M-DC-2024-WG2-PO-TOG-INA-01
12476,121411,132374,187.0,-621.0,2.2,-10.0,65.0,Hard,D,4,2024-M-DC-2024-WG2-PO-TUN-CRC-01
12477,208364,209943,21.0,-124.0,2.1,-3.0,137.0,Clay,D,4,2024-M-DC-2024-WG2-PO-URU-MDA-01
12478,105430,208364,434.0,-480.0,10.3,-10.0,95.0,Clay,D,4,2024-M-DC-2024-WG2-PO-URU-MDA-01


# Head to Head and Other stats

In [12]:
from collections import defaultdict
from tqdm import tqdm

# Initialize H2H tracking dictionaries
h2h_dict = defaultdict(int)
h2h_surface_dict = defaultdict(lambda: defaultdict(int))

# Output lists for storing feature values
total_h2h = []
total_h2h_surface = []

# Loop over all matches
for idx, (w_id, l_id, surface) in enumerate(tqdm(
        zip(all_matches_filtered['winner_id'], 
            all_matches_filtered['loser_id'], 
            all_matches_filtered['surface']),
        total=len(all_matches_filtered),
        desc="Calculating H2H features")):

    # Total H2H record before this match
    wins = h2h_dict[(w_id, l_id)]
    losses = h2h_dict[(l_id, w_id)]
    total_h2h.append(wins - losses)

    # Surface-specific H2H
    wins_surface = h2h_surface_dict[surface][(w_id, l_id)]
    losses_surface = h2h_surface_dict[surface][(l_id, w_id)]
    total_h2h_surface.append(wins_surface - losses_surface)

    # Update H2H dictionaries after match
    h2h_dict[(w_id, l_id)] += 1
    h2h_surface_dict[surface][(w_id, l_id)] += 1

# Assign features to final_data
final_data["H2H_DIFF"] = total_h2h
final_data["H2H_SURFACE_DIFF"] = total_h2h_surface


Calculating H2H features: 100%|██████████| 12480/12480 [00:00<00:00, 318535.35it/s]


In [13]:
from collections import defaultdict
import numpy as np
from tqdm import tqdm

# ---- SETTINGS ----
K_VALUES_STATS = [3, 5, 10, 20, 50, 100, 200, 300, 2000]  # serve/return metrics
K_VALUES_WINRATE = [5, 10, 20, 50]  # win rate metrics

# ---- INIT DATA ----
# Player performance history
history_stats = defaultdict(lambda: defaultdict(list))  # player_id -> metric -> list
# Player win/loss history
history_results = defaultdict(list)

# ---- BASE FEATURES ----
# BEST_OF from tourney_level
final_data["BEST_OF"] = all_matches_filtered["tourney_level"].apply(
    lambda x: 5 if x in ["G", "D"] else 3
)

# Prepare storage for features
for K in K_VALUES_STATS:
    for stat in ["P_ACE", "P_DF", "P_1ST_IN", "P_1ST_WON", "P_2ND_WON", "P_BP_SAVED"]:
        final_data[f"{stat}_LAST_{K}_DIFF"] = np.nan
for K in K_VALUES_WINRATE:
    final_data[f"WIN_LAST_{K}_DIFF"] = np.nan

# ---- MAIN LOOP ----
for idx, row in tqdm(all_matches_filtered.iterrows(), total=len(all_matches_filtered), desc="Building features"):
    w_id, l_id = row.winner_id, row.loser_id

    # 1️⃣ --- WIN RATE FEATURES ---
    for K in K_VALUES_WINRATE:
        w_winrate = np.mean(history_results[w_id][-K:]) if history_results[w_id] else np.nan
        l_winrate = np.mean(history_results[l_id][-K:]) if history_results[l_id] else np.nan
        final_data.at[idx, f"WIN_LAST_{K}_DIFF"] = w_winrate - l_winrate

    # 2️⃣ --- SERVE/RETURN ROLLING FEATURES ---
    for K in K_VALUES_STATS:
        def safe_mean(lst): return np.mean(lst[-K:]) if lst else np.nan
        final_data.at[idx, f"P_ACE_LAST_{K}_DIFF"] = safe_mean(history_stats[w_id]["p_ace"]) - safe_mean(history_stats[l_id]["p_ace"])
        final_data.at[idx, f"P_DF_LAST_{K}_DIFF"] = safe_mean(history_stats[w_id]["p_df"]) - safe_mean(history_stats[l_id]["p_df"])
        final_data.at[idx, f"P_1ST_IN_LAST_{K}_DIFF"] = safe_mean(history_stats[w_id]["p_1stIn"]) - safe_mean(history_stats[l_id]["p_1stIn"])
        final_data.at[idx, f"P_1ST_WON_LAST_{K}_DIFF"] = safe_mean(history_stats[w_id]["p_1stWon"]) - safe_mean(history_stats[l_id]["p_1stWon"])
        final_data.at[idx, f"P_2ND_WON_LAST_{K}_DIFF"] = safe_mean(history_stats[w_id]["p_2ndWon"]) - safe_mean(history_stats[l_id]["p_2ndWon"])
        final_data.at[idx, f"P_BP_SAVED_LAST_{K}_DIFF"] = safe_mean(history_stats[w_id]["p_bpSaved"]) - safe_mean(history_stats[l_id]["p_bpSaved"])

    # 3️⃣ --- UPDATE HISTORIES ---
    # Winner stats
    if row.w_svpt > 0:
        history_stats[w_id]["p_ace"].append(100 * row.w_ace / row.w_svpt)
        history_stats[w_id]["p_df"].append(100 * row.w_df / row.w_svpt)
        history_stats[w_id]["p_1stIn"].append(100 * row.w_1stIn / row.w_svpt)
        if row.w_1stIn > 0:
            history_stats[w_id]["p_1stWon"].append(100 * row.w_1stWon / row.w_1stIn)
        if row.w_svpt - row.w_1stIn > 0:
            history_stats[w_id]["p_2ndWon"].append(100 * row.w_2ndWon / (row.w_svpt - row.w_1stIn))
        if row.w_bpFaced > 0:
            history_stats[w_id]["p_bpSaved"].append(100 * row.w_bpSaved / row.w_bpFaced)

    # Loser stats
    if row.l_svpt > 0:
        history_stats[l_id]["p_ace"].append(100 * row.l_ace / row.l_svpt)
        history_stats[l_id]["p_df"].append(100 * row.l_df / row.l_svpt)
        history_stats[l_id]["p_1stIn"].append(100 * row.l_1stIn / row.l_svpt)
        if row.l_1stIn > 0:
            history_stats[l_id]["p_1stWon"].append(100 * row.l_1stWon / row.l_1stIn)
        if row.l_svpt - row.l_1stIn > 0:
            history_stats[l_id]["p_2ndWon"].append(100 * row.l_2ndWon / (row.l_svpt - row.l_1stIn))
        if row.l_bpFaced > 0:
            history_stats[l_id]["p_bpSaved"].append(100 * row.l_bpSaved / row.l_bpFaced)

    # Win/loss history update
    history_results[w_id].append(1)
    history_results[l_id].append(0)


Building features: 100%|██████████| 12480/12480 [00:26<00:00, 465.93it/s]


In [14]:
final_data

Unnamed: 0,WINNER_ID,LOSER_ID,ATP_POINT_DIFF,ATP_RANK_DIFF,AGE_DIFF,HEIGHT_DIFF,MATCH_MINUTES,SURFACE,TOURNEY_LEVEL,DRAW_SIZE,...,P_ACE_LAST_2000_DIFF,P_DF_LAST_2000_DIFF,P_1ST_IN_LAST_2000_DIFF,P_1ST_WON_LAST_2000_DIFF,P_2ND_WON_LAST_2000_DIFF,P_BP_SAVED_LAST_2000_DIFF,WIN_LAST_5_DIFF,WIN_LAST_10_DIFF,WIN_LAST_20_DIFF,WIN_LAST_50_DIFF
0,104925,104745,-930.0,1.0,-0.9,3.0,115.0,Hard,A,24,...,,,,,,,,,,
1,105138,105583,1084.0,-24.0,2.2,0.0,97.0,Hard,A,24,...,,,,,,,,,,
2,104925,106421,3350.0,-3.0,8.7,-10.0,167.0,Hard,A,24,...,,,,,,,,,,
3,105583,111575,-589.0,17.0,5.9,-15.0,108.0,Hard,A,24,...,,,,,,,,,,
4,104745,200282,8210.0,-17.0,12.7,2.0,133.0,Hard,A,24,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12475,207134,133933,40.0,-250.0,-3.2,-8.0,87.0,Hard,D,4,...,,,,,,,,,,
12476,121411,132374,187.0,-621.0,2.2,-10.0,65.0,Hard,D,4,...,,,,,,,,,,
12477,208364,209943,21.0,-124.0,2.1,-3.0,137.0,Clay,D,4,...,,,,,,,,,,
12478,105430,208364,434.0,-480.0,10.3,-10.0,95.0,Clay,D,4,...,2.043732,1.164348,-8.176585,2.366151,-2.73464,-10.701242,-0.8,-0.7,-0.7,-0.62


# PLOT

In [15]:
import seaborn as sns

In [16]:
#sns.pairplot(final_data)

# This was to check the different columns and make sure nothing was missing 

In [17]:
print("🔹 Columns:")
print(final_data.columns)
print("-" * 50)

🔹 Columns:
Index(['WINNER_ID', 'LOSER_ID', 'ATP_POINT_DIFF', 'ATP_RANK_DIFF', 'AGE_DIFF',
       'HEIGHT_DIFF', 'MATCH_MINUTES', 'SURFACE', 'TOURNEY_LEVEL', 'DRAW_SIZE',
       'TOURNEY_ID', 'H2H_DIFF', 'H2H_SURFACE_DIFF', 'BEST_OF',
       'P_ACE_LAST_3_DIFF', 'P_DF_LAST_3_DIFF', 'P_1ST_IN_LAST_3_DIFF',
       'P_1ST_WON_LAST_3_DIFF', 'P_2ND_WON_LAST_3_DIFF',
       'P_BP_SAVED_LAST_3_DIFF', 'P_ACE_LAST_5_DIFF', 'P_DF_LAST_5_DIFF',
       'P_1ST_IN_LAST_5_DIFF', 'P_1ST_WON_LAST_5_DIFF',
       'P_2ND_WON_LAST_5_DIFF', 'P_BP_SAVED_LAST_5_DIFF', 'P_ACE_LAST_10_DIFF',
       'P_DF_LAST_10_DIFF', 'P_1ST_IN_LAST_10_DIFF', 'P_1ST_WON_LAST_10_DIFF',
       'P_2ND_WON_LAST_10_DIFF', 'P_BP_SAVED_LAST_10_DIFF',
       'P_ACE_LAST_20_DIFF', 'P_DF_LAST_20_DIFF', 'P_1ST_IN_LAST_20_DIFF',
       'P_1ST_WON_LAST_20_DIFF', 'P_2ND_WON_LAST_20_DIFF',
       'P_BP_SAVED_LAST_20_DIFF', 'P_ACE_LAST_50_DIFF', 'P_DF_LAST_50_DIFF',
       'P_1ST_IN_LAST_50_DIFF', 'P_1ST_WON_LAST_50_DIFF',
       'P_2ND_WON_L

# Making sure that Double Faults are added back

In [18]:
# Calculate DF_GAMES_DIFF: proxy for DF per service game
df_games_winner = all_matches_filtered["w_df"] / all_matches_filtered["w_SvGms"]
df_games_loser = all_matches_filtered["l_df"] / all_matches_filtered["l_SvGms"]

# Handle division by zero or missing values
df_games_winner = df_games_winner.replace([np.inf, -np.inf], np.nan).fillna(0)
df_games_loser = df_games_loser.replace([np.inf, -np.inf], np.nan).fillna(0)

# Difference between winner and loser
final_data["DIFF_N_GAMES"] = df_games_winner - df_games_loser


In [19]:
print("🔹 Columns:")
print(final_data.columns)
print("-" * 50)

🔹 Columns:
Index(['WINNER_ID', 'LOSER_ID', 'ATP_POINT_DIFF', 'ATP_RANK_DIFF', 'AGE_DIFF',
       'HEIGHT_DIFF', 'MATCH_MINUTES', 'SURFACE', 'TOURNEY_LEVEL', 'DRAW_SIZE',
       'TOURNEY_ID', 'H2H_DIFF', 'H2H_SURFACE_DIFF', 'BEST_OF',
       'P_ACE_LAST_3_DIFF', 'P_DF_LAST_3_DIFF', 'P_1ST_IN_LAST_3_DIFF',
       'P_1ST_WON_LAST_3_DIFF', 'P_2ND_WON_LAST_3_DIFF',
       'P_BP_SAVED_LAST_3_DIFF', 'P_ACE_LAST_5_DIFF', 'P_DF_LAST_5_DIFF',
       'P_1ST_IN_LAST_5_DIFF', 'P_1ST_WON_LAST_5_DIFF',
       'P_2ND_WON_LAST_5_DIFF', 'P_BP_SAVED_LAST_5_DIFF', 'P_ACE_LAST_10_DIFF',
       'P_DF_LAST_10_DIFF', 'P_1ST_IN_LAST_10_DIFF', 'P_1ST_WON_LAST_10_DIFF',
       'P_2ND_WON_LAST_10_DIFF', 'P_BP_SAVED_LAST_10_DIFF',
       'P_ACE_LAST_20_DIFF', 'P_DF_LAST_20_DIFF', 'P_1ST_IN_LAST_20_DIFF',
       'P_1ST_WON_LAST_20_DIFF', 'P_2ND_WON_LAST_20_DIFF',
       'P_BP_SAVED_LAST_20_DIFF', 'P_ACE_LAST_50_DIFF', 'P_DF_LAST_50_DIFF',
       'P_1ST_IN_LAST_50_DIFF', 'P_1ST_WON_LAST_50_DIFF',
       'P_2ND_WON_L

# ELO


In [20]:
# =====================================
# ELO CALCULATION (LEAK-FREE, MERGED INTO final_data)
# =====================================

import pandas as pd
from collections import defaultdict

# --- Parameters ---
BASE_RATING = 1500
K_FACTORS = [5, 10, 20, 100, 200, 500]   # multi-K Elo factors (overall + surface) -- exclude 50 here
K_SURFACE = 50                           # single surface Elo factor

# --- Add stable row key to matches and final_data ---
all_matches_filtered = all_matches_filtered.copy()
all_matches_filtered["ROW_ID"] = all_matches_filtered.index
all_matches_filtered["tourney_date"] = pd.to_datetime(
    all_matches_filtered["tourney_date"], format="%Y%m%d"
)

final_data = final_data.copy()
final_data["ROW_ID"] = all_matches_filtered["ROW_ID"].values

# --- Sort chronologically for Elo computation ---
matches_sorted = all_matches_filtered.sort_values("tourney_date").reset_index(drop=True)

# --- Elo storage ---
elo_overall = {K: defaultdict(lambda: BASE_RATING) for K in K_FACTORS}
elo_surface = {K: defaultdict(lambda: BASE_RATING) for K in K_FACTORS + [K_SURFACE]}

# --- Store pre-match diffs ---
elo_grad_diffs = {K: [] for K in K_FACTORS}
elo_grad_surface_diffs = {K: [] for K in K_FACTORS}
elo_surface_simple_diff = []  # K_SURFACE only

# --- Loop over matches in time order ---
for _, row in matches_sorted.iterrows():
    w, l = row.winner_id, row.loser_id
    sfc = row.surface

    # Overall Elo (multi-K)
    for K in K_FACTORS:
        rw, rl = elo_overall[K][w], elo_overall[K][l]
        elo_grad_diffs[K].append(rw - rl)  # pre-match diff
        Ew = 1 / (1 + 10 ** ((rl - rw) / 400))
        elo_overall[K][w] = rw + K * (1 - Ew)
        elo_overall[K][l] = rl - K * (Ew)

    # Surface Elo (multi-K)
    for K in K_FACTORS:
        rw, rl = elo_surface[K][(w, sfc)], elo_surface[K][(l, sfc)]
        elo_grad_surface_diffs[K].append(rw - rl)  # pre-match diff
        Ew = 1 / (1 + 10 ** ((rl - rw) / 400))
        elo_surface[K][(w, sfc)] = rw + K * (1 - Ew)
        elo_surface[K][(l, sfc)] = rl - K * (Ew)

    # Single-K surface Elo
    rw, rl = elo_surface[K_SURFACE][(w, sfc)], elo_surface[K_SURFACE][(l, sfc)]
    elo_surface_simple_diff.append(rw - rl)  # pre-match diff
    Ew = 1 / (1 + 10 ** ((rl - rw) / 400))
    elo_surface[K_SURFACE][(w, sfc)] = rw + K_SURFACE * (1 - Ew)
    elo_surface[K_SURFACE][(l, sfc)] = rl - K_SURFACE * (Ew)

# --- Attach Elo columns to matches_sorted ---
for K in K_FACTORS:
    matches_sorted[f"ELO_GRAD_{K}_DIFF"]  = elo_grad_diffs[K]
    matches_sorted[f"ELO_GRAD_{K}S_DIFF"] = elo_grad_surface_diffs[K]
matches_sorted["ELO_SURFACE_DIFF"] = elo_surface_simple_diff

# --- Merge Elo features into final_data by ROW_ID ---
elo_cols = [c for c in matches_sorted.columns if c.startswith("ELO_")]
elo_df = matches_sorted[["ROW_ID"] + elo_cols]

# Ensure keys match up
assert final_data["ROW_ID"].is_unique and elo_df["ROW_ID"].is_unique

final_data = final_data.merge(elo_df, on="ROW_ID", how="left")
print("Elo features added:", elo_cols)

# --- Sanity checks ---
assert len(final_data) == len(all_matches_filtered)
print("Missing Elo values:", final_data.filter(like="ELO_").isna().sum().sum())


Elo features added: ['ELO_GRAD_5_DIFF', 'ELO_GRAD_5S_DIFF', 'ELO_GRAD_10_DIFF', 'ELO_GRAD_10S_DIFF', 'ELO_GRAD_20_DIFF', 'ELO_GRAD_20S_DIFF', 'ELO_GRAD_100_DIFF', 'ELO_GRAD_100S_DIFF', 'ELO_GRAD_200_DIFF', 'ELO_GRAD_200S_DIFF', 'ELO_GRAD_500_DIFF', 'ELO_GRAD_500S_DIFF', 'ELO_SURFACE_DIFF']
Missing Elo values: 0


# xgboost


In [21]:
import pandas as pd

assert "final_data" in globals(), "final_data is not defined in this notebook."
print(final_data.shape)
final_data.head()


(12480, 87)


Unnamed: 0,WINNER_ID,LOSER_ID,ATP_POINT_DIFF,ATP_RANK_DIFF,AGE_DIFF,HEIGHT_DIFF,MATCH_MINUTES,SURFACE,TOURNEY_LEVEL,DRAW_SIZE,...,ELO_GRAD_10S_DIFF,ELO_GRAD_20_DIFF,ELO_GRAD_20S_DIFF,ELO_GRAD_100_DIFF,ELO_GRAD_100S_DIFF,ELO_GRAD_200_DIFF,ELO_GRAD_200S_DIFF,ELO_GRAD_500_DIFF,ELO_GRAD_500S_DIFF,ELO_SURFACE_DIFF
0,104925,104745,-930.0,1.0,-0.9,3.0,115.0,Hard,A,24,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,105138,105583,1084.0,-24.0,2.2,0.0,97.0,Hard,A,24,...,-9.29617,-17.248855,-17.248855,-44.226866,-44.226866,-34.386835,-34.386835,-85.370082,-85.370082,-34.032647
2,104925,106421,3350.0,-3.0,8.7,-10.0,167.0,Hard,A,24,...,19.503715,38.045542,38.045542,157.540792,157.540792,257.76018,257.76018,425.918492,425.918492,88.383563
3,105583,111575,-589.0,17.0,5.9,-15.0,108.0,Hard,A,24,...,9.498574,18.004831,18.004831,53.250338,53.250338,34.872008,34.872008,-101.321817,-101.321817,37.776118
4,104745,200282,8210.0,-17.0,12.7,2.0,133.0,Hard,A,24,...,-15.142837,-30.566756,-30.566756,-163.05341,-163.05341,-345.819036,-345.819036,-939.915025,-939.915025,-78.446485


In [22]:
import pandas as pd
import numpy as np

df = final_data.copy()
df["TARGET"] = 1  # winner perspective = win

# Create flipped rows (loser perspective)
flipped = df.copy()

# Swap IDs
if {"WINNER_ID","LOSER_ID"}.issubset(df.columns):
    flipped["WINNER_ID"], flipped["LOSER_ID"] = df["LOSER_ID"], df["WINNER_ID"]

# Flip all *_DIFF signs
diff_cols = [c for c in df.columns if c.endswith("_DIFF")]
flipped[diff_cols] = -df[diff_cols].values

# Label flipped as losses
flipped["TARGET"] = 0

# Combine
full_data = pd.concat([df, flipped], ignore_index=True)

print("Class balance:", full_data["TARGET"].value_counts().to_dict())
full_data.head()


Class balance: {1: 12480, 0: 12480}


Unnamed: 0,WINNER_ID,LOSER_ID,ATP_POINT_DIFF,ATP_RANK_DIFF,AGE_DIFF,HEIGHT_DIFF,MATCH_MINUTES,SURFACE,TOURNEY_LEVEL,DRAW_SIZE,...,ELO_GRAD_20_DIFF,ELO_GRAD_20S_DIFF,ELO_GRAD_100_DIFF,ELO_GRAD_100S_DIFF,ELO_GRAD_200_DIFF,ELO_GRAD_200S_DIFF,ELO_GRAD_500_DIFF,ELO_GRAD_500S_DIFF,ELO_SURFACE_DIFF,TARGET
0,104925,104745,-930.0,1.0,-0.9,3.0,115.0,Hard,A,24,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,105138,105583,1084.0,-24.0,2.2,0.0,97.0,Hard,A,24,...,-17.248855,-17.248855,-44.226866,-44.226866,-34.386835,-34.386835,-85.370082,-85.370082,-34.032647,1
2,104925,106421,3350.0,-3.0,8.7,-10.0,167.0,Hard,A,24,...,38.045542,38.045542,157.540792,157.540792,257.76018,257.76018,425.918492,425.918492,88.383563,1
3,105583,111575,-589.0,17.0,5.9,-15.0,108.0,Hard,A,24,...,18.004831,18.004831,53.250338,53.250338,34.872008,34.872008,-101.321817,-101.321817,37.776118,1
4,104745,200282,8210.0,-17.0,12.7,2.0,133.0,Hard,A,24,...,-30.566756,-30.566756,-163.05341,-163.05341,-345.819036,-345.819036,-939.915025,-939.915025,-78.446485,1


In [23]:
# Columns that leak the outcome or are post-match
leak_cols = [c for c in ["MATCH_MINUTES", "DIFF_N_GAMES"] if c in full_data.columns]

# IDs that are noisy/leaky for modeling
id_like = [c for c in ["WINNER_ID", "LOSER_ID", "TOURNEY_ID"] if c in full_data.columns]

TARGET_COL = "TARGET"

# Categorical to one-hot
cat_cols = [c for c in ["SURFACE", "TOURNEY_LEVEL"] if c in full_data.columns]

drop_cols = leak_cols + id_like + [TARGET_COL]

X_raw = full_data.drop(columns=drop_cols, errors="ignore")
y = full_data[TARGET_COL].astype(int)

# One-hot encode categoricals
X = pd.get_dummies(X_raw, columns=[c for c in cat_cols if c in X_raw.columns], drop_first=True)

print("X shape:", X.shape)
X.head()


X shape: (24960, 87)


Unnamed: 0,ATP_POINT_DIFF,ATP_RANK_DIFF,AGE_DIFF,HEIGHT_DIFF,DRAW_SIZE,H2H_DIFF,H2H_SURFACE_DIFF,BEST_OF,P_ACE_LAST_3_DIFF,P_DF_LAST_3_DIFF,...,ELO_GRAD_500_DIFF,ELO_GRAD_500S_DIFF,ELO_SURFACE_DIFF,SURFACE_Grass,SURFACE_Hard,TOURNEY_LEVEL_D,TOURNEY_LEVEL_F,TOURNEY_LEVEL_G,TOURNEY_LEVEL_M,TOURNEY_LEVEL_O
0,-930.0,1.0,-0.9,3.0,24,0.0,0.0,3,,,...,0.0,0.0,0.0,False,True,False,False,False,False,False
1,1084.0,-24.0,2.2,0.0,24,0.0,0.0,3,,,...,-85.370082,-85.370082,-34.032647,False,True,False,False,False,False,False
2,3350.0,-3.0,8.7,-10.0,24,0.0,0.0,3,,,...,425.918492,425.918492,88.383563,False,True,False,False,False,False,False
3,-589.0,17.0,5.9,-15.0,24,0.0,0.0,3,,,...,-101.321817,-101.321817,37.776118,False,True,False,False,False,False,False
4,8210.0,-17.0,12.7,2.0,24,0.0,0.0,3,,,...,-939.915025,-939.915025,-78.446485,False,True,False,False,False,False,False


In [24]:
from sklearn.model_selection import train_test_split

if "TOURNEY_DATE" in full_data.columns:
    # Ensure datetime type; your data might be int yyyymmdd—adjust if needed
    try:
        dates = pd.to_datetime(full_data["TOURNEY_DATE"])
    except Exception:
        dates = pd.to_datetime(full_data["TOURNEY_DATE"], format="%Y%m%d", errors="coerce")

    # Chronological split: last 20% as validation
    order = dates.sort_values().index
    cut = int(len(order) * 0.8)
    train_idx = order[:cut]
    valid_idx = order[cut:]

    X_train, X_valid = X.loc[train_idx], X.loc[valid_idx]
    y_train, y_valid = y.loc[train_idx], y.loc[valid_idx]
else:
    # Fallback to random split
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.20, random_state=42, stratify=y
    )

print("Train:", X_train.shape, " Valid:", X_valid.shape)


Train: (19968, 87)  Valid: (4992, 87)


In [25]:
import numpy as np
import xgboost as xgb

# Convert to DMatrix (older xgb likes float32)
dtrain = xgb.DMatrix(X_train.values.astype(np.float32), label=y_train.values.astype(np.float32))
dvalid = xgb.DMatrix(X_valid.values.astype(np.float32), label=y_valid.values.astype(np.float32))

params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "eta": 0.03,              # learning rate
    "max_depth": 6,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "lambda": 1.0,            # L2 reg
    "tree_method": "hist",    # use "auto" if your version complains
    "seed": 42,
    "scale_pos_weight": float((y_train==0).sum())/max(1,(y_train==1).sum()),
}

num_boost_round = 5000
watchlist = [(dtrain, "train"), (dvalid, "valid")]

# Try early stopping; if your xgboost is too old, fall back gracefully
try:
    bst = xgb.train(
        params, dtrain, num_boost_round=num_boost_round, evals=watchlist,
        early_stopping_rounds=100, verbose_eval=50
    )
    best_ntree = getattr(bst, "best_ntree_limit", None)
    best_iter = getattr(bst, "best_iteration", None)
except TypeError:
    # No early stopping support—train fixed rounds
    bst = xgb.train(params, dtrain, num_boost_round=1500, evals=watchlist, verbose_eval=50)
    best_ntree = getattr(bst, "best_ntree_limit", None)
    best_iter = getattr(bst, "best_iteration", None)

print("Best ntree limit:", best_ntree, " Best iteration:", best_iter)


[0]	train-auc:0.72345	valid-auc:0.66936
[50]	train-auc:0.77077	valid-auc:0.70929
[100]	train-auc:0.79477	valid-auc:0.71170
[150]	train-auc:0.81861	valid-auc:0.71422
[200]	train-auc:0.84391	valid-auc:0.71593
[250]	train-auc:0.86591	valid-auc:0.71614
[300]	train-auc:0.88758	valid-auc:0.71593
[323]	train-auc:0.89622	valid-auc:0.71569
Best ntree limit: None  Best iteration: 224


In [26]:
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, classification_report

# Predict with compatibility for old/new xgb
try:
    p_valid = bst.predict(dvalid, iteration_range=(0, bst.best_iteration + 1))
except Exception:
    p_valid = bst.predict(dvalid, ntree_limit=best_ntree) if best_ntree else bst.predict(dvalid)

pred = (p_valid >= 0.5).astype(int)

auc = roc_auc_score(y_valid, p_valid)
acc = accuracy_score(y_valid, pred)

print(f"AUC: {auc:.4f} | ACC: {acc:.4f}")
print("\nConfusion matrix:\n", confusion_matrix(y_valid, pred))
print("\nClassification report:\n", classification_report(y_valid, pred, digits=3))


AUC: 0.7165 | ACC: 0.6532

Confusion matrix:
 [[1641  855]
 [ 876 1620]]

Classification report:
               precision    recall  f1-score   support

           0      0.652     0.657     0.655      2496
           1      0.655     0.649     0.652      2496

    accuracy                          0.653      4992
   macro avg      0.653     0.653     0.653      4992
weighted avg      0.653     0.653     0.653      4992



In [27]:
# Gain-based importance from the booster
score_dict = bst.get_score(importance_type="gain")  # keys like 'f0','f1',...

feat_names = X_train.columns.tolist()
mapped = []
for k, v in score_dict.items():
    try:
        idx = int(k[1:])  # 'f12' -> 12
        mapped.append((feat_names[idx], v))
    except:
        # If xgboost returns raw feature names (rare when you pass DMatrix with feature_names),
        # just keep the key
        mapped.append((k, v))

sorted_mapped = sorted(mapped, key=lambda x: x[1], reverse=True)

print("\nTop features by importance (gain):")
for name, imp in sorted_mapped[:30]:
    print(f"{name:30s}  {imp:.6f}")



Top features by importance (gain):
ATP_POINT_DIFF                  44.882282
ATP_RANK_DIFF                   24.450146
WIN_LAST_50_DIFF                19.442913
TOURNEY_LEVEL_D                 13.872601
TOURNEY_LEVEL_G                 9.966412
SURFACE_Hard                    9.483380
SURFACE_Grass                   9.292949
P_1ST_WON_LAST_10_DIFF          9.193404
ELO_GRAD_5S_DIFF                8.930059
P_2ND_WON_LAST_100_DIFF         8.815249
TOURNEY_LEVEL_F                 8.756144
ELO_GRAD_10S_DIFF               8.460623
BEST_OF                         8.212976
P_1ST_WON_LAST_2000_DIFF        8.014000
P_2ND_WON_LAST_2000_DIFF        7.971122
AGE_DIFF                        7.798765
P_1ST_IN_LAST_2000_DIFF         7.668123
P_ACE_LAST_2000_DIFF            7.615351
P_DF_LAST_2000_DIFF             7.514650
P_DF_LAST_5_DIFF                7.250978
P_1ST_WON_LAST_100_DIFF         7.246656
ELO_SURFACE_DIFF                7.134626
ELO_GRAD_10_DIFF                7.122805
P_2ND_WON_LAST_20