<a href="https://colab.research.google.com/github/jaysan22/tennis-prediction-using-random-forest/blob/main/kaggledatasetpredict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- Cell 1: Setup and Manual Data Upload ---

# Install necessary libraries
!pip install xgboost

# Import libraries
import pandas as pd
import numpy as np
import os
from google.colab import files

# --- Manual File Upload ---
print("Please upload the single Kaggle CSV file (e.g., 'atp_matches_2000-2025.csv').")
uploaded = files.upload()

# The key of the 'uploaded' dictionary is the filename. We get it dynamically.
# This makes the code work even if your file has a slightly different name.
csv_path = list(uploaded.keys())[0]

print(f"\nSuccessfully uploaded '{csv_path}'. It is now ready for processing.")

Please upload the single Kaggle CSV file (e.g., 'atp_matches_2000-2025.csv').


Saving atp_tennis.csv to atp_tennis.csv

Successfully uploaded 'atp_tennis.csv'. It is now ready for processing.


In [None]:
# --- Cell 2: Data Preparation and Feature Engineering ---

# 1. Load the single CSV file
print("Loading the unified dataset...")
df_raw = pd.read_csv(csv_path, low_memory=False)

# 2. Standardize and Clean the Data
def standardize_data(df):
    print("Standardizing data...")
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df.dropna(subset=['Date', 'Surface', 'Winner', 'Player_1', 'Player_2', 'Rank_1', 'Rank_2'], inplace=True)

    # Determine winner/loser columns
    df['winner_name'] = df['Winner']
    df['loser_name'] = df.apply(lambda r: r['Player_2'] if r['Winner'] == r['Player_1'] else r['Player_1'], axis=1)
    df['winner_rank'] = df.apply(lambda r: r['Rank_1'] if r['Winner'] == r['Player_1'] else r['Rank_2'], axis=1)
    df['loser_rank'] = df.apply(lambda r: r['Rank_2'] if r['Winner'] == r['Player_1'] else r['Rank_1'], axis=1)
    df['winner_pts'] = df.apply(lambda r: r['Pts_1'] if r['Winner'] == r['Player_1'] else r['Pts_2'], axis=1)
    df['loser_pts'] = df.apply(lambda r: r['Pts_2'] if r['Winner'] == r['Player_1'] else r['Pts_1'], axis=1)

    # Use names as IDs since the Kaggle dataset doesn't have unique IDs
    df['winner_id'] = df['winner_name']
    df['loser_id'] = df['loser_name']

    # Rename and select final columns
    df = df.rename(columns={'Date': 'tourney_date', 'Surface': 'surface'})
    master_cols = ['tourney_date', 'surface', 'winner_id', 'winner_name', 'loser_id', 'loser_name',
                   'winner_rank', 'loser_rank', 'winner_pts', 'loser_pts']
    return df[master_cols].sort_values(by='tourney_date', ignore_index=True)

df = standardize_data(df_raw)

# 3. Calculate Elo Ratings
def calculate_elo(df_to_process):
    print("Calculating Elo ratings...")
    player_elos_overall, player_elos_surface = {}, {}
    elo_k = 32
    w_elos_o, l_elos_o, w_elos_s, l_elos_s = [], [], [], []
    for index, row in df_to_process.iterrows():
        winner, loser, surface = row['winner_id'], row['loser_id'], row['surface']
        if surface not in player_elos_surface: player_elos_surface[surface] = {}
        w_elo_o, l_elo_o = player_elos_overall.get(winner, 1500), player_elos_overall.get(loser, 1500)
        w_elo_s, l_elo_s = player_elos_surface[surface].get(winner, 1500), player_elos_surface[surface].get(loser, 1500)
        w_elos_o.append(w_elo_o); l_elos_o.append(l_elo_o); w_elos_s.append(w_elo_s); l_elos_s.append(l_elo_s)
        expected_win_o = 1 / (1 + 10**((l_elo_o - w_elo_o) / 400)); elo_change_o = elo_k * (1 - expected_win_o)
        player_elos_overall[winner], player_elos_overall[loser] = w_elo_o + elo_change_o, l_elo_o - elo_change_o
        expected_win_s = 1 / (1 + 10**((l_elo_s - w_elo_s) / 400)); elo_change_s = elo_k * (1 - expected_win_s)
        player_elos_surface[surface][winner], player_elos_surface[surface][loser] = w_elo_s + elo_change_s, l_elo_s - elo_change_s
    df_to_process['winner_elo'], df_to_process['loser_elo'] = w_elos_o, l_elos_o
    df_to_process['winner_elo_surface'], df_to_process['loser_elo_surface'] = w_elos_s, l_elos_s
    return df_to_process

df = calculate_elo(df)
print("Data preparation complete. The 'df' DataFrame is now ready for training.")

Loading the unified dataset...
Standardizing data...
Calculating Elo ratings...
Data preparation complete. The 'df' DataFrame is now ready for training.


In [None]:
# --- Cell 3: Model Training and Evaluation ---
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb

# 1. Restructure the data for modeling
print("Restructuring data for modeling...")
p1_cols = ['winner_id', 'winner_name', 'winner_rank', 'winner_pts', 'winner_elo', 'winner_elo_surface']
p2_cols = ['loser_id', 'loser_name', 'loser_rank', 'loser_pts', 'loser_elo', 'loser_elo_surface']

df_p1 = df[p1_cols].rename(columns=lambda x: x.replace('winner_', 'p1_'))
df_p2 = df[p2_cols].rename(columns=lambda x: x.replace('loser_', 'p2_'))

X_data = pd.concat([df_p1, df_p2], axis=1); X_data['result'] = 1
X_data_swapped = pd.concat([df_p2.rename(columns=lambda x: x.replace('p2_', 'p1_')),
                            df_p1.rename(columns=lambda x: x.replace('p1_', 'p2_'))], axis=1)
X_data_swapped['result'] = 0
model_df = pd.concat([X_data, X_data_swapped], ignore_index=True)

# 2. Create the final difference features
print("Creating difference features...")
model_df['rank_diff'] = model_df['p1_rank'] - model_df['p2_rank']
model_df['elo_diff'] = model_df['p1_elo'] - model_df['p2_elo']
model_df['elo_surface_diff'] = model_df['p1_elo_surface'] - model_df['p2_elo_surface']
model_df['pts_diff'] = model_df['p1_pts'] - model_df['p2_pts']

# 3. Define features and prepare data for training
print("Defining features and splitting data...")
features = ['rank_diff', 'elo_diff', 'elo_surface_diff', 'pts_diff']
target = 'result'
model_df_clean = model_df.dropna(subset=features)
X = model_df_clean[features]
y = model_df_clean[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Initialize and Train the XGBoost Model
print("Training XGBoost model...")
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic', eval_metric='logloss', use_label_encoder=False,
    n_estimators=200, max_depth=4, learning_rate=0.1, subsample=0.8, colsample_bytree=0.8
)
xgb_model.fit(X_train, y_train)

# 5. Evaluate the Model
print("Evaluating model...")
preds = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, preds)
print(f"\nModel Accuracy on Test Set: {accuracy * 100:.2f}%")

Restructuring data for modeling...
Creating difference features...
Defining features and splitting data...
Training XGBoost model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluating model...

Model Accuracy on Test Set: 67.13%


In [None]:
99# --- FINAL Corrected Simulation Cell ---
import numpy as np

# --- 1. Get the latest stats for all players ---
print("Creating latest stats lookup table for simulation...")

# THE FIX: We build the lookup table from 'df', NOT 'model_df'.
# 'df' is our clean, chronological DataFrame that contains all the necessary columns.
# We find the last match each player won to get their most recent stats.
latest_stats = df.sort_values('tourney_date').groupby('winner_id').last().reset_index()

# Now, rename the columns to a generic format for our function
latest_stats = latest_stats.rename(columns={
    'winner_id': 'player_id',
    'winner_name': 'player_name',
    'winner_rank': 'rank',
    'winner_pts': 'pts',
    'winner_elo': 'elo',
    'winner_elo_surface': 'elo_surface'
})

# Select only the columns we need for the final lookup table
latest_stats = latest_stats[['player_id', 'player_name', 'rank', 'pts', 'elo', 'elo_surface']]


# --- 2. The Prediction Function (Modified to include confidence) ---
def predict_match(player1_name, player2_name, model, stats_df, surface):
    p1_stats = stats_df[stats_df['player_name'] == player1_name]
    p2_stats = stats_df[stats_df['player_name'] == player2_name]
    if p1_stats.empty or p2_stats.empty:
        warning_msg = ""
        if p1_stats.empty: warning_msg += f"  (Warning: {player1_name} not in data, defaulting to opponent)"
        if p2_stats.empty: warning_msg += f"  (Warning: {player2_name} not in data, defaulting to opponent)"
        print(warning_msg)
        # Default to the player who is in the data, or player1 if neither are found (though the empty check prevents this)
        winner = player2_name if not p2_stats.empty else player1_name
        # Return default confidence of 50% if one player is missing
        return winner, 0.5

    p1_stats, p2_stats = p1_stats.iloc[0], p2_stats.iloc[0]
    rank_diff = p1_stats['rank'] - p2_stats['rank']
    elo_diff = p1_stats['elo'] - p2_stats['elo']
    elo_surface_diff = p1_stats['elo_surface'] - p2_stats['elo_surface']
    pts_diff = p1_stats['pts'] - p2_stats['pts']
    match_features = np.array([[rank_diff, elo_diff, elo_surface_diff, pts_diff]])

    # Use predict_proba to get probabilities
    probabilities = model.predict_proba(match_features)[0]
    prediction = model.predict(match_features)[0]

    # The confidence is the probability of the predicted class
    confidence = probabilities[prediction]

    winner = player1_name if prediction == 1 else player2_name
    return winner, confidence


# --- 3. Define the Tournament Draw ---
# (Your draw definition is correct)

top_half_draw = ["Alcaraz C.",
"Walton A.",
"Hanfmann Y.",
"Unknown",
"Unknown",
"Korda S.",
"Schoolkate T.",
"Moutet C.",
"Paul T.",
"Kovacevic A.",
"Tirante T.",
"Vukic A.",
"Unknown",
"Opelka R.",
"Misolic F.",
"Davidovich Fokina A.",
"Bublik A.",
"Brooksby J.",
"Ugo Carabelli C.",
"Fucsovics M.",
"Kecmanovic M.",
"Etcheverry T.",
"Unknown",
"Cobolli F.",
"Tiafoe F.",
"Unknown",
"Kypson P.",
"Comesana F.",
"Navone M.",
"Medjedovic H.",
"Berrettini M.",
"De Minaur A.",
"Zverev A.",
"Diallo G.",
"Popyrin A.",
"Muller A.",
"Nava E.",
"Jacquet K.",
"Bonzi B.",
"Norrie C.",
"Cerundolo F.",
"Zhizhen Z.",
"Unknown",
"Dzumhur D.",
"Cazaux A.",
"Unknown",
"Arnaldi M.",
"Rublev A.",
"Medvedev D.",
"De Jong J.",
"Halys Q.",
"Tabilo A.",
"Majchrzak K.",
"Fearnley J.",
"Marozsan F.",
"Rinderknech A.",
"Tien L.",
"Giron M.",
"Unknown",
"Shevchenko A.",
"Cerundolo J.",
"Thompson J.",
"Borges N.",
"Auger-Aliassime F."
]
bottom_half_draw = ["Musetti L.",
"Collignon R.",
"Sonego L.",
"Taberner C.",
"Dimitrov G.",
"Machac T.",
"Mochizuki S.",
"Tsitsipas S.",
"Lehecka J.",
"Unknown",
"Djere L.",
"Wawrinka S.",
"Kopriva V.",
"Struff J.",
"Royer V.",
"Fritz T.",
"Mensik J.",
"Carreno Busta P.",
"Unknown",
"Unknown",
"Hurkacz H.",
"Bergs Z.",
"Quinn E.",
"Griekspoor T.",
"Nakashima B.",
"Van De Zandschulp B.",
"Shang J.",
"Bautista Agut R.",
"Atmane T.",
"Unknown",
"Martinez Portero P.",
"Djokovic N.",
"Shelton B.",
"Humbert U.",
"Unknown",
"Monfils G.",
"Mannarino A.",
"Hijikata R.",
"Unknown",
"Vacherot V.",
"Shapovalov D.",
"Bu Y.",
"Altmaier D.",
"Cilic M.",
"Munar J.",
"Svrcina D.",
"Bellucci M.",
"Ruud C.",
"Khachanov K.",
"Michelsen A.",
"O'Connell C.",
"Unknown",
"Mpetshi Perricard G.",
"Baez S.",
"Garin C.",
"Darderi L.",
"Fonseca J.",
"Spizzirri E.",
"Nardi L.",
"Unknown",
"Duckworth J.",
"Unknown",
"Gaston H.",
"Sinner J."
]
ausopen_draw = top_half_draw + bottom_half_draw
print(f"Simulating a tournament with {len(ausopen_draw)} players.")

# --- 4. The Simulation Loop ---
# (This part is correct, but will now receive confidence)
current_round_players = ausopen_draw
round_num = 1

while len(current_round_players) > 1:
    round_name = f"Round {round_num}"
    if len(current_round_players) == 128: round_name = "Round 1"
    elif len(current_round_players) == 64: round_name = "Round 2"
    elif len(current_round_players) == 32: round_name = "Round 3"
    elif len(current_round_players) == 16: round_name = "Round of 16"

    elif len(current_round_players) == 8: round_name = "Quarter-Finals"
    elif len(current_round_players) == 4: round_name = "Semi-Finals"
    elif len(current_round_players) == 2: round_name = "FINAL"
    else: round_name = f"Round {round_num}"
    print(f"\n--- SIMULATING {round_name.upper()} ---")

    next_round_players = []
    for i in range(0, len(current_round_players), 2):
        player1 = current_round_players[i]
        player2 = current_round_players[i+1]

        # This call is correct, it passes 'Hard' to the function
        winner, confidence = predict_match(player1, player2, xgb_model, latest_stats, surface='Hard')
        print(f"  --> {player1} vs {player2}: Winner: {winner} (Confidence: {confidence:.2%})")
        next_round_players.append(winner)

    current_round_players = next_round_players
    round_num += 1

champion = current_round_players[0]

print("\n" + "="*40)
print(f"üèÜ PREDICTED AUSTRALIAN OPEN 2026 CHAMPION: {champion} üèÜ")
print("="*40)

Creating latest stats lookup table for simulation...
Simulating a tournament with 128 players.

--- SIMULATING ROUND 1 ---
  --> Alcaraz C. vs Walton A.: Winner: Alcaraz C. (Confidence: 97.07%)
  --> Hanfmann Y. vs Unknown: Winner: Hanfmann Y. (Confidence: 50.00%)
  --> Unknown vs Korda S.: Winner: Korda S. (Confidence: 50.00%)
  --> Schoolkate T. vs Moutet C.: Winner: Moutet C. (Confidence: 73.64%)
  --> Paul T. vs Kovacevic A.: Winner: Paul T. (Confidence: 81.78%)
  --> Tirante T. vs Vukic A.: Winner: Vukic A. (Confidence: 50.00%)
  --> Unknown vs Opelka R.: Winner: Opelka R. (Confidence: 50.00%)
  --> Misolic F. vs Davidovich Fokina A.: Winner: Davidovich Fokina A. (Confidence: 76.19%)
  --> Bublik A. vs Brooksby J.: Winner: Bublik A. (Confidence: 61.16%)
  --> Ugo Carabelli C. vs Fucsovics M.: Winner: Fucsovics M. (Confidence: 70.33%)
  --> Kecmanovic M. vs Etcheverry T.: Winner: Etcheverry T. (Confidence: 58.33%)
  --> Unknown vs Cobolli F.: Winner: Cobolli F. (Confidence: 50.00%)

In [None]:
# --- Cell 5: Simulate a Hypothetical Match ---

def simulate_hypothetical_match(player1_name, player2_name, model, stats_df, surface):
    """
    Simulates a hypothetical match between two players and predicts the winner.

    Args:
        player1_name (str): The name of the first player.
        player2_name (str): The name of the second player.
        model: The trained XGBoost model.
        stats_df (pd.DataFrame): DataFrame containing the latest player statistics.
        surface (str): The surface of the match (e.g., 'Hard', 'Clay', 'Grass').

    Returns:
        tuple: A tuple containing the predicted winner (str) and the confidence (float).
    """
    print(f"\n--- Simulating match: {player1_name} vs {player2_name} on {surface} ---")
    winner, confidence = predict_match(player1_name, player2_name, model, stats_df, surface)
    print(f"Predicted Winner: {winner} (Confidence: {confidence:.2%})")
    #return winner, confidence

# Example usage:
player1 = input("Enter the name of the first player: ")
player2 = input("Enter the name of the second player: ")
match_surface = input("Enter the surface of the match (e.g., Hard, Clay, Grass): ")

# Ensure the surface input is capitalized for consistency with the data
match_surface = match_surface.capitalize()

simulate_hypothetical_match(player1, player2, xgb_model, latest_stats, surface=match_surface)

Enter the name of the first player: Vukic A.
Enter the name of the second player: Paul T.
Enter the surface of the match (e.g., Hard, Clay, Grass): Hard

--- Simulating match: Vukic A. vs Paul T. on Hard ---
Predicted Winner: Paul T. (Confidence: 79.57%)
