In [1]:
pip install seaborn scikit-learn matplotlib numpy pandas seaborn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/homebrew/Cellar/jupyterlab/4.3.6/libexec/bin/python -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [18]:
df = pd.read_csv('atp_matches_2024.csv')

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3076 entries, 0 to 3075
Data columns (total 49 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   tourney_id          3076 non-null   object 
 1   tourney_name        3076 non-null   object 
 2   surface             3076 non-null   object 
 3   draw_size           3076 non-null   int64  
 4   tourney_level       3076 non-null   object 
 5   tourney_date        3076 non-null   int64  
 6   match_num           3076 non-null   int64  
 7   winner_id           3076 non-null   int64  
 8   winner_seed         1294 non-null   float64
 9   winner_entry        477 non-null    object 
 10  winner_name         3076 non-null   object 
 11  winner_hand         3076 non-null   object 
 12  winner_ht           3057 non-null   float64
 13  winner_ioc          3076 non-null   object 
 14  winner_age          3075 non-null   float64
 15  loser_id            3076 non-null   int64  
 16  loser_

In [22]:
df.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,2024-0339,Brisbane,Hard,32,A,20240101,300,105777,2.0,,...,58.0,44.0,16.0,11.0,8.0,9.0,14.0,2570.0,8.0,3660.0
1,2024-0339,Brisbane,Hard,32,A,20240101,299,208029,1.0,,...,35.0,31.0,10.0,11.0,5.0,7.0,8.0,3660.0,39.0,1122.0
2,2024-0339,Brisbane,Hard,32,A,20240101,298,105777,2.0,,...,39.0,24.0,14.0,10.0,5.0,7.0,14.0,2570.0,55.0,902.0
3,2024-0339,Brisbane,Hard,32,A,20240101,297,208029,1.0,,...,51.0,31.0,16.0,10.0,3.0,5.0,8.0,3660.0,116.0,573.0
4,2024-0339,Brisbane,Hard,32,A,20240101,296,126128,,,...,37.0,27.0,16.0,10.0,5.0,8.0,39.0,1122.0,44.0,1021.0


In [24]:
# Create feature dataframe from differences
df_model = pd.DataFrame()

# Feature differences (winner - loser)
df_model['rank_diff'] = df['winner_rank'] - df['loser_rank']
df_model['rank_points_diff'] = df['winner_rank_points'] - df['loser_rank_points']
df_model['1stIn_diff'] = df['w_1stIn'] - df['l_1stIn']
df_model['1stWon_diff'] = df['w_1stWon'] - df['l_1stWon']
df_model['2ndWon_diff'] = df['w_2ndWon'] - df['l_2ndWon']
df_model['SvGms_diff'] = df['w_SvGms'] - df['l_SvGms']
df_model['bpSaved_diff'] = df['w_bpSaved'] - df['l_bpSaved']
df_model['bpFaced_diff'] = df['w_bpFaced'] - df['l_bpFaced']

# Add binary target (1 means this row represents a match where Player A won)
df_model['target'] = 1

In [25]:
# Duplicate and flip to simulate matches where Player A is the loser
df_flipped = df_model.copy()

# Invert all feature columns (exclude 'target')
for col in df_flipped.columns[:-1]:
    df_flipped[col] = -df_flipped[col]

# Set flipped target to 0 (Player A loses)
df_flipped['target'] = 0

In [26]:
# Combine both original and flipped data
df_final = pd.concat([df_model, df_flipped], ignore_index=True)

# Drop any rows with missing values
df_final = df_final.dropna()

# Separate features and labels
X = df_final.drop(columns='target')
y = df_final['target']


In [43]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=6)

# Train logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [48]:
# Predict and evaluate performance
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)

print("Model Performance:\n")
print(classification_report(y_test, y_pred))

Model Performance:

              precision    recall  f1-score   support

           0       0.94      0.94      0.94       904
           1       0.94      0.94      0.94       879

    accuracy                           0.94      1783
   macro avg       0.94      0.94      0.94      1783
weighted avg       0.94      0.94      0.94      1783



In [51]:
# create function for prediction

def predict_winner(player_a_stats: dict, player_b_stats: dict, model):
    """
    Predicts the winner between Player A and Player B using the trained model.

    Each player’s stats should be provided as a dictionary, with keys:
    'rank', 'rank_points', '1stIn', '1stWon', '2ndWon', 'SvGms', 'bpSaved', 'bpFaced'
    """
    # Calculate feature differences (Player A - Player B)
    feature_vector = pd.DataFrame([{
        'rank_diff': player_a_stats['rank'] - player_b_stats['rank'],
        'rank_points_diff': player_a_stats['rank_points'] - player_b_stats['rank_points'],
        '1stIn_diff': player_a_stats['1stIn'] - player_b_stats['1stIn'],
        '1stWon_diff': player_a_stats['1stWon'] - player_b_stats['1stWon'],
        '2ndWon_diff': player_a_stats['2ndWon'] - player_b_stats['2ndWon'],
        'SvGms_diff': player_a_stats['SvGms'] - player_b_stats['SvGms'],
        'bpSaved_diff': player_a_stats['bpSaved'] - player_b_stats['bpSaved'],
        'bpFaced_diff': player_a_stats['bpFaced'] - player_b_stats['bpFaced'],
    }])

    # Predict
    prediction = model.predict(feature_vector)[0]
    probability = model.predict_proba(feature_vector)[0][prediction]

    # Interpret prediction
    if prediction == 1:
        return f"Player A is likely to win with {probability*100:.1f}% confidence"
    else:
        return f"Player B is likely to win with {probability*100:.1f}% confidence"


In [53]:
# example data

# Player A fake stats
a = {
    'rank': 14,
    'rank_points': 2570,
    '1stIn': 58,
    '1stWon': 44,
    '2ndWon': 16,
    'SvGms': 11,
    'bpSaved': 8,
    'bpFaced': 9
}

# Player B fake stats
b = {
    'rank': 8,
    'rank_points': 3660,
    '1stIn': 60,
    '1stWon': 46,
    '2ndWon': 15,
    'SvGms': 11,
    'bpSaved': 7,
    'bpFaced': 8
}

# Predict
print(predict_winner(a, b, model))


🎾 Player B is likely to win with 62.1% confidence


In [54]:
## now need to implement name searching for basic logistic regression model (going to eventually use XGBoost for project)