In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [None]:
### Load the datasets
# The ATP match files cover 1968 through 2024, one CSV per season.
years = list(range(1968, 2025))
# First season held out for testing. years.index(2007) == 39, so training uses
# 1968-2006 and testing uses 2007-2024 (roughly a 70/30 split).
split_index = years.index(2007)
training_years = years[:split_index]
testing_years = years[split_index:]

data_folder = "repos/MatchPointAI/datasets/atp_matches/"

# Get all files
all_files = sorted([f for f in os.listdir(data_folder) if f.startswith("atp_matches_") and f.endswith(".csv")])


def _file_year(filename):
    """Return the integer after the last underscore of ``filename``, or None if it is not an integer."""
    suffix = filename.split('_')[-1].split('.')[0]
    try:
        return int(suffix)
    except ValueError:
        # A file without a numeric suffix is skipped instead of aborting the whole load.
        return None


### Manually forming the train/test split since we are combining datasets across multiple decades
## We don't want to use sklearn's train_test_split here because we want to ensure that the split is based on years, not random
# Parse each filename once, then split by year using set membership.
file_years = {f: _file_year(f) for f in all_files}
training_year_set = set(training_years)
testing_year_set = set(testing_years)
train_files = [f for f in all_files if file_years[f] in training_year_set]
test_files = [f for f in all_files if file_years[f] in testing_year_set]

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list,
# so fail early with a message that names the folder that was searched.
if not train_files or not test_files:
    raise ValueError(
        f"Expected atp_matches_<year>.csv files for both splits in {data_folder!r}; "
        f"found {len(train_files)} training and {len(test_files)} testing files."
    )

# Concatenate training data
train_df = pd.concat([pd.read_csv(os.path.join(data_folder, f)) for f in train_files], ignore_index=True)

# Concatenate testing data
test_df = pd.concat([pd.read_csv(os.path.join(data_folder, f)) for f in test_files], ignore_index=True)

In [9]:
# Summarise the training frame: row count plus per-column non-null counts and dtypes.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143025 entries, 0 to 143024
Data columns (total 49 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   tourney_id          143025 non-null  object 
 1   tourney_name        143025 non-null  object 
 2   surface             140088 non-null  object 
 3   draw_size           142190 non-null  float64
 4   tourney_level       143025 non-null  object 
 5   tourney_date        143025 non-null  int64  
 6   match_num           143025 non-null  int64  
 7   winner_id           143025 non-null  int64  
 8   winner_seed         50080 non-null   float64
 9   winner_entry        10425 non-null   object 
 10  winner_name         143025 non-null  object 
 11  winner_hand         143022 non-null  object 
 12  winner_ht           127126 non-null  float64
 13  winner_ioc          143017 non-null  object 
 14  winner_age          141719 non-null  float64
 15  loser_id            143025 non-nul

In [10]:
# Summarise the testing frame: row count plus per-column non-null counts and dtypes.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51971 entries, 0 to 51970
Data columns (total 49 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   tourney_id          51971 non-null  object 
 1   tourney_name        51971 non-null  object 
 2   surface             51918 non-null  object 
 3   draw_size           51971 non-null  int64  
 4   tourney_level       51971 non-null  object 
 5   tourney_date        51971 non-null  int64  
 6   match_num           51971 non-null  int64  
 7   winner_id           51971 non-null  int64  
 8   winner_seed         22140 non-null  float64
 9   winner_entry        6795 non-null   object 
 10  winner_name         51971 non-null  object 
 11  winner_hand         51971 non-null  object 
 12  winner_ht           51240 non-null  float64
 13  winner_ioc          51971 non-null  object 
 14  winner_age          51968 non-null  float64
 15  loser_id            51971 non-null  int64  
 16  lose

In [46]:
### Feature Engineering
# creating feature dataframe using differences
def create_featureset(df):
    """Build a mirrored feature matrix and binary labels from a match table.

    Every match row contributes two samples: the winner-minus-loser
    differences labelled 1, and the same differences negated labelled 0.
    Samples containing any missing value are discarded after the two halves
    have been stacked.

    Parameters
    ----------
    df : pandas.DataFrame
        Match table providing the winner/loser column pairs listed in
        ``stat_pairs`` below.

    Returns
    -------
    X : pandas.DataFrame
        The eight ``*_diff`` feature columns.
    y : pandas.Series
        The ``target`` labels, sharing the index of ``X``.
    """
    # feature name -> (winner column, loser column)
    stat_pairs = {
        'rank_diff': ('winner_rank', 'loser_rank'),
        'rank_points_diff': ('winner_rank_points', 'loser_rank_points'),
        '1stIn_diff': ('w_1stIn', 'l_1stIn'),
        '1stWon_diff': ('w_1stWon', 'l_1stWon'),
        '2ndWon_diff': ('w_2ndWon', 'l_2ndWon'),
        'SvGms_diff': ('w_SvGms', 'l_SvGms'),
        'bpSaved_diff': ('w_bpSaved', 'l_bpSaved'),
        'bpFaced_diff': ('w_bpFaced', 'l_bpFaced'),
    }
    feature_names = list(stat_pairs)

    # Perspective of the player who won: label 1.
    won = pd.DataFrame({
        name: df[winner_col] - df[loser_col]
        for name, (winner_col, loser_col) in stat_pairs.items()
    })
    won['target'] = 1

    # Same matches seen from the loser's side: every difference negated, label 0.
    lost = -won[feature_names]
    lost['target'] = 0

    # Stack both perspectives, then discard samples with missing values.
    samples = pd.concat([won, lost], ignore_index=True).dropna()

    return samples[feature_names], samples['target']


In [47]:
from sklearn.exceptions import ConvergenceWarning
import warnings

# create training and testing feature sets
X_train, y_train = create_featureset(train_df)
X_test, y_test = create_featureset(test_df)

# NOTE(review): max_iter=85 is below scikit-learn's default of 100 and the
# features are not scaled, so the solver may stop early -- confirm this limit is intentional.
model = LogisticRegression(max_iter=85, verbose=1)

# Fit the model.
# scikit-learn signals non-convergence by emitting a ConvergenceWarning, not by
# raising, so an `except ConvergenceWarning` clause never runs. Record the
# warnings emitted during the fit and inspect them afterwards instead.
with warnings.catch_warnings(record=True) as fit_warnings:
    warnings.simplefilter("always", category=ConvergenceWarning)
    model.fit(X_train, y_train)

converged = True
for caught in fit_warnings:
    if issubclass(caught.category, ConvergenceWarning):
        converged = False
    else:
        # Recording captures every warning; re-emit the unrelated ones so they stay visible.
        warnings.warn_explicit(caught.message, caught.category, caught.filename, caught.lineno)

if not converged:
    print("Model did not converge.")

In [50]:
# Predict the held-out matches and build the per-class precision/recall/F1 summary.
predictions = model.predict(X_test)
report_text = classification_report(y_test, predictions)

print("Classification Report: Model Performance")
print(report_text)

# Score both splits so the train and test accuracy can be compared side by side.
train_score, test_score = (
    model.score(X_train, y_train),
    model.score(X_test, y_test),
)
print(f"\nTrain: {train_score:.4f} | Test: {test_score:.4f}")

Classification Report: Model Performance
              precision    recall  f1-score   support

           0       0.94      0.94      0.94     47796
           1       0.94      0.94      0.94     47796

    accuracy                           0.94     95592
   macro avg       0.94      0.94      0.94     95592
weighted avg       0.94      0.94      0.94     95592


Train: 0.9427 | Test: 0.9391


In [51]:
# Inspect the fitted coefficients, one per feature column.
feature_names = X_train.columns

# For a binary problem coef_ has shape (1, n_features) and intercept_ has
# shape (1,), so row/element 0 holds the values of interest.
weights, intercept = model.coef_[0], model.intercept_[0]

# Pair each feature with its weight and list the most positive weights first.
coef_df = pd.DataFrame({'Feature': feature_names, 'Weight': weights})
coef_df = coef_df.sort_values(by='Weight', ascending=False)

print(coef_df)
print(f"\nIntercept (bias term): {intercept}")

            Feature    Weight
6      bpSaved_diff  1.199584
3       1stWon_diff  0.287260
4       2ndWon_diff  0.199930
1  rank_points_diff  0.000503
0         rank_diff -0.000626
2        1stIn_diff -0.098158
5        SvGms_diff -0.191388
7      bpFaced_diff -1.326278

Intercept (bias term): 4.475227223967649e-17


Feature: bpSaved_diff
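    Break points saved has the largest positive weight, so it is the strongest positive indicator of whether a player wins a given matchup (bpFaced_diff has a slightly larger weight in absolute terms, but it is negative).
    A positive weight means that, with the other features held fixed, saving more break points than the opponent pushes the prediction toward a win.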

Feature: 1stWon_diff
    For 1st serves won, there is a lower weight on this because it is not as deterministic. Regardless, winning more points on the 1st serve is generally
    a sign of being more successful in all service games, favoring the winner.

Feature: 2ndWon_diff
    Similar to the 1st serves won, 2nd serves won tells us that these are points where the winner was able to still get the point even though
    they had faulted their first serve. This implies consistency in the gameplay of the winner.

Feature: rank_points_diff
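    This feature is the player's ATP ranking points minus the opponent's. Part of the reason the weight is small is scale: the features are not
    standardized, and ranking-point differences are typically much larger numbers than the match-statistic differences, so a small weight per point
    can still add up. It also does not tell a lot about the current matchup, as it could be possible that someone far older (and less in shape) is
    up against a new generation player with less play time on the court.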

Feature: rank_diff
    The rank of the better player is closer to 1, therefore will be lower. This is shown in the weight of the rank difference, where it is 
    negative, implying that more winners were ranked higher in ATP than the losers.

Feature: 1stIn_diff
    This feature tells how many 1st serves were made. However, because they were made does not imply they were won. This weight suggests
    that accuracy is not a good predictor of success/effective serves.

Feature: SvGms_diff
    The negative weight in this feature implies that the winners generally played less service games than the loser. This could mean that
    having more opportunities to return serve well could lead to higher success rates. However, it could be argued that having control over
    the initial ball (on YOUR serve) would be more advantageous.

Feature: bpFaced_diff
    Roughly the opposite of bpSaved_diff. The negative weight tells us that the winner generally faced fewer break points than the loser, which matches intuition.


The code in here is copied mostly from my mlmodels repository. Both this model and the initial model had the same accuracy percentage (94%).

They differ in datasets --
This model uses atp_matches_1968-2006 for training and 2007-2024 for testing.
The initial model uses atp_matches_2024, with a 70/30 split.

This suggests that the distribution of data within the sets is extremely similar, given that the model predicts at the same accuracy regardless of
the time period the data comes from.

In [52]:
### Review of Model and Feature Setup ###


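The features used in this model are NOT truly predictive of win or loss. The reason for this is that many of the features are attributes of the match itself, such as break points faced, service games, etc., so they are only known once the match has been played; the model describes the outcome rather than forecasting it. A genuinely predictive model needs features that are available before the match starts, which requires different datasets.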