In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from statsmodels.formula.api import ols
from sklearn.model_selection import KFold
from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree, DecisionTreeRegressor
from sklearn import metrics
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
import statsmodels.formula.api as smf
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix)
import statsmodels.formula.api as smf
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
import statsmodels.api as sm

In [2]:
# Show every column when a DataFrame is displayed, then load the match data
pd.set_option('display.max_columns', None)
df = pd.read_csv('tennis.csv')

# Preview the first five matches
df.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,player1_id,player1_seed,player1_entry,player1_name,player1_hand,player1_ht,player1_ioc,player1_age,player2_id,player2_seed,player2_entry,player2_name,player2_hand,player2_ht,player2_ioc,player2_age,best_of,round,duration,player1_rank,player2_rank,age_diff,age_avg,rank_diff,rank_avg,ht_diff,seed_matchup,entry_matchup
0,1975-2023,Fairfield,Carpet,32,A,19750224,29,100058,,Regular,Roger Taylor,L,183.0,GBR,33.3,100272,,Regular,Sandy Mayer,R,178.0,USA,22.8,3,F,135.0,41.0,55.0,10.5,28.05,14.0,48.0,5.0,Both Unseeded,Both Regular
1,1987-414,Hamburg,Clay,56,A,19870427,55,101036,2.0,Regular,Miloslav Mecir,R,190.0,SVK,22.9,100656,1.0,Regular,Ivan Lendl,R,188.0,USA,27.1,5,F,136.0,1.0,5.0,4.2,25.0,4.0,3.0,2.0,Both Seeded,Both Regular
2,1991-339,Adelaide,Hard,32,A,19901231,14,101441,,Regular,Martin Sinner,R,180.0,GER,22.8,101117,,Regular,Eric Jelen,R,180.0,GER,25.8,3,R32,89.0,111.0,68.0,3.0,24.3,43.0,89.5,0.0,Both Unseeded,Both Regular
3,1991-339,Adelaide,Hard,32,A,19901231,13,101440,7.0,Regular,Mark Koevermans,R,185.0,NED,22.9,101185,,WC,Mark Woodforde,L,188.0,AUS,25.2,3,R32,99.0,101.0,48.0,2.3,24.05,53.0,74.5,3.0,Seeded vs Unseeded,Regular vs Special
4,1991-339,Adelaide,Hard,32,A,19901231,12,101061,,Regular,Jimmy Arias,R,175.0,USA,26.3,101511,3.0,Regular,Horst Skoff,R,175.0,AUT,22.3,3,R32,80.0,60.0,26.0,4.0,24.3,34.0,43.0,0.0,Seeded vs Unseeded,Both Regular


In [3]:
# define function
def check_overrun(row, scheduled_minutes=None):
    """Label a match as overrunning its scheduled slot.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``row['best_of']`` (match format) and ``row['duration']``
        (match length, compared directly against the scheduled minutes).
    scheduled_minutes : dict, optional
        Scheduled slot length keyed by ``best_of``. Defaults to
        ``{3: 100, 5: 160}``.

    Returns
    -------
    "Yes" if the duration is strictly greater than the scheduled length,
    "No" otherwise, or ``None`` when the label cannot be determined
    (unknown/missing ``best_of`` or missing ``duration``).
    """
    if scheduled_minutes is None:
        scheduled_minutes = {3: 100, 5: 160}

    # Unknown formats used to leave `scheduled` unbound and crash the apply()
    scheduled = scheduled_minutes.get(row['best_of'])
    duration = row['duration']

    # A missing duration compares False against any number, which previously
    # produced a misleading "No"; report it as missing instead
    if scheduled is None or pd.isna(duration):
        return None

    return "Yes" if duration > scheduled else "No"

# Apply check_overrun to every match (row-wise) and store the label as a new column
df['Overrunning'] = df.apply(func=check_overrun, axis='columns')


In [4]:
# One-hot encode the categorical match descriptors; drop_first=True drops one
# level of each variable so that level serves as the reference category
df_encoded = pd.get_dummies(
    df,
    columns=['surface', 'tourney_level', 'seed_matchup', 'entry_matchup'],
    drop_first=True,
)
# Discard rows that have no target label
df_encoded = df_encoded.dropna(subset=['Overrunning'])
# Per-column missing-value counts (not the last expression of the cell, so the
# result is computed but not displayed)
df_encoded.isnull().sum()

# Sanity check of the labelling rule and the class balance
print(df.loc[:, ['best_of', 'duration', 'Overrunning']].head(30))
print(df['Overrunning'].value_counts())
df_encoded

    best_of  duration Overrunning
0         3     135.0         Yes
1         5     136.0          No
2         3      89.0          No
3         3      99.0          No
4         3      80.0          No
5         3     181.0         Yes
6         3      67.0          No
7         3      90.0          No
8         3      88.0          No
9         3      89.0          No
10        3      85.0          No
11        3      71.0          No
12        3     119.0         Yes
13        3      63.0          No
14        3     113.0         Yes
15        3      90.0          No
16        3      80.0          No
17        3     130.0         Yes
18        3      55.0          No
19        3     152.0         Yes
20        3      81.0          No
21        3     119.0         Yes
22        3      69.0          No
23        3      80.0          No
24        3      95.0          No
25        3     108.0         Yes
26        3      89.0          No
27        3     109.0         Yes
28        3   

Unnamed: 0,tourney_id,tourney_name,draw_size,tourney_date,match_num,player1_id,player1_seed,player1_entry,player1_name,player1_hand,player1_ht,player1_ioc,player1_age,player2_id,player2_seed,player2_entry,player2_name,player2_hand,player2_ht,player2_ioc,player2_age,best_of,round,duration,player1_rank,player2_rank,age_diff,age_avg,rank_diff,rank_avg,ht_diff,Overrunning,surface_Clay,surface_Grass,surface_Hard,tourney_level_D,tourney_level_F,tourney_level_G,tourney_level_M,seed_matchup_Both Unseeded,seed_matchup_Seeded vs Unseeded,entry_matchup_Both Special,entry_matchup_Regular vs Special
0,1975-2023,Fairfield,32,19750224,29,100058,,Regular,Roger Taylor,L,183.0,GBR,33.3,100272,,Regular,Sandy Mayer,R,178.0,USA,22.8,3,F,135.0,41.0,55.0,10.5,28.05,14.0,48.0,5.0,Yes,False,False,False,False,False,False,False,True,False,False,False
1,1987-414,Hamburg,56,19870427,55,101036,2.0,Regular,Miloslav Mecir,R,190.0,SVK,22.9,100656,1.0,Regular,Ivan Lendl,R,188.0,USA,27.1,5,F,136.0,1.0,5.0,4.2,25.00,4.0,3.0,2.0,No,True,False,False,False,False,False,False,False,False,False,False
2,1991-339,Adelaide,32,19901231,14,101441,,Regular,Martin Sinner,R,180.0,GER,22.8,101117,,Regular,Eric Jelen,R,180.0,GER,25.8,3,R32,89.0,111.0,68.0,3.0,24.30,43.0,89.5,0.0,No,False,False,True,False,False,False,False,True,False,False,False
3,1991-339,Adelaide,32,19901231,13,101440,7.0,Regular,Mark Koevermans,R,185.0,NED,22.9,101185,,WC,Mark Woodforde,L,188.0,AUS,25.2,3,R32,99.0,101.0,48.0,2.3,24.05,53.0,74.5,3.0,No,False,False,True,False,False,False,False,False,True,False,True
4,1991-339,Adelaide,32,19901231,12,101061,,Regular,Jimmy Arias,R,175.0,USA,26.3,101511,3.0,Regular,Horst Skoff,R,175.0,AUT,22.3,3,R32,80.0,60.0,26.0,4.0,24.30,34.0,43.0,0.0,No,False,False,True,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86198,2022-0605,Tour Finals,8,20221114,290,126774,2.0,Regular,Stefanos Tsitsipas,R,193.0,GRE,24.2,126094,6.0,Regular,Andrey Rublev,R,188.0,RUS,25.0,3,RR,102.0,7.0,3.0,0.8,24.60,4.0,5.0,5.0,Yes,False,False,True,False,True,False,False,False,False,False,False
86199,2022-0605,Tour Finals,8,20221114,289,126774,2.0,Regular,Stefanos Tsitsipas,R,193.0,GRE,24.2,104925,7.0,Regular,Novak Djokovic,R,188.0,SRB,35.4,3,RR,98.0,8.0,3.0,11.2,29.80,5.0,5.5,5.0,No,False,False,True,False,True,False,False,False,False,False,False
86200,2022-0605,Tour Finals,8,20221114,287,106421,4.0,Regular,Daniil Medvedev,R,198.0,RUS,26.7,104925,7.0,Regular,Novak Djokovic,R,188.0,SRB,35.4,3,RR,191.0,8.0,5.0,8.7,31.05,3.0,6.5,10.0,Yes,False,False,True,False,True,False,False,False,False,False,False
86201,2022-0605,Tour Finals,8,20221114,286,104925,7.0,Regular,Novak Djokovic,R,188.0,SRB,35.4,126094,6.0,Regular,Andrey Rublev,R,188.0,RUS,25.0,3,RR,67.0,8.0,7.0,10.4,30.20,1.0,7.5,0.0,No,False,False,True,False,True,False,False,False,False,False,False


## Performing Random Over Sampling to Address Class Imbalance 

In [5]:
# Replace spaces in column names (the dummy columns inherit them from the
# category labels) so every feature can be referenced by a simple identifier
df_encoded.columns = df_encoded.columns.str.replace(" ", "_")

# Define features
features = [
    "seed_matchup_Both_Unseeded",
    "seed_matchup_Seeded_vs_Unseeded",
    "entry_matchup_Both_Special",
    "entry_matchup_Regular_vs_Special",
    "surface_Clay",
    "surface_Grass",
    "surface_Hard",
    "tourney_level_D",
    "tourney_level_F",
    "tourney_level_G",
    "tourney_level_M",
    "age_diff",
    "rank_diff",
    "best_of"]

# Drop rows with a missing target or a missing value in any model feature
df_encoded = df_encoded.dropna(subset=['Overrunning'] + features)

# Convert target to numeric (Yes -> 1, No -> 0).
# Guarded so the cell can be re-run safely: mapping an already-numeric column
# a second time would turn every label into NaN and make astype(int) fail.
if not pd.api.types.is_integer_dtype(df_encoded['Overrunning']):
    df_encoded['Overrunning'] = df_encoded['Overrunning'].map({'Yes': 1, 'No': 0}).astype(int)

# Inspect remaining number of rows
print("Remaining rows:", df_encoded.shape[0])

# Inspect class distribution in entire dataset
print("Class distribution:\n", df_encoded['Overrunning'].value_counts())

# Define X and y
X = df_encoded[features]
y = df_encoded['Overrunning']

# Train-test split (70/30, stratified so both parts keep the class ratio)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=9009, stratify=y)

# Inspect class distribution in train set
print(f"Original training class distribution: {Counter(y_train)}")

# Perform oversampling on the training set only; the test set keeps its
# original class distribution
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Inspect class distribution in train set after oversampling
print(f"Resampled training class distribution: {Counter(y_resampled)}")

Remaining rows: 86203
Class distribution:
 Overrunning
0    55081
1    31122
Name: count, dtype: int64
Original training class distribution: Counter({0: 38557, 1: 21785})
Resampled training class distribution: Counter({0: 38557, 1: 38557})


## Logistic Regression Model

In [6]:
# Build the statsmodels design matrices: prepend an intercept column and cast
# every column to float
X_test_sm = sm.add_constant(X_test).astype(float)
X_resampled_sm = sm.add_constant(X_resampled).astype(float)

# Fit the logistic regression on the oversampled training data
logit_model = sm.Logit(endog=y_resampled, exog=X_resampled_sm).fit(maxiter=200)

# Coefficient table and fit statistics
print(logit_model.summary())


Optimization terminated successfully.
         Current function value: 0.684207
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:            Overrunning   No. Observations:                77114
Model:                          Logit   Df Residuals:                    77099
Method:                           MLE   Df Model:                           14
Date:                Sun, 24 Aug 2025   Pseudo R-squ.:                 0.01290
Time:                        02:05:42   Log-Likelihood:                -52762.
converged:                       True   LL-Null:                       -53451.
Covariance Type:            nonrobust   LLR p-value:                6.050e-286
                                       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
const                                0.8604      0.171      5.03

### Removing Insignificant Features 

In [7]:
# Reduced feature list: the earlier list without surface_Grass and tourney_level_G
features2 = [
    "seed_matchup_Both_Unseeded", "seed_matchup_Seeded_vs_Unseeded",
    "entry_matchup_Both_Special", "entry_matchup_Regular_vs_Special",
    "surface_Clay", "surface_Hard",
    "tourney_level_D", "tourney_level_F", "tourney_level_M",
    "age_diff", "rank_diff", "best_of",
]

# Rebuild target and design matrix from the reduced list
y = df_encoded['Overrunning']
X = df_encoded[features2]

# Stratified 70/30 split with the same seed as before
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=9009
)
print(f"Original training class distribution: {Counter(y_train)}")

# Balance the classes in the training part only
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
print(f"Resampled training class distribution: {Counter(y_resampled)}")

# Design matrices with an intercept column, cast to float for statsmodels
X_test_sm = sm.add_constant(X_test).astype(float)
X_resampled_sm = sm.add_constant(X_resampled).astype(float)

# Fit and summarise the logistic regression
logit_model = sm.Logit(endog=y_resampled, exog=X_resampled_sm).fit(maxiter=200)
print(logit_model.summary())

# Predicted probabilities on the untouched test set, thresholded at 0.5
y_prob = logit_model.predict(X_test_sm)
y_pred = (y_prob >= 0.5).astype(int)

# Evaluation on the test set
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
for label, scorer in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, scorer(y_test, y_pred))

Original training class distribution: Counter({0: 38557, 1: 21785})
Resampled training class distribution: Counter({0: 38557, 1: 38557})
Optimization terminated successfully.
         Current function value: 0.684215
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:            Overrunning   No. Observations:                77114
Model:                          Logit   Df Residuals:                    77101
Method:                           MLE   Df Model:                           12
Date:                Sun, 24 Aug 2025   Pseudo R-squ.:                 0.01289
Time:                        02:05:43   Log-Likelihood:                -52763.
converged:                       True   LL-Null:                       -53451.
Covariance Type:            nonrobust   LLR p-value:                9.387e-288
                                       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------

## Decision Tree 

In [8]:
# Baseline decision tree with Gini splits, fitted on the oversampled training data
dt_model = DecisionTreeClassifier(criterion="gini", random_state=42)
dt_model.fit(X_resampled, y_resampled)

# Predict on the original (not resampled) test set
y_pred = dt_model.predict(X_test)

# Evaluation on the test set
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
for label, scorer in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, scorer(y_test, y_pred))



Confusion Matrix:
 [[10724  5800]
 [ 5811  3526]]
Accuracy: 0.5510227756080585
Precision: 0.37808277932661377
Recall: 0.37763735675270427
F1 Score: 0.37785993677329477


### Implementing Grid Search for best Parameters

In [9]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters: tree depth, split/leaf size limits and
# cost-complexity pruning strength
param_grid = {
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10],
    'ccp_alpha': [0.0, 0.001, 0.01],
}

# Exhaustive search over the grid with 5-fold cross-validation, ranked by accuracy
# NOTE(review): the folds are cut from the already-oversampled data, so copies of
# the same minority row can land in both the training and validation folds.
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42, criterion="gini"),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_resampled, y_resampled)

# Report the winning combination and its mean cross-validated accuracy
print("Best Params:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


Best Params: {'ccp_alpha': 0.0, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Score: 0.5520788480620504


In [10]:
# Train Decision Tree with the best params found by the grid search.
# The values are read from grid_search.best_params_ instead of being copied by
# hand from the printed output, so this cell stays in sync if the grid or the
# data changes.
dt_best = DecisionTreeClassifier(
    criterion="gini",
    random_state=42,
    **grid_search.best_params_)

# Fit model on oversampled training data
dt_best.fit(X_resampled, y_resampled)

# Predict on original test set
y_pred = dt_best.predict(X_test)

# Evaluation
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))



Confusion Matrix:
 [[6907 9617]
 [3061 6276]]
Accuracy: 0.5097637369011252
Precision: 0.39489083244195555
Recall: 0.6721645068008997
F1 Score: 0.49750297265160526


## KNN

In [11]:
from sklearn.neighbors import KNeighborsClassifier

# Standardise the features: the scaler is fitted on the (oversampled) training
# data only and then reused unchanged on the test set
scaler = StandardScaler().fit(X_resampled)
X_resampled_scaled = scaler.transform(X_resampled)
X_test_scaled = scaler.transform(X_test)

# KNN with 5 neighbours, trained on the scaled training data
knn = KNeighborsClassifier(n_neighbors=5).fit(X_resampled_scaled, y_resampled)

# Predict on the scaled test set
y_pred = knn.predict(X_test_scaled)

# Evaluation on the test set
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
for label, scorer in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, scorer(y_test, y_pred))


Confusion Matrix:
 [[9338 7186]
 [4964 4373]]
Accuracy: 0.5301805807973397
Precision: 0.3783199238688468
Recall: 0.46835171896754846
F1 Score: 0.4185490045941807


### Finding best K value 

In [None]:
# cross_val_score is already imported in the notebook's import cell

# Range of K values to test
k_values = range(1, 31)
cv_scores = []

# Perform 5-fold cross-validation for each K.
# The standardised features (X_resampled_scaled, computed in the KNN cell above)
# are used so that K is tuned in the same feature space the final KNN model is
# trained on; the search previously ran on the unscaled X_resampled.
# NOTE(review): the folds are cut from oversampled data, so duplicated minority
# rows can appear on both sides of a fold, which favours very small K.
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_resampled_scaled, y_resampled, cv=5, scoring='accuracy')
    cv_scores.append(scores.mean())

# Find best K and report it (it was previously computed but never shown)
best_k = k_values[np.argmax(cv_scores)]
best_score = max(cv_scores)
print(f"Best K: {best_k} (mean CV accuracy: {best_score:.4f})")

# Plot results
plt.figure(figsize=(8, 5))
plt.plot(k_values, cv_scores, marker='o')
plt.xlabel("K Values")
plt.ylabel("Cross-Validation Accuracy")
plt.title("K vs Accuracy for KNN")
plt.xticks(k_values)
plt.grid(True)
plt.show()


In [None]:
# Range of K values to test
k_values = range(1, 31)
cv_scores = []
error_rates = []

# Perform 5-fold cross-validation for each K.
# The standardised features (X_resampled_scaled, computed in the KNN cell above)
# are used so that K is tuned in the same feature space the final KNN model is
# trained on; the search previously ran on the unscaled X_resampled.
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_resampled_scaled, y_resampled, cv=5, scoring='accuracy')
    mean_score = scores.mean()
    cv_scores.append(mean_score)
    error_rates.append(1 - mean_score)  # error rate is the complement of accuracy

# Find best K and report it (it was previously computed but never shown)
best_k = k_values[np.argmax(cv_scores)]
best_score = max(cv_scores)
print(f"Best K: {best_k} (mean CV accuracy: {best_score:.4f})")

# Plot Accuracy and Error Rate
plt.figure(figsize=(10, 6))
plt.plot(k_values, cv_scores, marker='o', label="Accuracy")
plt.plot(k_values, error_rates, marker='x', label="Error Rate", linestyle="--", color="red")
plt.xlabel("K Values")
plt.ylabel("Score")
plt.title("K vs Accuracy & Error Rate for KNN")
plt.xticks(k_values)
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# Standardise the features: fit the scaler on the (oversampled) training data
# and apply the same transformation to the test set
scaler = StandardScaler().fit(X_resampled)
X_resampled_scaled = scaler.transform(X_resampled)
X_test_scaled = scaler.transform(X_test)

# KNN with 6 neighbours, trained on the scaled training data
knn = KNeighborsClassifier(n_neighbors=6).fit(X_resampled_scaled, y_resampled)

# Predict on the scaled test set
y_pred = knn.predict(X_test_scaled)

# Evaluation on the test set
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
for label, scorer in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, scorer(y_test, y_pred))