# Models and hypotheses

In [38]:
# Core libraries
import pandas as pd
import numpy as np
import time
import warnings
warnings.filterwarnings('ignore')

# Visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Data preprocessing
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

# Model selection and tuning
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.calibration import CalibratedClassifierCV

# Evaluation metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    classification_report, roc_curve, precision_recall_curve,
    confusion_matrix, mean_squared_error
)

# Machine learning libraries
from catboost import CatBoostClassifier
import catboost as cb
import xgboost as xgb

# Statistical tools
from scipy.stats import ks_2samp

print("Libraries imported successfully")

df = pd.read_csv('tennis_data_cleaned.csv')
df['Year'] = pd.to_datetime(df['Date']).dt.year

Libraries imported successfully


In [39]:
df_new = pd.read_csv('tennis_data_cleaned.csv')
print("=== Missing values in the data ===")
missing_values = df_new.isnull().sum()
missing_percentage = (missing_values / len(df_new)) * 100
missing_df = pd.DataFrame({'Missing Values': missing_values, 'Percentage': missing_percentage})
print(missing_df[missing_df['Missing Values'] > 0])

=== Missing values in the data ===
    Missing Values  Percentage
W3           18994   52.902184
L3           18994   52.902184
W4           32480   90.463458
L4           32480   90.463458
W5           34615   96.409871
L5           34615   96.409871


## Models

### Lasso

In [26]:
# Data preparation
df_ml = df.reset_index(drop=True)

np.random.seed(42)
mask = np.random.random(len(df_ml)) > 0.5
df_ml['player_1_height'] = np.where(mask, df_ml['pl1_height'], df_ml['pl2_height'])
df_ml['player_2_height'] = np.where(~mask, df_ml['pl1_height'], df_ml['pl2_height'])
df_ml['player_1_weight'] = np.where(mask, df_ml['pl1_weight'], df_ml['pl2_weight'])
df_ml['player_2_weight'] = np.where(~mask, df_ml['pl1_weight'], df_ml['pl2_weight'])
df_ml['player_1_hand'] = np.where(mask, df_ml['pl1_hand'], df_ml['pl2_hand'])
df_ml['player_2_hand'] = np.where(~mask, df_ml['pl1_hand'], df_ml['pl2_hand'])
df_ml['player_1_flag'] = np.where(mask, df_ml['pl1_flag'], df_ml['pl2_flag'])
df_ml['player_2_flag'] = np.where(~mask, df_ml['pl1_flag'], df_ml['pl2_flag'])
df_ml['player_1_year_pro'] = np.where(mask, df_ml['pl1_year_pro'], df_ml['pl2_year_pro'])
df_ml['player_2_year_pro'] = np.where(~mask, df_ml['pl1_year_pro'], df_ml['pl2_year_pro'])
df_ml['player_1_rank'] = np.where(mask, df_ml['WRank'], df_ml['LRank'])
df_ml['player_2_rank'] = np.where(~mask, df_ml['WRank'], df_ml['LRank'])
df_ml['Winner_Indicator'] = np.where(mask, 1, 0)

# Feature engineering
df_ml['height_diff'] = df_ml['player_1_height'] - df_ml['player_2_height']
df_ml['weight_diff'] = df_ml['player_1_weight'] - df_ml['player_2_weight']
df_ml['rank_diff'] = df_ml['player_1_rank'] - df_ml['player_2_rank']
df_ml['year_pro_diff'] = df_ml['player_1_year_pro'] - df_ml['player_2_year_pro']
df_ml['height_weight_interaction'] = df_ml['height_diff'] * df_ml['weight_diff']
df_ml['best_of'] = df_ml['Best of'].fillna(df_ml['Best of'].median())
df_ml['year'] = df_ml['Year'].fillna(df_ml['Year'].median())
surface_dummies = pd.get_dummies(df_ml['Surface'].astype(str).fillna('Unknown'), prefix='surface')
court_dummies = pd.get_dummies(df_ml['Court'].astype(str).fillna('Unknown'), prefix='court')
df_ml = pd.concat([df_ml, surface_dummies, court_dummies], axis=1)
for col in surface_dummies.columns:
    df_ml[f'rank_{col}_interaction'] = df_ml['rank_diff'] * df_ml[col]
for col in court_dummies.columns:
    df_ml[f'rank_{col}_interaction'] = df_ml['rank_diff'] * df_ml[col]

le = LabelEncoder()
for col in ['player_1_hand', 'player_2_hand', 'Surface', 'Series', 'Round', 'player_1_flag', 'player_2_flag', 'Court']:
    df_ml[col] = le.fit_transform(df_ml[col].astype(str).fillna('Unknown'))

# Polynomial features with unique names
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
poly_features = poly.fit_transform(df_ml[['height_diff', 'weight_diff', 'rank_diff']])
poly_feature_names = poly.get_feature_names_out(['height_diff', 'weight_diff', 'rank_diff'])
poly_columns = [f'poly_{name.replace(" ", "_")}' for name in poly_feature_names]
if len(poly_columns) != len(set(poly_columns)):
    raise ValueError(f"Duplicates found in polynomial feature names: {poly_columns}")
df_poly = pd.DataFrame(poly_features, columns=poly_columns, index=df_ml.index)
df_ml = pd.concat([df_ml, df_poly], axis=1)

# Forming feature list
base_features = ['year_pro_diff', 'best_of', 'year', 'player_1_hand', 'player_2_hand', 'Surface', 'Series', 'Round', 'player_1_flag', 'player_2_flag', 'Court', 'height_weight_interaction']
interaction_features = [col for col in df_ml.columns if 'rank_' in col and 'interaction' in col]
poly_features = list(poly_columns)
features = base_features + interaction_features + poly_features
if len(features) != len(set(features)):
    raise ValueError(f"Duplicates found in feature list: {features}")
df_ml_features = df_ml[features + ['Winner_Indicator', 'AvgW', 'LRank']].dropna()

# Model training
X = df_ml_features.drop(columns=['Winner_Indicator', 'AvgW', 'LRank'])
y = df_ml_features['Winner_Indicator']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
numeric_cols = [col for col in features if col not in ['player_1_hand', 'player_2_hand', 'Surface', 'Series', 'Round', 'player_1_flag', 'player_2_flag', 'Court']]
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1'], 'solver': ['liblinear']}
lasso_model = GridSearchCV(LogisticRegression(random_state=42, max_iter=1000), param_grid, cv=5, scoring='accuracy')
lasso_model.fit(X_train, y_train)
y_pred_lasso = lasso_model.predict(X_test)
y_pred_prob_lasso = lasso_model.predict_proba(X_test)[:, 1]

# Calibration
lasso_calibrated = CalibratedClassifierCV(lasso_model.best_estimator_, method='isotonic', cv=5)
lasso_calibrated.fit(X_train, y_train)
y_pred_prob_lasso_cal = lasso_calibrated.predict_proba(X_test)[:, 1]

# Metrics
accuracy_lasso = accuracy_score(y_test, y_pred_lasso)
precision_lasso = precision_score(y_test, y_pred_lasso)
recall_lasso = recall_score(y_test, y_pred_lasso)
f1_lasso = f1_score(y_test, y_pred_lasso)
auc_lasso = roc_auc_score(y_test, y_pred_prob_lasso_cal)

print("Best Parameters (Lasso):", lasso_model.best_params_)
print("Model Evaluation (Lasso):")
print(f"Accuracy: {accuracy_lasso:.2f}")
print(f"Precision: {precision_lasso:.2f}")
print(f"Recall: {recall_lasso:.2f}")
print(f"F1-Score: {f1_lasso:.2f}")
print(f"AUC: {auc_lasso:.2f}")
print("\nFull Classification Report:")
print(classification_report(y_test, y_pred_lasso, target_names=['Player 2 Wins', 'Player 1 Wins']))

# Feature importance
feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': np.abs(lasso_model.best_estimator_.coef_[0])
})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
print("\nFeature Importance (Lasso):")
print(feature_importance.head(10))

# Visualizing feature importance
fig_importance = px.bar(feature_importance.head(10), x='Importance', y='Feature', orientation='h',
                        title='Top 10 Feature Importance (Lasso)', color='Importance', color_continuous_scale='Blues')
fig_importance.update_layout(template='plotly_dark')
fig_importance.show()

# Plots
fpr, tpr, _ = roc_curve(y_test, y_pred_prob_lasso_cal)
precision, recall, _ = precision_recall_curve(y_test, y_pred_prob_lasso_cal)
cm = confusion_matrix(y_test, y_pred_lasso)

# ROC Curve
fig1 = go.Figure()
fig1.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'ROC Curve (AUC = {auc_lasso:.2f}', line=dict(color='lightblue')))
fig1.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', line=dict(dash='dash', color='white'), name='Random'))
fig1.update_layout(title='ROC Curve', xaxis_title='False Positive Rate', yaxis_title='True Positive Rate', template='plotly_dark')
fig1.show()

# Precision-Recall Curve
fig2 = go.Figure()
fig2.add_trace(go.Scatter(x=recall, y=precision, mode='lines', name='Precision-Recall Curve', line=dict(color='lightblue')))
fig2.update_layout(title='Precision-Recall Curve', xaxis_title='Recall', yaxis_title='Precision', template='plotly_dark')
fig2.show()

# Confusion Matrix
fig3 = px.imshow(cm, text_auto=True, aspect="auto", labels=dict(x="Predicted", y="True", color="Count"),
                 x=['Player 2 Wins', 'Player 1 Wins'], y=['Player 2 Wins', 'Player 1 Wins'], color_continuous_scale='Blues')
fig3.update_layout(title='Confusion Matrix', template='plotly_dark')
fig3.show()

Best Parameters (Lasso): {'C': 0.001, 'penalty': 'l1', 'solver': 'liblinear'}
Model Evaluation (Lasso):
Accuracy: 0.65
Precision: 0.64
Recall: 0.67
F1-Score: 0.66
AUC: 0.69

Full Classification Report:
               precision    recall  f1-score   support

Player 2 Wins       0.66      0.62      0.64      3600
Player 1 Wins       0.64      0.67      0.66      3581

     accuracy                           0.65      7181
    macro avg       0.65      0.65      0.65      7181
 weighted avg       0.65      0.65      0.65      7181


Feature Importance (Lasso):
                           Feature  Importance
19                poly_weight_diff    0.011382
20                  poly_rank_diff    0.004144
15   rank_surface_Hard_interaction    0.001969
17  rank_court_Outdoor_interaction    0.000909
18                poly_height_diff    0.000797
13   rank_surface_Clay_interaction    0.000573
9                    player_2_flag    0.000296
25      poly_weight_diff_rank_diff    0.000035
0            

### XGBoost

In [27]:
# Data preparation
df_ml = df.reset_index(drop=True)
np.random.seed(42)
mask = np.random.random(len(df_ml)) > 0.5
df_ml['player_1_height'] = np.where(mask, df_ml['pl1_height'], df_ml['pl2_height'])
df_ml['player_2_height'] = np.where(~mask, df_ml['pl1_height'], df_ml['pl2_height'])
df_ml['player_1_weight'] = np.where(mask, df_ml['pl1_weight'], df_ml['pl2_weight'])
df_ml['player_2_weight'] = np.where(~mask, df_ml['pl1_weight'], df_ml['pl2_weight'])
df_ml['player_1_hand'] = np.where(mask, df_ml['pl1_hand'], df_ml['pl2_hand'])
df_ml['player_2_hand'] = np.where(~mask, df_ml['pl1_hand'], df_ml['pl2_hand'])
df_ml['player_1_flag'] = np.where(mask, df_ml['pl1_flag'], df_ml['pl2_flag'])
df_ml['player_2_flag'] = np.where(~mask, df_ml['pl1_flag'], df_ml['pl2_flag'])
df_ml['player_1_year_pro'] = np.where(mask, df_ml['pl1_year_pro'], df_ml['pl2_year_pro'])
df_ml['player_2_year_pro'] = np.where(~mask, df_ml['pl1_year_pro'], df_ml['pl2_year_pro'])
df_ml['player_1_rank'] = np.where(mask, df_ml['WRank'], df_ml['LRank'])
df_ml['player_2_rank'] = np.where(~mask, df_ml['WRank'], df_ml['LRank'])
df_ml['Winner_Indicator'] = np.where(mask, 1, 0)

# Feature engineering
df_ml['height_diff'] = df_ml['player_1_height'] - df_ml['player_2_height']
df_ml['weight_diff'] = df_ml['player_1_weight'] - df_ml['player_2_weight']
df_ml['rank_diff'] = df_ml['player_1_rank'] - df_ml['player_2_rank']
df_ml['year_pro_diff'] = df_ml['player_1_year_pro'] - df_ml['player_2_year_pro']
df_ml['height_weight_interaction'] = df_ml['height_diff'] * df_ml['weight_diff']
df_ml['best_of'] = df_ml['Best of'].fillna(df_ml['Best of'].median())
df_ml['year'] = df_ml['Year'].fillna(df_ml['Year'].median())
surface_dummies = pd.get_dummies(df_ml['Surface'].astype(str).fillna('Unknown'), prefix='surface')
court_dummies = pd.get_dummies(df_ml['Court'].astype(str).fillna('Unknown'), prefix='court')
df_ml = pd.concat([df_ml, surface_dummies, court_dummies], axis=1)
for col in surface_dummies.columns:
    df_ml[f'rank_{col}_interaction'] = df_ml['rank_diff'] * df_ml[col]
for col in court_dummies.columns:
    df_ml[f'rank_{col}_interaction'] = df_ml['rank_diff'] * df_ml[col]

le = LabelEncoder()
for col in ['player_1_hand', 'player_2_hand', 'Surface', 'Series', 'Round', 'player_1_flag', 'player_2_flag', 'Court']:
    df_ml[col] = le.fit_transform(df_ml[col].astype(str).fillna('Unknown'))

# Polynomial features with unique names
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
poly_features = poly.fit_transform(df_ml[['height_diff', 'weight_diff', 'rank_diff']])
poly_feature_names = poly.get_feature_names_out(['height_diff', 'weight_diff', 'rank_diff'])
poly_columns = [f'poly_{name.replace(" ", "_")}' for name in poly_feature_names]
if len(poly_columns) != len(set(poly_columns)):
    raise ValueError(f"Duplicates found in polynomial feature names: {poly_columns}")
df_poly = pd.DataFrame(poly_features, columns=poly_columns, index=df_ml.index)
df_ml = pd.concat([df_ml, df_poly], axis=1)

# Forming feature list
base_features = ['year_pro_diff', 'best_of', 'year', 'player_1_hand', 'player_2_hand', 'Surface', 'Series', 'Round', 'player_1_flag', 'player_2_flag', 'Court', 'height_weight_interaction']
interaction_features = [col for col in df_ml.columns if 'rank_' in col and 'interaction' in col]
poly_features = list(poly_columns)
features = base_features + interaction_features + poly_features
if len(features) != len(set(features)):
    raise ValueError(f"Duplicates found in feature list: {features}")
df_ml_features = df_ml[features + ['Winner_Indicator', 'AvgW', 'LRank']].dropna()

# Model training
X = df_ml_features.drop(columns=['Winner_Indicator', 'AvgW', 'LRank'])
y = df_ml_features['Winner_Indicator']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
numeric_cols = [col for col in features if col not in ['player_1_hand', 'player_2_hand', 'Surface', 'Series', 'Round', 'player_1_flag', 'player_2_flag', 'Court']]
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

# Converting to numpy array
X_train_np = X_train.to_numpy()
X_test_np = X_test.to_numpy()

class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(enumerate(class_weights))

param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'scale_pos_weight': [class_weights[1]/class_weights[0]]
}
xgb_model = RandomizedSearchCV(xgb.XGBClassifier(random_state=42, eval_metric='logloss'), param_dist, n_iter=10, cv=5, scoring='roc_auc', random_state=42, error_score='raise')
xgb_model.fit(X_train_np, y_train)
y_pred_xgb = xgb_model.predict(X_test_np)
y_pred_prob_xgb = xgb_model.predict_proba(X_test_np)[:, 1]

# Calibration
xgb_calibrated = CalibratedClassifierCV(xgb_model.best_estimator_, method='isotonic', cv=5)
xgb_calibrated.fit(X_train_np, y_train)
y_pred_prob_xgb_cal = xgb_calibrated.predict_proba(X_test_np)[:, 1]

# Metrics
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)
auc_xgb = roc_auc_score(y_test, y_pred_prob_xgb_cal)

print("Best Parameters (XGBoost):", xgb_model.best_params_)
print("Model Evaluation (XGBoost):")
print(f"Accuracy: {accuracy_xgb:.2f}")
print(f"Precision: {precision_xgb:.2f}")
print(f"Recall: {recall_xgb:.2f}")
print(f"F1-Score: {f1_xgb:.2f}")
print(f"AUC: {auc_xgb:.2f}")
print("\nFull Classification Report:")
print(classification_report(y_test, y_pred_xgb, target_names=['Player 2 Wins', 'Player 1 Wins']))

# Feature importance
feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': xgb_model.best_estimator_.feature_importances_
})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
print("\nFeature Importance (XGBoost):")
print(feature_importance.head(10))

# Visualizing feature importance
fig_importance = px.bar(feature_importance.head(10), x='Importance', y='Feature', orientation='h',
                        title='Top 10 Feature Importance (XGBoost)', color='Importance', color_continuous_scale='Blues')
fig_importance.update_layout(template='plotly_dark')
fig_importance.show()

# Plots
fpr, tpr, _ = roc_curve(y_test, y_pred_prob_xgb_cal)
precision, recall, _ = precision_recall_curve(y_test, y_pred_prob_xgb_cal)
cm = confusion_matrix(y_test, y_pred_xgb)

# ROC Curve
fig1 = go.Figure()
fig1.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'ROC Curve (AUC = {auc_xgb:.2f}', line=dict(color='lightblue')))
fig1.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', line=dict(dash='dash', color='white'), name='Random'))
fig1.update_layout(title='ROC Curve', xaxis_title='False Positive Rate', yaxis_title='True Positive Rate', template='plotly_dark')
fig1.show()

# Precision-Recall Curve
fig2 = go.Figure()
fig2.add_trace(go.Scatter(x=recall, y=precision, mode='lines', name='Precision-Recall Curve', line=dict(color='lightblue')))
fig2.update_layout(title='Precision-Recall Curve', xaxis_title='Recall', yaxis_title='Precision', template='plotly_dark')
fig2.show()

# Confusion Matrix
fig3 = px.imshow(cm, text_auto=True, aspect="auto", labels=dict(x="Predicted", y="True", color="Count"),
                 x=['Player 2 Wins', 'Player 1 Wins'], y=['Player 2 Wins', 'Player 1 Wins'], color_continuous_scale='Blues')
fig3.update_layout(title='Confusion Matrix', template='plotly_dark')
fig3.show()

Best Parameters (XGBoost): {'scale_pos_weight': 0.9964551331062764, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.1}
Model Evaluation (XGBoost):
Accuracy: 0.66
Precision: 0.66
Recall: 0.65
F1-Score: 0.65
AUC: 0.72

Full Classification Report:
               precision    recall  f1-score   support

Player 2 Wins       0.65      0.66      0.66      3600
Player 1 Wins       0.66      0.65      0.65      3581

     accuracy                           0.66      7181
    macro avg       0.66      0.66      0.66      7181
 weighted avg       0.66      0.66      0.66      7181


Feature Importance (XGBoost):
                           Feature  Importance
20                  poly_rank_diff    0.488670
1                          best_of    0.051958
7                            Round    0.043823
6                           Series    0.043562
25      poly_weight_diff_rank_diff    0.042200
8                    player_1_flag    0.028809
9                    player_2_flag    0.027240
17  ran

### Catboost

In [28]:
# Measure execution time
start_time = time.time()

# Data preparation
df_ml = df.reset_index(drop=True)
np.random.seed(42)
mask = np.random.random(len(df_ml)) > 0.5
df_ml['player_1_height'] = np.where(mask, df_ml['pl1_height'], df_ml['pl2_height'])
df_ml['player_2_height'] = np.where(~mask, df_ml['pl1_height'], df_ml['pl2_height'])
df_ml['player_1_weight'] = np.where(mask, df_ml['pl1_weight'], df_ml['pl2_weight'])
df_ml['player_2_weight'] = np.where(~mask, df_ml['pl1_weight'], df_ml['pl2_weight'])
df_ml['player_1_hand'] = np.where(mask, df_ml['pl1_hand'], df_ml['pl2_hand'])
df_ml['player_2_hand'] = np.where(~mask, df_ml['pl1_hand'], df_ml['pl2_hand'])
df_ml['player_1_flag'] = np.where(mask, df_ml['pl1_flag'], df_ml['pl2_flag'])
df_ml['player_2_flag'] = np.where(~mask, df_ml['pl1_flag'], df_ml['pl2_flag'])
df_ml['player_1_year_pro'] = np.where(mask, df_ml['pl1_year_pro'], df_ml['pl2_year_pro'])
df_ml['player_2_year_pro'] = np.where(~mask, df_ml['pl1_year_pro'], df_ml['pl2_year_pro'])
df_ml['player_1_rank'] = np.where(mask, df_ml['WRank'], df_ml['LRank'])
df_ml['player_2_rank'] = np.where(~mask, df_ml['WRank'], df_ml['LRank'])
df_ml['Winner_Indicator'] = np.where(mask, 1, 0)

# Feature engineering
df_ml['height_diff'] = df_ml['player_1_height'] - df_ml['player_2_height']
df_ml['weight_diff'] = df_ml['player_1_weight'] - df_ml['player_2_weight']
df_ml['rank_diff'] = df_ml['player_1_rank'] - df_ml['player_2_rank']
df_ml['year_pro_diff'] = df_ml['player_1_year_pro'] - df_ml['player_2_year_pro']
df_ml['height_weight_interaction'] = df_ml['height_diff'] * df_ml['weight_diff']
df_ml['best_of'] = df_ml['Best of'].fillna(df_ml['Best of'].median())
df_ml['year'] = df_ml['Year'].fillna(df_ml['Year'].median())
df_ml['custom_log_rank_diff'] = np.log1p(df_ml['rank_diff'].abs()) * np.sign(df_ml['rank_diff'])
df_ml['custom_rank_year_pro_interaction'] = df_ml['rank_diff'] * df_ml['year_pro_diff']
surface_dummies = pd.get_dummies(df_ml['Surface'].astype(str).fillna('Unknown'), prefix='surface')
court_dummies = pd.get_dummies(df_ml['Court'].astype(str).fillna('Unknown'), prefix='court')
df_ml = pd.concat([df_ml, surface_dummies, court_dummies], axis=1)
for col in surface_dummies.columns:
    df_ml[f'custom_rank_{col}_interaction'] = df_ml['rank_diff'] * df_ml[col]
for col in court_dummies.columns:
    df_ml[f'custom_rank_{col}_interaction'] = df_ml['rank_diff'] * df_ml[col]

le = LabelEncoder()
for col in ['player_1_hand', 'player_2_hand', 'Surface', 'Series', 'Round', 'player_1_flag', 'player_2_flag', 'Court']:
    df_ml[col] = le.fit_transform(df_ml[col].astype(str).fillna('Unknown'))

# Forming feature list
base_features = ['year_pro_diff', 'best_of', 'year', 'player_1_hand', 'player_2_hand', 'Surface', 'Series', 'Round', 'player_1_flag', 'player_2_flag', 'Court', 'height_weight_interaction', 'custom_log_rank_diff', 'custom_rank_year_pro_interaction']
interaction_features = [col for col in df_ml.columns if 'custom_rank_' in col and 'interaction' in col]
features = list(dict.fromkeys(base_features + interaction_features))

# Checking for duplicates
if len(features) != len(set(features)):
    duplicates = [item for item in set(features) if features.count(item) > 1]
    raise ValueError(f"Duplicates found in feature list: {duplicates}")
df_ml_features = df_ml[features + ['Winner_Indicator', 'AvgW', 'LRank']].dropna()
print(f"Rows after .dropna(): {len(df_ml_features)}")

# Model training
X = df_ml_features.drop(columns=['Winner_Indicator', 'AvgW', 'LRank'])
y = df_ml_features['Winner_Indicator']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
numeric_cols = [col for col in features if col not in ['player_1_hand', 'player_2_hand', 'Surface', 'Series', 'Round', 'player_1_flag', 'player_2_flag', 'Court']]
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

# Simplified hyperparameter search
param_dist = {
    'iterations': [300, 400, 500],
    'depth': [10, 12],
    'learning_rate': [0.05],
    'l2_leaf_reg': [5],
    'auto_class_weights': ['Balanced']
}
cat_model = RandomizedSearchCV(cb.CatBoostClassifier(random_state=42, logging_level='Silent', early_stopping_rounds=20),
                              param_dist, n_iter=5, cv=3, scoring='accuracy', random_state=42)
cat_model.fit(X_train, y_train)
y_pred_cat = cat_model.predict(X_test)
y_pred_prob_cat = cat_model.predict_proba(X_test)[:, 1]

# Calibration
cat_calibrated = CalibratedClassifierCV(cat_model.best_estimator_, method='isotonic', cv=5)
cat_calibrated.fit(X_train, y_train)
y_pred_prob_cat_cal = cat_calibrated.predict_proba(X_test)[:, 1]

# Metrics
accuracy_cat = accuracy_score(y_test, y_pred_cat)
precision_cat = precision_score(y_test, y_pred_cat)
recall_cat = recall_score(y_test, y_pred_cat)
f1_cat = f1_score(y_test, y_pred_cat)
auc_cat = roc_auc_score(y_test, y_pred_prob_cat_cal)

print("Best Parameters (CatBoost):", cat_model.best_params_)
print("Model Evaluation (CatBoost):")
print(f"Accuracy: {accuracy_cat:.2f}")
print(f"Precision: {precision_cat:.2f}")
print(f"Recall: {recall_cat:.2f}")
print(f"F1-Score: {f1_cat:.2f}")
print(f"AUC: {auc_cat:.2f}")
print("\nFull Classification Report:")
print(classification_report(y_test, y_pred_cat, target_names=['Player 2 Wins', 'Player 1 Wins']))

# Feature importance
feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': cat_model.best_estimator_.get_feature_importance()
})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
print("\nFeature Importance (CatBoost):")
print(feature_importance.head(10))

# Visualizing feature importance
fig_importance = px.bar(feature_importance.head(10), x='Importance', y='Feature', orientation='h',
                        title='Top 10 Feature Importance (CatBoost)', color='Importance', color_continuous_scale='Blues')
fig_importance.update_layout(template='plotly_dark')
fig_importance.show()

# Plots
fpr, tpr, _ = roc_curve(y_test, y_pred_prob_cat_cal)
precision, recall, _ = precision_recall_curve(y_test, y_pred_prob_cat_cal)
cm = confusion_matrix(y_test, y_pred_cat)

fig1 = go.Figure()
fig1.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'ROC Curve (AUC = {auc_cat:.2f}', line=dict(color='lightblue')))
fig1.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', line=dict(dash='dash', color='white'), name='Random'))
fig1.update_layout(title='ROC Curve', xaxis_title='False Positive Rate', yaxis_title='True Positive Rate', template='plotly_dark')
fig1.show()

fig2 = go.Figure()
fig2.add_trace(go.Scatter(x=recall, y=precision, mode='lines', name='Precision-Recall Curve', line=dict(color='lightblue')))
fig2.update_layout(title='Precision-Recall Curve', xaxis_title='Recall', yaxis_title='Precision', template='plotly_dark')
fig2.show()

fig3 = px.imshow(cm, text_auto=True, aspect="auto", labels=dict(x="Predicted", y="True", color="Count"),
                 x=['Player 2 Wins', 'Player 1 Wins'], y=['Player 2 Wins', 'Player 1 Wins'], color_continuous_scale='Blues')
fig3.update_layout(title='Confusion Matrix', template='plotly_dark')
fig3.show()

# Output execution time
end_time = time.time()
print(f"Total execution time: {(end_time - start_time)/60:.2f} minutes")

Rows after .dropna(): 35904
Best Parameters (CatBoost): {'learning_rate': 0.05, 'l2_leaf_reg': 5, 'iterations': 300, 'depth': 10, 'auto_class_weights': 'Balanced'}
Model Evaluation (CatBoost):
Accuracy: 0.66
Precision: 0.66
Recall: 0.65
F1-Score: 0.66
AUC: 0.72

Full Classification Report:
               precision    recall  f1-score   support

Player 2 Wins       0.66      0.67      0.66      3600
Player 1 Wins       0.66      0.65      0.66      3581

     accuracy                           0.66      7181
    macro avg       0.66      0.66      0.66      7181
 weighted avg       0.66      0.66      0.66      7181


Feature Importance (CatBoost):
                                  Feature  Importance
8                           player_1_flag   11.860842
9                           player_2_flag   10.584186
12                   custom_log_rank_diff    9.505924
11              height_weight_interaction    8.894814
2                                    year    8.625935
7                   

Total execution time: 10.37 minutes


### CatBoost Extended

In [29]:
# Measure execution time
start_time = time.time()

# Check for required columns
required_cols = ['pl1_height', 'pl2_height', 'pl1_weight', 'pl2_weight', 'pl1_hand', 'pl2_hand', 'pl1_flag', 'pl2_flag',
                 'pl1_year_pro', 'pl2_year_pro', 'WRank', 'LRank', 'B365W', 'B365L', 'AvgW', 'AvgL', 'Surface', 'Court',
                 'Series', 'Round', 'Best of', 'Year']
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    raise ValueError(f"Missing columns in dataset: {missing_cols}")

# Data preparation
df_ml = df.reset_index(drop=True)
np.random.seed(42)
mask = np.random.random(len(df_ml)) > 0.5
df_ml['player_1_height'] = np.where(mask, df_ml['pl1_height'], df_ml['pl2_height'])
df_ml['player_2_height'] = np.where(~mask, df_ml['pl1_height'], df_ml['pl2_height'])
df_ml['player_1_weight'] = np.where(mask, df_ml['pl1_weight'], df_ml['pl2_weight'])
df_ml['player_2_weight'] = np.where(~mask, df_ml['pl1_weight'], df_ml['pl2_weight'])
df_ml['player_1_hand'] = np.where(mask, df_ml['pl1_hand'], df_ml['pl2_hand'])
df_ml['player_2_hand'] = np.where(~mask, df_ml['pl1_hand'], df_ml['pl2_hand'])
df_ml['player_1_flag'] = np.where(mask, df_ml['pl1_flag'], df_ml['pl2_flag'])
df_ml['player_2_flag'] = np.where(~mask, df_ml['pl1_flag'], df_ml['pl2_flag'])
df_ml['player_1_year_pro'] = np.where(mask, df_ml['pl1_year_pro'], df_ml['pl2_year_pro'])
df_ml['player_2_year_pro'] = np.where(~mask, df_ml['pl1_year_pro'], df_ml['pl2_year_pro'])
df_ml['player_1_rank'] = np.where(mask, df_ml['WRank'], df_ml['LRank'])
df_ml['player_2_rank'] = np.where(~mask, df_ml['WRank'], df_ml['LRank'])
df_ml['player_1_B365'] = np.where(mask, df_ml['B365W'], df_ml['B365L'])
df_ml['player_2_B365'] = np.where(~mask, df_ml['B365W'], df_ml['B365L'])
df_ml['player_1_Avg'] = np.where(mask, df_ml['AvgW'], df_ml['AvgL'])
df_ml['player_2_Avg'] = np.where(~mask, df_ml['AvgW'], df_ml['AvgL'])
df_ml['Winner_Indicator'] = np.where(mask, 1, 0)

# Calculate years since professional debut
df_ml['year'] = df_ml['Year'].fillna(df_ml['Year'].median())
df_ml['player_1_years_since_pro'] = df_ml['year'] - df_ml['player_1_year_pro']
df_ml['player_2_years_since_pro'] = df_ml['year'] - df_ml['player_2_year_pro']

# Function to calculate bookmaker probabilities
def calculate_bookmaker_probs(df):
    df['B365W_prob'] = 1 / df['player_1_B365'] / (1 / df['player_1_B365'] + 1 / df['player_2_B365'])
    df['AvgW_prob'] = 1 / df['player_1_Avg'] / (1 / df['player_1_Avg'] + 1 / df['player_2_Avg'])
    return df

df_ml = calculate_bookmaker_probs(df_ml)

# Feature engineering
df_ml['height_diff'] = df_ml['player_1_height'] - df_ml['player_2_height']
df_ml['weight_diff'] = df_ml['player_1_weight'] - df_ml['player_2_weight']
df_ml['rank_diff'] = df_ml['player_1_rank'] - df_ml['player_2_rank']
df_ml['year_pro_diff'] = df_ml['player_1_year_pro'] - df_ml['player_2_year_pro']
df_ml['height_weight_interaction'] = df_ml['height_diff'] * df_ml['weight_diff']
df_ml['best_of'] = df_ml['Best of'].fillna(df_ml['Best of'].median())
df_ml['custom_log_rank_diff'] = np.log1p(df_ml['rank_diff'].abs()) * np.sign(df_ml['rank_diff'])
df_ml['custom_rank_year_pro_interaction'] = df_ml['rank_diff'] * df_ml['year_pro_diff']
df_ml['bet_diff_B365'] = df_ml['player_1_B365'] - df_ml['player_2_B365']
df_ml['bet_diff_Avg'] = df_ml['player_1_Avg'] - df_ml['player_2_Avg']
df_ml['log_B365_ratio'] = np.log1p(df_ml['player_1_B365'] / df_ml['player_2_B365'])
df_ml['log_Avg_ratio'] = np.log1p(df_ml['player_1_Avg'] / df_ml['player_2_Avg'])
df_ml['bet_rank_interaction_B365'] = df_ml['bet_diff_B365'] * df_ml['rank_diff']
df_ml['bet_rank_interaction_Avg'] = df_ml['bet_diff_Avg'] * df_ml['rank_diff']

# Dummy variables
surface_dummies = pd.get_dummies(df_ml['Surface'].astype(str).fillna('Unknown'), prefix='surface')
court_dummies = pd.get_dummies(df_ml['Court'].astype(str).fillna('Unknown'), prefix='court')
df_ml = pd.concat([df_ml, surface_dummies, court_dummies], axis=1)
for col in surface_dummies.columns:
    df_ml[f'custom_rank_{col}_interaction'] = df_ml['rank_diff'] * df_ml[col]
for col in court_dummies.columns:
    df_ml[f'custom_rank_{col}_interaction'] = df_ml['rank_diff'] * df_ml[col]

# Encoding categorical features
le = LabelEncoder()
for col in ['player_1_hand', 'player_2_hand', 'Surface', 'Series', 'Round', 'player_1_flag', 'player_2_flag', 'Court']:
    df_ml[col] = le.fit_transform(df_ml[col].astype(str).fillna('Unknown'))

# Forming feature list
base_features = ['year_pro_diff', 'best_of', 'player_1_hand', 'player_2_hand', 'Surface', 'Series', 'Round',
                 'player_1_flag', 'player_2_flag', 'Court', 'height_weight_interaction', 'custom_log_rank_diff',
                 'custom_rank_year_pro_interaction', 'player_1_B365', 'player_2_B365', 'player_1_Avg', 'player_2_Avg',
                 'bet_diff_B365', 'bet_diff_Avg', 'log_B365_ratio', 'log_Avg_ratio', 'bet_rank_interaction_B365',
                 'bet_rank_interaction_Avg', 'player_1_years_since_pro', 'player_2_years_since_pro']
interaction_features = [col for col in df_ml.columns if 'custom_rank_' in col and 'interaction' in col]
features = list(dict.fromkeys(base_features + interaction_features))

# Checking for duplicates
if len(features) != len(set(features)):
    duplicates = [item for item in set(features) if features.count(item) > 1]
    raise ValueError(f"Duplicates found in feature list: {duplicates}")

# Preparing data for the model
df_ml_features = df_ml[features + ['Winner_Indicator', 'B365W_prob', 'AvgW_prob']].dropna()
print(f"Rows after .dropna(): {len(df_ml_features)}")

# Splitting data
X = df_ml_features.drop(columns=['Winner_Indicator', 'B365W_prob', 'AvgW_prob'])
y = df_ml_features['Winner_Indicator']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling numeric features
scaler = StandardScaler()
numeric_cols = [col for col in features if col not in ['player_1_hand', 'player_2_hand', 'Surface', 'Series', 'Round', 'player_1_flag', 'player_2_flag', 'Court']]
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# Training CatBoost
cat_model_extended = cb.CatBoostClassifier(
    iterations=300, depth=10, learning_rate=0.05, l2_leaf_reg=5, auto_class_weights='Balanced',
    random_state=42, logging_level='Silent', early_stopping_rounds=20,
    cat_features=['player_1_hand', 'player_2_hand', 'Surface', 'Series', 'Round', 'player_1_flag', 'player_2_flag', 'Court']
)
cat_model_extended.fit(X_train, y_train)

# Predictions
y_pred = cat_model_extended.predict(X_test)
y_pred_prob = cat_model_extended.predict_proba(X_test)[:, 1]

# Calibration with Platt Scaling
cat_calibrated = CalibratedClassifierCV(cat_model_extended, method='sigmoid', cv=5)
cat_calibrated.fit(X_train, y_train)
y_pred_prob_cal = cat_calibrated.predict_proba(X_test)[:, 1]

# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_prob_cal)
rmse_b365 = np.sqrt(mean_squared_error(df_ml_features.loc[y_test.index, 'B365W_prob'], y_pred_prob_cal))
rmse_avg = np.sqrt(mean_squared_error(df_ml_features.loc[y_test.index, 'AvgW_prob'], y_pred_prob_cal))

# Output results
print("Model Evaluation (CatBoost):")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
print(f"AUC: {auc:.2f}")
print(f"RMSE (B365W_prob): {rmse_b365:.4f}")
print(f"RMSE (AvgW_prob): {rmse_avg:.4f}")
print("\nFull Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Player 2 Wins', 'Player 1 Wins']))

# Feature importance
feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': cat_model_extended.get_feature_importance()
}).sort_values(by='Importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance.head(10))

# Visualizing feature importance
fig_importance = px.bar(feature_importance.head(10), x='Importance', y='Feature', orientation='h',
                        title='Top 10 Feature Importance (CatBoost)', color='Importance', color_continuous_scale='Blues')
fig_importance.update_layout(template='plotly_dark')
fig_importance.show()

# ROC and Precision-Recall Plots
fpr, tpr, _ = roc_curve(y_test, y_pred_prob_cal)
precision, recall, _ = precision_recall_curve(y_test, y_pred_prob_cal)
cm = confusion_matrix(y_test, y_pred)

fig1 = go.Figure()
fig1.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'ROC Curve (AUC = {auc:.2f}', line=dict(color='lightblue')))
fig1.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', line=dict(dash='dash', color='white'), name='Random'))
fig1.update_layout(title='ROC Curve', xaxis_title='False Positive Rate', yaxis_title='True Positive Rate', template='plotly_dark')
fig1.show()

fig2 = go.Figure()
fig2.add_trace(go.Scatter(x=recall, y=precision, mode='lines', name='PR Curve', line=dict(color='lightblue')))
fig2.update_layout(title='Precision-Recall Curve', xaxis_title='Recall', yaxis_title='Precision', template='plotly_dark')
fig2.show()

# Confusion Matrix
fig3 = px.imshow(cm, text_auto=True, aspect="auto", labels=dict(x="Predicted", y="True", color="Count"),
                 x=['Player 2 Wins', 'Player 1 Wins'], y=['Player 2 Wins', 'Player 1 Wins'], color_continuous_scale='Blues')
fig3.update_layout(title='Confusion Matrix', template='plotly_dark')
fig3.show()

# Visualization of probability comparison
fig4 = go.Figure()
fig4.add_trace(go.Scatter(x=df_ml_features.loc[y_test.index, 'B365W_prob'], y=y_pred_prob_cal, mode='markers', name='Model vs B365', marker=dict(color='lightblue')))
fig4.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', line=dict(dash='dash', color='white'), name='Ideal'))
fig4.update_layout(title='Model Probabilities vs B365 Probabilities', xaxis_title='B365 Probabilities', yaxis_title='Model Probabilities', template='plotly_dark')
fig4.show()

fig5 = go.Figure()
fig5.add_trace(go.Scatter(x=df_ml_features.loc[y_test.index, 'AvgW_prob'], y=y_pred_prob_cal, mode='markers', name='Model vs Avg', marker=dict(color='lightblue')))
fig5.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', line=dict(dash='dash', color='white'), name='Ideal'))
fig5.update_layout(title='Model Probabilities vs Avg Probabilities', xaxis_title='Avg Probabilities', yaxis_title='Model Probabilities', template='plotly_dark')
fig5.show()

# Saving results
results = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'AUC': auc,
    'RMSE_B365': rmse_b365,
    'RMSE_Avg': rmse_avg
}
results_df = pd.DataFrame([results])

# Output execution time
end_time = time.time()
print(f"Total execution time: {(end_time - start_time)/60:.2f} minutes")

Rows after .dropna(): 35904
Model Evaluation (CatBoost):
Accuracy: 0.75
Precision: 0.75
Recall: 0.75
F1-Score: 0.75
AUC: 0.85
RMSE (B365W_prob): 0.1661
RMSE (AvgW_prob): 0.1425

Full Classification Report:
               precision    recall  f1-score   support

Player 2 Wins       0.75      0.75      0.75      3600
Player 1 Wins       0.75      0.75      0.75      3581

     accuracy                           0.75      7181
    macro avg       0.75      0.75      0.75      7181
 weighted avg       0.75      0.75      0.75      7181


Feature Importance:
           Feature  Importance
20   log_Avg_ratio   13.858194
18    bet_diff_Avg    8.511314
15    player_1_Avg    8.051255
16    player_2_Avg    7.318696
8    player_2_flag    6.985196
7    player_1_flag    6.602066
5           Series    5.591678
6            Round    4.767976
13   player_1_B365    4.702325
19  log_B365_ratio    3.820075


Total execution time: 4.99 minutes


### Comparison

Cell 1: Comparison of Model Accuracy

In [33]:
# Data for Accuracy
data = {
    'Model': ['Lasso', 'XGBoost', 'CatBoost', 'CatBoost Extended'],
    'Accuracy': [0.65, 0.66, 0.66, 0.75]
}

df = pd.DataFrame(data)

# Plotting the chart
fig = px.bar(df, x='Model', y='Accuracy',
             title='Comparison of Model Accuracy',
             color='Accuracy', color_continuous_scale='Blues',
             text='Accuracy')
fig.update_traces(texttemplate='%{text:.2f}', textposition='auto')
fig.update_layout(template='plotly_dark', xaxis_title='Model', yaxis_title='Accuracy')
fig.show()

Cell 2: Comparison of Model AUC

In [34]:
# Data for AUC
data = {
    'Model': ['Lasso', 'XGBoost', 'CatBoost', 'CatBoost Extended'],
    'AUC': [0.69, 0.72, 0.72, 0.85]
}

df = pd.DataFrame(data)

# Plotting the chart
fig = px.bar(df, x='Model', y='AUC',
             title='Comparison of Model AUC',
             color='AUC', color_continuous_scale='Blues',
             text='AUC')
fig.update_traces(texttemplate='%{text:.2f}', textposition='auto')
fig.update_layout(template='plotly_dark', xaxis_title='Model', yaxis_title='AUC')
fig.show()

Cell 3: Comparison of Model F1-Score

In [35]:
# Data for F1-Score
data = {
    'Model': ['Lasso', 'XGBoost', 'CatBoost', 'CatBoost Extended'],
    'F1-Score': [0.66, 0.65, 0.66, 0.75]
}

df = pd.DataFrame(data)

# Plotting the chart
fig = px.bar(df, x='Model', y='F1-Score',
             title='Comparison of Model F1-Score',
             color='F1-Score', color_continuous_scale='Blues',
             text='F1-Score')
fig.update_traces(texttemplate='%{text:.2f}', textposition='auto')
fig.update_layout(template='plotly_dark', xaxis_title='Model', yaxis_title='F1-Score')
fig.show()

### Saving models to separate files

In [36]:
import joblib
from catboost import CatBoostClassifier

# Сохранение Lasso модели
joblib.dump(lasso_model, "lasso_model.pkl")

# Сохранение XGBoost модели
joblib.dump(xgb_model, "xgboost_model.pkl")

# Сохранение CatBoost модели (первая версия)
cat_model.best_estimator_.save_model("catboost_model.cbm")

# Сохранение CatBoost модели (расширенная версия)
cat_model_extended.save_model("catboost_extended_model.cbm")

# Two main hypothesis

Data preparation

In [40]:
# Data Preparation
df_ml = df.reset_index(drop=True)
np.random.seed(42)
mask = np.random.random(len(df_ml)) > 0.5
df_ml['player_1_height'] = np.where(mask, df_ml['pl1_height'], df_ml['pl2_height'])
df_ml['player_1_weight'] = np.where(mask, df_ml['pl1_weight'], df_ml['pl2_weight'])
df_ml['player_1_hand'] = np.where(mask, df_ml['pl1_hand'], df_ml['pl2_hand'])
df_ml['player_1_flag'] = np.where(mask, df_ml['pl1_flag'], df_ml['pl2_flag'])
df_ml['player_1_year_pro'] = np.where(mask, df_ml['pl1_year_pro'], df_ml['pl2_year_pro'])
df_ml['player_1_rank'] = np.where(mask, df_ml['WRank'], df_ml['LRank'])
df_ml['player_1_B365'] = np.where(mask, df_ml['B365W'], df_ml['B365L'])
df_ml['player_1_Avg'] = np.where(mask, df_ml['AvgW'], df_ml['AvgL'])
df_ml['player_2_height'] = np.where(~mask, df_ml['pl1_height'], df_ml['pl2_height'])
df_ml['player_2_weight'] = np.where(~mask, df_ml['pl1_weight'], df_ml['pl2_weight'])
df_ml['player_2_hand'] = np.where(~mask, df_ml['pl1_hand'], df_ml['pl2_hand'])
df_ml['player_2_flag'] = np.where(~mask, df_ml['pl1_flag'], df_ml['pl2_flag'])
df_ml['player_2_year_pro'] = np.where(~mask, df_ml['pl1_year_pro'], df_ml['pl2_year_pro'])
df_ml['player_2_rank'] = np.where(~mask, df_ml['WRank'], df_ml['LRank'])
df_ml['player_2_B365'] = np.where(~mask, df_ml['B365W'], df_ml['B365L'])
df_ml['player_2_Avg'] = np.where(~mask, df_ml['AvgW'], df_ml['AvgL'])
df_ml['Winner_Indicator'] = np.where(mask, 1, 0)

# Handling Missing Values
df_ml = df_ml.dropna(subset=['Wsets', 'Lsets', 'AvgW', 'AvgL'])
df_ml['Match_Duration'] = df_ml['Wsets'] + df_ml['Lsets']
df_ml['Year'] = df_ml['Year'].fillna(df_ml['Year'].median())
df_ml['best_of'] = df_ml['Best of'].fillna(df_ml['Best of'].median())

# Feature Engineering
df_ml['height_diff'] = df_ml['player_1_height'] - df_ml['player_2_height']
df_ml['weight_diff'] = df_ml['player_1_weight'] - df_ml['player_2_weight']
df_ml['rank_diff'] = df_ml['player_1_rank'] - df_ml['player_2_rank']
df_ml['year_pro_diff'] = df_ml['player_1_year_pro'] - df_ml['player_2_year_pro']
df_ml['height_weight_interaction'] = df_ml['height_diff'] * df_ml['weight_diff']
df_ml['custom_log_rank_diff'] = np.log1p(df_ml['rank_diff'].abs()) * np.sign(df_ml['rank_diff'])
df_ml['custom_rank_year_pro_interaction'] = df_ml['rank_diff'] * df_ml['year_pro_diff']
df_ml['bet_diff_B365'] = df_ml['player_1_B365'] - df_ml['player_2_B365']
df_ml['bet_diff_Avg'] = df_ml['player_1_Avg'] - df_ml['player_2_Avg']
df_ml['log_B365_ratio'] = np.log1p(df_ml['player_1_B365'] / df_ml['player_2_B365'])
df_ml['log_Avg_ratio'] = np.log1p(df_ml['player_1_Avg'] / df_ml['player_2_Avg'])
surface_dummies = pd.get_dummies(df_ml['Surface'].astype(str).fillna('Unknown'), prefix='surface')
court_dummies = pd.get_dummies(df_ml['Court'].astype(str).fillna('Unknown'), prefix='court')
df_ml = pd.concat([df_ml, surface_dummies, court_dummies], axis=1)
for col in surface_dummies.columns:
    df_ml[f'rank_{col}_interaction'] = df_ml['rank_diff'] * df_ml[col]
for col in court_dummies.columns:
    df_ml[f'rank_{col}_interaction'] = df_ml['rank_diff'] * df_ml[col]

# Encoding
le_surface = LabelEncoder()
le_round = LabelEncoder()
df_ml['Surface'] = le_surface.fit_transform(df_ml['Surface'].astype(str).fillna('Unknown'))
df_ml['Round'] = le_round.fit_transform(df_ml['Round'].astype(str).fillna('Unknown'))
print("Surface Encoding:", dict(zip(le_surface.classes_, le_surface.transform(le_surface.classes_))))
print("Round Encoding:", dict(zip(le_round.classes_, le_round.transform(le_round.classes_))))
for col in ['player_1_hand', 'player_2_hand', 'Series', 'player_1_flag', 'player_2_flag', 'Court']:
    df_ml[col] = LabelEncoder().fit_transform(df_ml[col].astype(str).fillna('Unknown'))

# Data Validation
print("Missing Values in Wsets/Lsets:", df_ml[['Wsets', 'Lsets']].isna().sum())
print("Missing Values in AvgW/AvgL:", df_ml[['AvgW', 'AvgL']].isna().sum())
print("Unique Surface Values:", df_ml['Surface'].unique())
print("Unique Round Values:", df_ml['Round'].unique())
print("Unique best_of Values:", df_ml['best_of'].unique())

# Functions
def perform_ks_test_with_bootstrap(data1, data2, hypothesis_name, n_bootstrap=1000, alpha=0.05):
    print(f"\n{hypothesis_name}")
    print(f"Sample Size 1: {len(data1)}, Sample Size 2: {len(data2)}")
    if len(data1) == 0 or len(data2) == 0:
        print("Error: One of the samples is empty.")
        return None, None, None, None
    stat, p_value = ks_2samp(data1, data2)
    result = "H0 confirmed" if p_value >= alpha else "H0 rejected"
    print(f"KS Statistic: {stat:.4f}, p-value: {p_value:.4f}, Result: {result}")
    n1, n2 = len(data1), len(data2)
    c_alpha = 1.36
    d_crit = c_alpha * np.sqrt((n1 + n2) / (n1 * n2))
    print(f"Critical Value D: {d_crit:.4f}")
    bootstrap_stats = []
    for _ in range(n_bootstrap):
        sample1 = np.random.choice(data1, size=n1, replace=True)
        sample2 = np.random.choice(data2, size=n2, replace=True)
        bs_stat, _ = ks_2samp(sample1, sample2)
        bootstrap_stats.append(bs_stat)
    ci_lower = np.percentile(bootstrap_stats, 2.5)
    ci_upper = np.percentile(bootstrap_stats, 97.5)
    print(f"95% Confidence Interval for KS: [{ci_lower:.4f}, {ci_upper:.4f}]")
    return stat, p_value, (ci_lower, ci_upper), bootstrap_stats

def plot_distributions_plotly(data1, data2, title, label1, label2, var_name):
    sorted_data1 = np.sort(data1)
    sorted_data2 = np.sort(data2)
    cdf1 = np.arange(1, len(sorted_data1) + 1) / len(sorted_data1)
    cdf2 = np.arange(1, len(sorted_data2) + 1) / len(sorted_data2)
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=sorted_data1, y=cdf1, mode='lines', name=label1, line=dict(color='rgb(0, 191, 255)')))
    fig.add_trace(go.Scatter(x=sorted_data2, y=cdf2, mode='lines', name=label2, line=dict(color='rgb(148, 0, 211)')))
    fig.update_layout(title=f"CDF for {var_name} ({title})", xaxis_title=var_name, yaxis_title="CDF", template="plotly_white")
    if var_name.startswith("Avg"):
        fig.update_xaxes(type="log")
    fig.show()
    fig = go.Figure()
    fig.add_trace(go.Histogram(x=data1, name=label1, histnorm='probability density', nbinsx=30, opacity=0.6, marker_color='rgb(0, 191, 255)'))
    fig.add_trace(go.Histogram(x=data2, name=label2, histnorm='probability density', nbinsx=30, opacity=0.4, marker_color='rgb(148, 0, 211)'))
    fig.update_layout(title=f"Histogram for {var_name} ({title})", xaxis_title=var_name, yaxis_title="Density", barmode='overlay', template="plotly_white")
    if var_name.startswith("Avg"):
        fig.update_xaxes(type="log")
    fig.show()
    box_data = df_ml[df_ml['Surface'].isin([1, 3])] if var_name == "Match_Duration" else df_ml[df_ml['Round'].isin([0, 1, 6, 7])]
    x_col = 'Surface' if var_name == "Match_Duration" else 'Round'
    fig = px.box(box_data, x=x_col, y=var_name, title=f"{var_name} by {x_col}", color_discrete_sequence=px.colors.sequential.Blues[2:])
    fig.update_layout(template="plotly_white")
    fig.show()

def plot_bootstrap_ci_plotly(bootstrap_stats, stat, ci, hypothesis_name):
    hist, bins = np.histogram(bootstrap_stats, bins=30)
    fig = go.Figure()
    fig.add_trace(go.Histogram(x=bootstrap_stats, nbinsx=30, name='Bootstrap KS', marker_color='rgb(100, 200, 255)', opacity=0.6))
    fig.add_trace(go.Scatter(x=[stat, stat], y=[0, max(hist)*1.1], mode='lines', name=f'KS Statistic = {stat:.4f}', line=dict(color='red', dash='dash')))
    fig.add_trace(go.Scatter(x=[ci[0], ci[0]], y=[0, max(hist)*1.1], mode='lines', name=f'CI 2.5% = {ci[0]:.4f}', line=dict(color='green')))
    fig.add_trace(go.Scatter(x=[ci[1], ci[1]], y=[0, max(hist)*1.1], mode='lines', name=f'CI 97.5% = {ci[1]:.4f}', line=dict(color='green')))
    fig.update_layout(title=f"Bootstrap Distribution ({hypothesis_name})", xaxis_title="KS Statistic", yaxis_title="Frequency", template="plotly_white")
    fig.show()

def analyze_model_features(df, features, target):
    available_features = [f for f in features if f in df.columns]
    if not available_features:
        print(f"Error: {features} not found.")
        return None
    X = df[available_features].dropna()
    y = df.loc[X.index, target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    numeric_cols = [col for col in available_features if df[col].dtype in ['int64', 'float64']]
    if numeric_cols:
        X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
        X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])
    cat_features = ['player_1_hand', 'player_2_hand', 'Surface', 'Series', 'Round', 'player_1_flag', 'player_2_flag', 'Court']
    cat_features = [f for f in cat_features if f in available_features]
    model = CatBoostClassifier(iterations=300, depth=6, learning_rate=0.1, l2_leaf_reg=3, auto_class_weights='Balanced',
                               random_state=42, logging_level='Silent', early_stopping_rounds=20, cat_features=cat_features)
    model.fit(X_train, y_train)
    importance = pd.DataFrame({'Feature': X.columns, 'Importance': model.get_feature_importance()})
    importance = importance.sort_values(by='Importance', ascending=False)
    print(f"\nFeature Importance (CatBoost) for {target}:")
    print(importance.head(5))
    fig = px.bar(importance.head(5), x='Importance', y='Feature', title=f"Feature Importance (CatBoost) for {target}", color='Importance', color_continuous_scale='Blues')
    fig.update_layout(template="plotly_white")
    fig.show()
    return importance

Surface Encoding: {'Carpet': 0, 'Clay': 1, 'Grass': 2, 'Hard': 3}
Round Encoding: {'1st Round': 0, '2nd Round': 1, '3rd Round': 2, '4th Round': 3, 'Quarterfinals': 4, 'Round Robin': 5, 'Semifinals': 6, 'The Final': 7}
Missing Values in Wsets/Lsets: Wsets    0
Lsets    0
dtype: int64
Missing Values in AvgW/AvgL: AvgW    0
AvgL    0
dtype: int64
Unique Surface Values: [3 1 2 0]
Unique Round Values: [0 1 4 6 7 2 3 5]
Unique best_of Values: [3 5]


The distributions of Match_Duration (in sets, Wsets + Lsets) on Hard and Clay are the same for matches with the same format (best_of=3 and best_of=5 separately).

In [41]:
# Cell 2: Hypothesis 1 (Match_Duration on Hard vs Clay, best_of=3 and best_of=5)
# Hypothesis 1: best_of=3
hyp1_data_hard_bo3 = df_ml[(df_ml['Surface'] == 3) & (df_ml['best_of'] == 3)]['Match_Duration'].dropna()
hyp1_data_clay_bo3 = df_ml[(df_ml['Surface'] == 1) & (df_ml['best_of'] == 3)]['Match_Duration'].dropna()
stat1_bo3, p1_bo3, ci1_bo3, bootstrap_stats_hyp1_bo3 = perform_ks_test_with_bootstrap(
    hyp1_data_hard_bo3, hyp1_data_clay_bo3, "Hypothesis 1: Match_Duration on Hard vs Clay (best_of=3)"
)
if stat1_bo3 is not None:
    plot_distributions_plotly(hyp1_data_hard_bo3, hyp1_data_clay_bo3, "Hard vs Clay (best_of=3)", "Hard", "Clay", "Match_Duration")
    plot_bootstrap_ci_plotly(bootstrap_stats_hyp1_bo3, stat1_bo3, ci1_bo3, "Hypothesis 1 (best_of=3)")
features_hyp1 = ['Surface', 'Year', 'Match_Duration', 'player_1_rank', 'best_of', 'rank_surface_Hard_interaction', 'rank_surface_Clay_interaction', 'Round']
importance_hyp1_bo3 = analyze_model_features(df_ml[df_ml['best_of'] == 3], features_hyp1, 'Winner_Indicator')
print("Hard (best_of=3):", len(hyp1_data_hard_bo3))
print("Clay (best_of=3):", len(hyp1_data_clay_bo3))

# Hypothesis 1: best_of=5
hyp1_data_hard_bo5 = df_ml[(df_ml['Surface'] == 3) & (df_ml['best_of'] == 5)]['Match_Duration'].dropna()
hyp1_data_clay_bo5 = df_ml[(df_ml['Surface'] == 1) & (df_ml['best_of'] == 5)]['Match_Duration'].dropna()
stat1_bo5, p1_bo5, ci1_bo5, bootstrap_stats_hyp1_bo5 = perform_ks_test_with_bootstrap(
    hyp1_data_hard_bo5, hyp1_data_clay_bo5, "Hypothesis 1: Match_Duration on Hard vs Clay (best_of=5)"
)
if stat1_bo5 is not None:
    plot_distributions_plotly(hyp1_data_hard_bo5, hyp1_data_clay_bo5, "Hard vs Clay (best_of=5)", "Hard", "Clay", "Match_Duration")
    plot_bootstrap_ci_plotly(bootstrap_stats_hyp1_bo5, stat1_bo5, ci1_bo5, "Hypothesis 1 (best_of=5)")
importance_hyp1_bo5 = analyze_model_features(df_ml[df_ml['best_of'] == 5], features_hyp1, 'Winner_Indicator')
print("Hard (best_of=5):", len(hyp1_data_hard_bo5))
print("Clay (best_of=5):", len(hyp1_data_clay_bo5))

# Comparison of best_of=3 and best_of=5
print("\nComparison of results for best_of=3 and best_of=5:")
print(f"best_of=3: Hard={len(hyp1_data_hard_bo3)}, Clay={len(hyp1_data_clay_bo3)}, KS Statistic={stat1_bo3:.4f}, p-value={p1_bo3:.4f}, Result={'H0 confirmed' if p1_bo3 >= 0.05 else 'H0 rejected'}")
print(f"best_of=5: Hard={len(hyp1_data_hard_bo5)}, Clay={len(hyp1_data_clay_bo5)}, KS Statistic={stat1_bo5:.4f}, p-value={p1_bo5:.4f}, Result={'H0 confirmed' if p1_bo5 >= 0.05 else 'H0 rejected'}")
if p1_bo3 >= 0.05 and p1_bo5 >= 0.05:
    print("Conclusion: Match_Duration distributions are similar for both formats (best_of=3 and best_of=5), tournament format does not affect differences between Hard and Clay.")
elif p1_bo3 < 0.05 and p1_bo5 < 0.05:
    print("Conclusion: Match_Duration distributions differ for both formats, which may be related to the influence of surface or tournaments.")
elif p1_bo3 >= 0.05:
    print("Conclusion: Distributions are similar for best_of=3 but differ for best_of=5, which may indicate the influence of format or tournaments (e.g., Grand Slams on Hard).")
else:
    print("Conclusion: Distributions are similar for best_of=5 but differ for best_of=3, which may be related to tournament types (e.g., ATP on Clay).")


Hypothesis 1: Match_Duration on Hard vs Clay (best_of=3)
Sample Size 1: 16948, Sample Size 2: 9345
KS Statistic: 0.0084, p-value: 0.7878, Result: H0 confirmed
Critical Value D: 0.0175
95% Confidence Interval for KS: [0.0015, 0.0206]



Feature Importance (CatBoost) for Winner_Indicator:
                         Feature  Importance
5  rank_surface_Hard_interaction   28.391985
3                  player_1_rank   25.944213
6  rank_surface_Clay_interaction   16.790920
1                           Year    9.232814
7                          Round    8.141534


Hard (best_of=3): 16948
Clay (best_of=3): 9345

Hypothesis 1: Match_Duration on Hard vs Clay (best_of=5)
Sample Size 1: 3626, Sample Size 2: 1748
KS Statistic: 0.0320, p-value: 0.1743, Result: H0 confirmed
Critical Value D: 0.0396
95% Confidence Interval for KS: [0.0102, 0.0601]



Feature Importance (CatBoost) for Winner_Indicator:
                         Feature  Importance
3                  player_1_rank   29.179678
5  rank_surface_Hard_interaction   21.509332
6  rank_surface_Clay_interaction   13.593399
2                 Match_Duration   11.498710
7                          Round   10.629132


Hard (best_of=5): 3626
Clay (best_of=5): 1748

Comparison of results for best_of=3 and best_of=5:
best_of=3: Hard=16948, Clay=9345, KS Statistic=0.0084, p-value=0.7878, Result=H0 confirmed
best_of=5: Hard=3626, Clay=1748, KS Statistic=0.0320, p-value=0.1743, Result=H0 confirmed
Conclusion: Match_Duration distributions are similar for both formats (best_of=3 and best_of=5), tournament format does not affect differences between Hard and Clay.


The AvgW coefficients more accurately predict wins in early rounds than in finals.

In [42]:
# Cell 3: Hypothesis 2 (AvgW in early rounds vs final rounds)
hyp2_data_early = df_ml[df_ml['Round'].isin([0, 1])]['AvgW'].dropna()
hyp2_data_final = df_ml[df_ml['Round'].isin([6, 7])]['AvgW'].dropna()
stat2, p2, ci2, bootstrap_stats_hyp2 = perform_ks_test_with_bootstrap(
    hyp2_data_early, hyp2_data_final, "Hypothesis 2: AvgW in early rounds vs final rounds"
)
if stat2 is not None:
    plot_distributions_plotly(hyp2_data_early, hyp2_data_final, "Early rounds vs Final rounds", "Early rounds", "Final rounds", "AvgW")
    plot_bootstrap_ci_plotly(bootstrap_stats_hyp2, stat2, ci2, "Hypothesis 2")
features_hyp2 = ['Round', 'player_1_Avg', 'player_2_Avg', 'log_Avg_ratio', 'bet_diff_Avg', 'player_1_rank']
importance_hyp2 = analyze_model_features(df_ml, features_hyp2, 'Winner_Indicator')
print("Early rounds:", len(hyp2_data_early))
print("Final rounds:", len(hyp2_data_final))


Hypothesis 2: AvgW in early rounds vs final rounds
Sample Size 1: 26393, Sample Size 2: 2686
KS Statistic: 0.0332, p-value: 0.0089, Result: H0 rejected
Critical Value D: 0.0275
95% Confidence Interval for KS: [0.0279, 0.0498]



Feature Importance (CatBoost) for Winner_Indicator:
         Feature  Importance
4   bet_diff_Avg   34.119567
3  log_Avg_ratio   24.109901
1   player_1_Avg   19.366796
2   player_2_Avg   13.417215
5  player_1_rank    5.510739


Early rounds: 26393
Final rounds: 2686
