In [16]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score


In [17]:
# Load the modelling frame from the Parquet file `df_ml.parquet`.
# The path is relative, so it is resolved against the kernel's working directory.
df_ml = pd.read_parquet("df_ml.parquet")

In [18]:
# Preview the loaded frame; pandas truncates the rendering to the first and
# last rows/columns, so this does not print the whole table.
df_ml

Unnamed: 0,player_A_name,player_A_ht,player_A_age,player_B_name,player_B_ht,player_B_age,outcome,tourney_datetime,player_A_avg_sets_A,player_A_avg_sets_B,...,tourney_name_reduzido_Rotterdam,tourney_name_reduzido_US Open,tourney_name_reduzido_Vienna,tourney_name_reduzido_Washington,tourney_name_reduzido_Wimbledon,tourney_level_reduzido_G,tourney_level_reduzido_M,tourney_level_reduzido_Outros,player_A_hand_reduzido_R,player_B_hand_reduzido_R
0,Kelvin Belcher,0.875000,0.294915,John Fitzgerald,0.865385,0.320132,0,1985-01-07 00:00:00,,,...,False,False,False,False,False,False,False,False,True,True
1,Mark Wooldridge,0.889423,0.271186,Karl Meiler,0.841346,0.702970,0,1985-01-07 00:00:01,,,...,False,False,False,False,False,False,False,False,True,True
2,Howard Sands,0.817308,0.284746,Jonathan Canter,0.875000,0.171617,1,1985-01-07 00:00:02,,,...,False,False,False,False,False,False,False,False,True,True
3,Russell Barlow,0.817308,0.254237,Brad Drewett,0.875000,0.399340,0,1985-01-07 00:00:03,,,...,False,False,False,False,False,False,False,False,False,False
4,David Lewis,0.850962,0.196610,Leif Shiras,0.865385,0.363036,0,1985-01-07 00:00:04,,,...,False,False,False,False,False,False,False,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129554,Alex Michelsen,0.913462,0.196610,Luca Van Assche,0.841346,0.204620,1,2024-12-18 00:00:10,2.155844,0.467532,...,False,False,False,False,False,False,False,True,True,True
129555,Alex Michelsen,0.913462,0.196610,Nishesh Basavareddy,0.850962,0.174917,1,2024-12-18 00:00:11,2.166667,0.474359,...,False,False,False,False,False,False,False,True,True,True
129556,Luca Van Assche,0.841346,0.203390,Juncheng Shang,0.850962,0.181518,1,2024-12-18 00:00:12,2.254237,0.559322,...,False,False,False,False,False,False,False,True,True,False
129557,Juncheng Shang,0.850962,0.179661,Nishesh Basavareddy,0.850962,0.174917,0,2024-12-18 00:00:13,2.150000,0.550000,...,False,False,False,False,False,False,False,True,False,True


In [None]:
# Chronological train / validation / test split followed by zero-imputation.
#
# Rows are ordered by match timestamp and cut at the 70% and 85% quantiles of
# `tourney_datetime`, so validation and test rows are never earlier than the
# training rows.
df = df_ml.sort_values('tourney_datetime').reset_index(drop=True)
# Missing values are encoded as 0 across the whole frame.
df = df.fillna(0)
train_end = df['tourney_datetime'].quantile(0.70)
val_end   = df['tourney_datetime'].quantile(0.85)

train = df[df['tourney_datetime'] <= train_end]
val   = df[(df['tourney_datetime'] > train_end) & (df['tourney_datetime'] <= val_end)]
test  = df[df['tourney_datetime'] > val_end]
# The three masks are meant to be exhaustive and mutually exclusive.
assert len(train) + len(val) + len(test) == len(df), "time split must cover every row exactly once"

# Target, free-text identifiers and the timestamp are not model features.
drop_cols = ['outcome','player_A_name','player_B_name','tourney_datetime']
X_train, y_train = train.drop(columns=drop_cols), train['outcome']
X_val,   y_val   = val.drop(columns=drop_cols),   val['outcome']
X_test,  y_test  = test.drop(columns=drop_cols),  test['outcome']

# `fill_value` is only honoured when strategy='constant'; with the default
# strategy ('mean') it is silently ignored and column means would be used
# instead of 0. Features are cast to float64 first so that the boolean dummy
# columns and the numeric columns reach the imputer as one homogeneous float
# matrix (the same dtype the mean strategy produced).
imputer = SimpleImputer(strategy='constant', fill_value=0)
X_train_imp = imputer.fit_transform(X_train.astype('float64'))
X_val_imp   = imputer.transform(X_val.astype('float64'))
X_test_imp  = imputer.transform(X_test.astype('float64'))


In [None]:
# Single-precision copies of the imputed train/validation matrices and of the
# corresponding label vectors.
X_train_np, X_val_np = (
    features.astype(np.float32) for features in (X_train_imp, X_val_imp)
)
y_train_np, y_val_np = (
    labels.values.astype(np.float32) for labels in (y_train, y_val)
)

In [None]:
# Baseline comparison: fit each candidate on the training split and score it on
# the validation split (accuracy of hard predictions, ROC AUC of the
# positive-class probability).

# Logistic regression is the only candidate that gets standardised inputs.
logreg_pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver='saga', max_iter=5000, random_state=42, n_jobs=-1),
)

models = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42),
    'LogisticRegression': logreg_pipeline,
    'HistGradientBoost': HistGradientBoostingClassifier(max_iter=100, random_state=42),
}

# Maps model name -> {'accuracy': ..., 'roc_auc': ...} on the validation split.
results = {}
for name, model in models.items():
    model.fit(X_train_imp, y_train)
    preds = model.predict(X_val_imp)
    # Column 1 of predict_proba is the probability of the positive class.
    proba = model.predict_proba(X_val_imp)[:, 1]
    results[name] = dict(
        accuracy=accuracy_score(y_val, preds),
        roc_auc=roc_auc_score(y_val, proba),
    )

print(pd.DataFrame(results).T)

In [None]:
# Rich display of the validation metrics: one row per model, one column per metric.
pd.DataFrame(results).transpose()

In [None]:
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit

# Randomised hyper-parameter search for the random forest, scored by ROC AUC.
# 25 candidates are sampled from the grid below.
param_dist = {
    'n_estimators': [200, 500, 1000],
    'max_depth': [None, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt','log2', 0.5]
}
rnd = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_dist,
    n_iter=25,
    # The training rows are in chronological order (the frame was sorted by
    # `tourney_datetime` before splitting). An integer `cv` would use stratified
    # K-fold and score candidates after training on later matches than the ones
    # being predicted; forward-chaining folds keep every validation fold
    # strictly after its training fold.
    cv=TimeSeriesSplit(n_splits=3),
    scoring='roc_auc',
    n_jobs=-1,
    # Seed the candidate sampling so the search is reproducible across runs,
    # consistent with the seed used for every estimator in this notebook.
    random_state=42,
)
rnd.fit(X_train_imp, y_train)
print(rnd.best_params_, rnd.best_score_)

In [None]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

# Re-score the tuned random forest with five forward-chaining folds on the
# training split and report the mean ROC AUC across folds.
tscv = TimeSeriesSplit(n_splits=5)
scores = cross_val_score(
    estimator=rnd.best_estimator_,
    X=X_train_imp,
    y=y_train,
    scoring='roc_auc',
    cv=tscv,
    n_jobs=-1,
)
print('TimeSeriesSplit AUC:', np.mean(scores))