In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers, callbacks, mixed_precision
from sklearn.preprocessing import StandardScaler


In [2]:
# Load the modelling table from a Parquet file; the path is relative to the
# notebook's working directory.
df_ml = pd.read_parquet("df_ml.parquet")

In [3]:
# Show the loaded frame as the cell output to inspect its columns and values.
df_ml

Unnamed: 0,player_A_name,player_A_ht,player_A_age,player_B_name,player_B_ht,player_B_age,outcome,tourney_datetime,player_A_avg_sets_A,player_A_avg_sets_B,...,tourney_name_reduzido_Rotterdam,tourney_name_reduzido_US Open,tourney_name_reduzido_Vienna,tourney_name_reduzido_Washington,tourney_name_reduzido_Wimbledon,tourney_level_reduzido_G,tourney_level_reduzido_M,tourney_level_reduzido_Outros,player_A_hand_reduzido_R,player_B_hand_reduzido_R
0,Kelvin Belcher,0.875000,0.294915,John Fitzgerald,0.865385,0.320132,0,1985-01-07 00:00:00,,,...,False,False,False,False,False,False,False,False,True,True
1,Mark Wooldridge,0.889423,0.271186,Karl Meiler,0.841346,0.702970,0,1985-01-07 00:00:01,,,...,False,False,False,False,False,False,False,False,True,True
2,Howard Sands,0.817308,0.284746,Jonathan Canter,0.875000,0.171617,1,1985-01-07 00:00:02,,,...,False,False,False,False,False,False,False,False,True,True
3,Russell Barlow,0.817308,0.254237,Brad Drewett,0.875000,0.399340,0,1985-01-07 00:00:03,,,...,False,False,False,False,False,False,False,False,False,False
4,David Lewis,0.850962,0.196610,Leif Shiras,0.865385,0.363036,0,1985-01-07 00:00:04,,,...,False,False,False,False,False,False,False,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129554,Alex Michelsen,0.913462,0.196610,Luca Van Assche,0.841346,0.204620,1,2024-12-18 00:00:10,2.155844,0.467532,...,False,False,False,False,False,False,False,True,True,True
129555,Alex Michelsen,0.913462,0.196610,Nishesh Basavareddy,0.850962,0.174917,1,2024-12-18 00:00:11,2.166667,0.474359,...,False,False,False,False,False,False,False,True,True,True
129556,Luca Van Assche,0.841346,0.203390,Juncheng Shang,0.850962,0.181518,1,2024-12-18 00:00:12,2.254237,0.559322,...,False,False,False,False,False,False,False,True,True,False
129557,Juncheng Shang,0.850962,0.179661,Nishesh Basavareddy,0.850962,0.174917,0,2024-12-18 00:00:13,2.150000,0.550000,...,False,False,False,False,False,False,False,True,False,True


In [None]:
# --- 1) Temporal split (70/15/15) ---
# Order the rows chronologically, renumber the index, and replace every
# missing value in the frame with 0.
df = (
    df_ml
    .sort_values('tourney_datetime')
    .reset_index(drop=True)
    .fillna(0)
)

# Cut points are the 70% and 85% quantiles of the timestamp column, so each
# later partition only contains rows newer than the partition before it.
match_time = df['tourney_datetime']
train_end = match_time.quantile(0.70)
val_end = match_time.quantile(0.85)

train = df[match_time.le(train_end)]
val = df[match_time.gt(train_end) & match_time.le(val_end)]
test = df[match_time.gt(val_end)]

# Columns removed from the feature matrices: the label, both player names and
# the timestamp used for splitting.
drop_cols = ['outcome','player_A_name','player_B_name','tourney_datetime']

# Features are everything except `drop_cols`; the target is the `outcome` column.
X_train, X_val, X_test = (part.drop(columns=drop_cols) for part in (train, val, test))
y_train, y_val, y_test = (part['outcome'] for part in (train, val, test))


# Convert features and labels to float32 NumPy arrays for Keras.
# Missing values were already replaced with 0 on the full frame before it was
# split, so no further imputation is needed here.
X_train_np = X_train.to_numpy(dtype='float32')
X_val_np   = X_val.to_numpy(dtype='float32')
X_test_np  = X_test.to_numpy(dtype='float32')
y_train_np = y_train.to_numpy(dtype='float32')
y_val_np   = y_val.to_numpy(dtype='float32')
y_test_np  = y_test.to_numpy(dtype='float32')

In [5]:
# Enable Keras' 'mixed_float16' policy globally for the layers created below.
mixed_precision.set_global_policy('mixed_float16')

# Estimate the scaling statistics on the training features only, then apply
# the same fitted transform to the training and validation matrices.
scaler = StandardScaler().fit(X_train_np)
X_train_scaled = scaler.transform(X_train_np)
X_val_scaled = scaler.transform(X_val_np)

# Seed the Python, NumPy and backend random generators before any layer is
# created so weight initialisation, dropout and batch shuffling are repeatable
# between runs (as far as the backend allows).
SEED = 42
keras.utils.set_random_seed(SEED)

# Binary classifier: two Dense -> BatchNorm -> ReLU -> Dropout stages followed
# by a single sigmoid unit.
model = keras.Sequential([
    layers.Input(shape=(X_train_scaled.shape[1],)),

    # Dense layers are linear (activation=None) because the ReLU is applied
    # after batch normalisation; kernels carry an L2 penalty of 1e-4.
    layers.Dense(128, activation=None, kernel_regularizer=regularizers.l2(1e-4)),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dropout(0.3),

    layers.Dense(64, activation=None, kernel_regularizer=regularizers.l2(1e-4)),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dropout(0.2),

    # The output layer is pinned to float32 even though the global policy is
    # 'mixed_float16'.
    layers.Dense(1, activation='sigmoid', dtype='float32'),
])


# Every callback below watches 'val_auc' and treats larger values as better
# (mode='max'); the AUC metric is registered under the name 'auc' at compile time.

# Stop after 5 epochs without improvement and roll back to the best weights.
es = callbacks.EarlyStopping(
    monitor='val_auc',
    mode='max',
    patience=5,
    restore_best_weights=True,
)

# Write the model to 'best_model.keras' only when the monitored value improves.
mc = callbacks.ModelCheckpoint(
    filepath='best_model.keras',
    monitor='val_auc',
    mode='max',
    save_best_only=True,
)

# Halve the learning rate after 3 epochs without improvement, down to 1e-6.
# NOTE(review): `rl` is not in the callbacks list passed to `model.fit` in this
# cell. The optimizer here is built from a learning-rate schedule, which Keras
# does not allow callbacks to overwrite -- confirm before enabling it.
rl = callbacks.ReduceLROnPlateau(
    monitor='val_auc',
    mode='max',
    factor=0.5,
    patience=3,
    min_lr=1e-6,
)

# Training budget. Kept as named constants because the learning-rate schedule
# below has to agree with the values passed to `model.fit`.
EPOCHS = 50
BATCH_SIZE = 256

# Number of optimizer steps per epoch when fitting on in-memory arrays:
# one step per batch, with a final partial batch if the sizes do not divide.
steps_per_epoch = int(np.ceil(len(X_train_scaled) / BATCH_SIZE))

# Spread the cosine decay over the whole training budget so the learning rate
# only reaches its final value at the last scheduled step, not partway through
# training.
lr_schedule = tf.keras.optimizers.schedules.CosineDecay(
    initial_learning_rate=1e-3,
    decay_steps=steps_per_epoch * EPOCHS,
)

opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

# Compile once. The AUC metric is named 'auc' so its validation counterpart is
# 'val_auc', which is the quantity the callbacks monitor.
model.compile(
    optimizer=opt,
    loss='binary_crossentropy',
    metrics=['accuracy', keras.metrics.AUC(name='auc')]
)

# Train with early stopping and best-model checkpointing; one log line per epoch.
history = model.fit(
    X_train_scaled, y_train_np,
    validation_data=(X_val_scaled, y_val_np),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[es, mc],
    verbose=2
)

Epoch 1/50
355/355 - 1s - 3ms/step - accuracy: 0.6453 - auc: 0.7011 - loss: 0.6498 - val_accuracy: 0.6901 - val_auc: 0.7607 - val_loss: 0.5960
Epoch 2/50
355/355 - 0s - 1ms/step - accuracy: 0.6616 - auc: 0.7222 - loss: 0.6282 - val_accuracy: 0.6935 - val_auc: 0.7627 - val_loss: 0.5928
Epoch 3/50
355/355 - 0s - 1ms/step - accuracy: 0.6647 - auc: 0.7273 - loss: 0.6225 - val_accuracy: 0.6925 - val_auc: 0.7623 - val_loss: 0.5913
Epoch 4/50
355/355 - 0s - 1ms/step - accuracy: 0.6670 - auc: 0.7291 - loss: 0.6197 - val_accuracy: 0.6928 - val_auc: 0.7623 - val_loss: 0.5898
Epoch 5/50
355/355 - 0s - 1ms/step - accuracy: 0.6681 - auc: 0.7309 - loss: 0.6168 - val_accuracy: 0.6922 - val_auc: 0.7618 - val_loss: 0.5903
Epoch 6/50
355/355 - 0s - 1ms/step - accuracy: 0.6684 - auc: 0.7314 - loss: 0.6155 - val_accuracy: 0.6897 - val_auc: 0.7623 - val_loss: 0.5882
Epoch 7/50
355/355 - 0s - 1ms/step - accuracy: 0.6690 - auc: 0.7320 - loss: 0.6138 - val_accuracy: 0.6902 - val_auc: 0.7621 - val_loss: 0.5873