In [132]:
import pandas as pd
import numpy as np

In [133]:
# Load dataset
df = pd.read_csv("../data/atp_tennis.csv")

In [134]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66613 entries, 0 to 66612
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Tournament  66613 non-null  object 
 1   Date        66613 non-null  object 
 2   Series      66613 non-null  object 
 3   Court       66613 non-null  object 
 4   Surface     66613 non-null  object 
 5   Round       66613 non-null  object 
 6   Best of     66613 non-null  int64  
 7   Player_1    66613 non-null  object 
 8   Player_2    66613 non-null  object 
 9   Winner      66613 non-null  object 
 10  Rank_1      66613 non-null  int64  
 11  Rank_2      66613 non-null  int64  
 12  Pts_1       66613 non-null  int64  
 13  Pts_2       66613 non-null  int64  
 14  Odd_1       66613 non-null  float64
 15  Odd_2       66613 non-null  float64
 16  Score       66613 non-null  object 
dtypes: float64(2), int64(5), object(10)
memory usage: 8.6+ MB


In [135]:
df['Series'].value_counts()

Series
ATP250                17636
Grand Slam            12512
International         11752
Masters 1000           9221
ATP500                 6471
Masters                5029
International Gold     3620
Masters Cup             372
Name: count, dtype: int64

In [136]:
df.tail(5)

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,Rank_1,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,Score
66608,BNP Paribas Masters,2025-10-31,Masters 1000,Indoor,Hard,Quarterfinals,3,Sinner J.,Shelton B.,Sinner J.,2,7,10500,3820,1.17,5.0,6-3 6-3
66609,BNP Paribas Masters,2025-10-31,Masters 1000,Indoor,Hard,Quarterfinals,3,Medvedev D.,Zverev A.,Zverev A.,13,3,2810,6160,1.73,2.1,6-2 3-6 6-7
66610,BNP Paribas Masters,2025-11-01,Masters 1000,Indoor,Hard,Semifinals,3,Auger-Aliassime F.,Bublik A.,Auger-Aliassime F.,10,16,3195,2520,1.91,1.91,7-6 6-4
66611,BNP Paribas Masters,2025-11-01,Masters 1000,Indoor,Hard,Semifinals,3,Zverev A.,Sinner J.,Sinner J.,3,2,6160,10500,6.0,1.13,0-6 1-6
66612,BNP Paribas Masters,2025-11-02,Masters 1000,Indoor,Hard,The Final,3,Sinner J.,Auger-Aliassime F.,Sinner J.,2,10,10500,3195,1.1,7.0,6-4 7-6


### Filtrando para os ultimos 2 anos

In [137]:
#convertendo a data para datetime
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

In [138]:
#remover linhas com informacoes essenciais faltando
df = df.dropna(subset=["Player_1", "Player_2", "Winner", "Rank_1", "Rank_2", "Odd_1", "Odd_2"])

In [139]:
df = df[df["Date"] >= pd.Timestamp.now() - pd.DateOffset(years=2)].reset_index(drop=True)

In [140]:
df['Series'].value_counts()

Series
ATP250          1847
Masters 1000    1411
Grand Slam       965
ATP500           868
Masters Cup       29
Name: count, dtype: int64

#### criando features basicas

In [141]:
#diferenca de ranking
df["Rank_diff"] = df['Rank_1'] - df['Rank_2']
df["Pts_diff"] = df['Pts_1'] - df['Pts_2']

In [142]:
#Probabilidade implicita das odds
df["Prob_1"] = 1 / df['Odd_1']
df["Prob_2"] = 1 / df['Odd_2']
df["Prob_diff"] = df['Prob_1'] - df['Prob_2']

In [143]:
# Variável alvo (1 se Player_1 venceu, 0 caso contrário)
df['Target'] = (df['Winner'] == df['Player_1']).astype(int)

### Criando Win rate e H2H ponderados

In [144]:
def weighted_win_rate(player, current_date, df, years=3):
    #Filtrando jogos anteriores dentro da janela de 3 anos
    mask = (df["Date"] < current_date) & (
        (df["Player_1"] == player) | (df['Player_2'] == player)
    )
    past_matches = df.loc[mask]
    if past_matches.empty:
        return np.nan
    
    #calcular se o jogador venceu
    past_matches["Win"] = np.where(past_matches['Winner'] == player, 1, 0)

    # Calcular peso pelo tempo (mais recente = peso maior)
    past_matches["Days_diff"] = (current_date - past_matches['Date']).dt.days
    past_matches["weight"] = np.exp(-past_matches["Days_diff"] / 365)  # decai a cada ano


    # Win rate ponderado
    win_rate = np.average(past_matches["Win"], weights=past_matches["weight"])
    return win_rate

In [145]:
def weighted_h2h(player1, player2, current_date, df, years=3):
    # Filtrar confrontos diretos dentro da janela de 3 anos
    mask = (df['Date'] < current_date) & (
        ((df['Player_1'] == player1) & (df['Player_2'] == player2)) |
        ((df['Player_1'] == player2) & (df['Player_2'] == player1))
    )
    h2h_matches = df.loc[mask]

    if h2h_matches.empty:
        return np.nan

    #calcular se o player1 venceu
    h2h_matches["Win"] = np.where(h2h_matches['Winner'] == player1, 1, 0)
    h2h_matches["Days_diff"] = (current_date - h2h_matches["Date"]).dt.days
    h2h_matches["weight"] = np.exp(-h2h_matches["Days_diff"] / 365)

    h2h_score = np.average(h2h_matches["Win"], weights=h2h_matches["weight"])
    return h2h_score

In [146]:
#aplicando funcoes no dataset
df["Win_Rate_1"] = df.apply(lambda row: weighted_win_rate(row["Player_1"], row["Date"], df), axis=1)
df["Win_Rate_2"] = df.apply(lambda row: weighted_win_rate(row["Player_2"], row["Date"], df), axis=1)


In [147]:
df["H2H_1"] = df.apply(lambda row: weighted_h2h(row["Player_1"], row["Player_2"], row["Date"], df), axis=1)

#### Selecionar e normalizar as features

In [148]:
# Seleção das features mais relevantes
features = [
    'Rank_diff',
    'Pts_diff',
    'Prob_diff',
    'Win_Rate_1',
    'Win_Rate_2',
    'H2H_1'
]

In [149]:
# Removendo possíveis valores ausentes
df_modelo = df.dropna(subset=features + ['Target'])

In [150]:
# Separando X (entradas) e y (saída)
X = df_modelo[features]
y = df_modelo['Target']

In [151]:
# Dividindo entre treino e teste (80% treino, 20% teste)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [152]:
# Normalizando os dados
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Treinando o modelo de Machine Learning

In [153]:
from sklearn.linear_model import LogisticRegression

# Criando e treinando o modelo
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_scaled, y_train)

In [160]:
from xgboost import XGBClassifier

# Criando o modelo
xgb_model = XGBClassifier(
    n_estimators=300,       # número de árvores
    learning_rate=0.05,     # taxa de aprendizado (menor = mais preciso, mas mais lento)
    max_depth=5,            # profundidade das árvores
    subsample=0.8,          # porcentagem de amostras usadas por árvore
    colsample_bytree=0.8,   # porcentagem de features usadas por árvore
    random_state=42,
    eval_metric="logloss"   # evita warnings
)

# Treinando o modelo
xgb_model.fit(X_train_scaled, y_train)


AttributeError: 'super' object has no attribute '__sklearn_tags__'

AttributeError: 'super' object has no attribute '__sklearn_tags__'

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.05, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=5,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=300,
              n_jobs=None, num_parallel_tree=None, random_state=42, ...)

In [159]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Fazendo previsões no conjunto de teste
y_pred = model.predict(X_test_scaled)

# Avaliando a performance
print("Acurácia:", accuracy_score(y_test, y_pred))
print("\nMatriz de Confusão:\n", confusion_matrix(y_test, y_pred))
print("\nRelatório de Classificação:\n", classification_report(y_test, y_pred))

Acurácia: 0.7566371681415929

Matriz de Confusão:
 [[94 32]
 [23 77]]

Relatório de Classificação:
               precision    recall  f1-score   support

           0       0.80      0.75      0.77       126
           1       0.71      0.77      0.74       100

    accuracy                           0.76       226
   macro avg       0.75      0.76      0.76       226
weighted avg       0.76      0.76      0.76       226



#### Filtrando os jogos a partir das quartas de final

In [155]:
quartas_df = df[
    (df['Tournament'].str.contains('BNP Paribas Masters', case=False)) &
    (df['Round'].str.contains('Quarterfinals', case=False)) &
    (df['Date'].dt.year == 2025)
].copy()

In [156]:
# Seleciona apenas as features usadas no treino
X_quartas = quartas_df[[
    'Rank_diff', 'Pts_diff', 'Prob_diff', 'Win_Rate_1', 'Win_Rate_2', 'H2H_1'
]]

# Remove valores faltantes, se houver
X_quartas = X_quartas.fillna(0)

# Aplica a normalização
X_quartas_scaled = scaler.transform(X_quartas)

### Gerar previsões e probabilidades

In [157]:
# Probabilidade do Player_1 vencer
quartas_df["Prob_Player1_Win"] = model.predict_proba(X_quartas_scaled)[:, 1]

# Vencedor previsto
quartas_df["Pred_Winner"] = quartas_df.apply(
    lambda row: row["Player_1"] if row["Prob_Player1_Win"] > 0.5 else row["Player_2"],
    axis=1
)

# Mostrar resultados
for _, row in quartas_df.iterrows():
    prob = row["Prob_Player1_Win"] if row["Pred_Winner"] == row["Player_1"] else 1 - row["Prob_Player1_Win"]
    print(f"{row['Player_1']} vs {row['Player_2']} → {row['Pred_Winner']} ({prob * 100:.2f}%)")


Auger-Aliassime F. vs Vacherot V. → Auger-Aliassime F. (64.95%)
De Minaur A. vs Bublik A. → De Minaur A. (56.71%)
Sinner J. vs Shelton B. → Sinner J. (86.19%)
Medvedev D. vs Zverev A. → Medvedev D. (56.40%)
