# Tennis Match Prediction - Advanced Feature Engineering
## 1. Introduction
Pour améliorer notre modèle, nous allons introduire des variables dynamiques qui reflètent l'état de forme des joueurs avant le match.

**Nouvelles variables :**
- **Win Rate (Forme) :** Pourcentage de victoires sur les 10 derniers matchs.
- **Surface Win Rate :** Taux de victoire historique du joueur sur la surface du match (indicateur de spécialisation).
- **Elo Score (Simple) :** Un score de puissance relative.

In [None]:
import pandas as pd
import numpy as np
import os

# Load the cleaned match table, parse the tournament date column into
# datetimes, and order the rows by that date.
df = (
    pd.read_csv("../data/processed/atp_matches_clean.csv")
    .assign(tourney_date=lambda frame: pd.to_datetime(frame['tourney_date']))
    .sort_values(by='tourney_date')
)

## 2. Calcul de la forme et spécialisation
Nous créons une fonction pour calculer les statistiques historiques d'un joueur avant chaque match.

In [None]:
def get_player_stats(df, window=10):
    """Build a per-player match history with pre-match win-rate features.

    Every match in ``df`` contributes two rows: one for the winner
    (``won`` = 1) and one for the loser (``won`` = 0). Each row keeps the
    index label of the match it comes from, so every label of ``df`` appears
    twice in the result.

    Args:
        df: Match table containing the columns ``tourney_date``,
            ``winner_id``, ``loser_id`` and ``surface``.
        window: Number of most recent previous matches averaged into
            ``rolling_win_rate``. Defaults to 10.

    Returns:
        A DataFrame sorted by ``player_id`` then ``tourney_date`` with the
        columns ``tourney_date``, ``player_id``, ``surface``, ``won``,
        ``rolling_win_rate`` and ``surface_win_rate``. Both rate columns
        exclude the row's own result and are NaN when the player has no
        earlier row (overall, respectively on that surface).

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError(f"window must be >= 1, got {window!r}")

    # One row per (match, player): the winner's view and the loser's view.
    winners = df[['tourney_date', 'winner_id', 'surface']].copy().rename(columns={'winner_id': 'player_id'})
    winners['won'] = 1

    losers = df[['tourney_date', 'loser_id', 'surface']].copy().rename(columns={'loser_id': 'player_id'})
    losers['won'] = 0

    # NOTE(review): rows sharing the same tourney_date are not ordered by
    # round here; their relative order is whatever the sort leaves them in.
    # Confirm this is acceptable for matches played within one tournament.
    player_history = pd.concat([winners, losers]).sort_values(['player_id', 'tourney_date'])

    # Form: mean of `won` over the last `window` rows, then shifted by one so
    # the current match's own outcome never contributes to its feature.
    player_history['rolling_win_rate'] = player_history.groupby('player_id')['won'].transform(
        lambda x: x.rolling(window=window, min_periods=1).mean().shift(1)
    )

    # Surface specialisation: cumulative win rate per (player, surface),
    # shifted by one for the same reason.
    player_history['surface_win_rate'] = player_history.groupby(['player_id', 'surface'])['won'].transform(
        lambda x: x.expanding().mean().shift(1)
    )

    return player_history

# Re-label the matches 0..n-1 so every row has a unique label.
# get_player_stats keeps these labels on the rows it returns, which lets each
# stat be mapped back to the exact match it was computed for.
df = df.reset_index(drop=True)
history = get_player_stats(df)

# Attach the pre-match stats by match label instead of merging on
# (tourney_date, player, surface). That key is not unique in `history` when a
# player has several rows on the same date and surface, so the merge
# multiplied rows (also pairing a winner with that player's loser rows) and
# the de-duplication then kept the first candidate's stats rather than the
# match's own. It also leaked the helper column `won` (as `won_x`/`won_y`)
# into the match table; that no longer happens.
stat_columns = ['rolling_win_rate', 'surface_win_rate']
winner_stats = history.loc[history['won'] == 1, stat_columns]
loser_stats = history.loc[history['won'] == 0, stat_columns]

# Series assignment aligns on the row label, i.e. on the match itself.
df['w_form'] = winner_stats['rolling_win_rate']
df['w_surf_rate'] = winner_stats['surface_win_rate']
df['l_form'] = loser_stats['rolling_win_rate']
df['l_surf_rate'] = loser_stats['surface_win_rate']

# Same de-duplication key as before, kept so that repeated input rows are
# still collapsed to one.
df = df.drop_duplicates(subset=['tourney_id', 'match_num', 'winner_id', 'loser_id'])

## 3. Création des variables relatives finales

In [None]:
def finalize_features(df, seed=42):
    """Turn winner/loser columns into symmetric "player A vs player B" features.

    Roughly half of the rows are randomly assigned ``target`` = 1 (player A is
    the match winner); the others get ``target`` = 0 (player A is the loser).
    This keeps the label from being constant.

    Sign conventions, as implemented:
        * ``age_diff``, ``form_diff`` and ``surf_rate_diff`` are A minus B.
        * ``rank_diff`` is B minus A (``loser_rank - winner_rank`` when
          ``target`` is 1).

    Args:
        df: Match table containing ``winner_rank``, ``loser_rank``,
            ``winner_age``, ``loser_age``, ``w_form``, ``l_form``,
            ``w_surf_rate`` and ``l_surf_rate``.
        seed: Seed of the random winner/loser assignment. Defaults to 42,
            which reproduces the assignment formerly obtained through
            ``np.random.seed(42)``.

    Returns:
        A new DataFrame with one row per row of ``df`` (fresh 0..n-1 index)
        and the columns ``rank_diff``, ``age_diff``, ``form_diff``,
        ``surf_rate_diff`` and ``target``. Missing values are not removed.
    """
    features = pd.DataFrame()

    # A private legacy generator yields the same draws as seeding the global
    # one, without resetting NumPy's global random state for the caller.
    rng = np.random.RandomState(seed)
    mask = rng.rand(len(df)) > 0.5

    # Classic differences
    features['rank_diff'] = np.where(mask, df['loser_rank'] - df['winner_rank'], df['winner_rank'] - df['loser_rank'])
    features['age_diff'] = np.where(mask, df['winner_age'] - df['loser_age'], df['loser_age'] - df['winner_age'])

    # Performance-based differences
    features['form_diff'] = np.where(mask, df['w_form'] - df['l_form'], df['l_form'] - df['w_form'])
    features['surf_rate_diff'] = np.where(mask, df['w_surf_rate'] - df['l_surf_rate'], df['l_surf_rate'] - df['w_surf_rate'])

    features['target'] = mask.astype(int)
    return features

# Build the model-ready table, discard rows with any missing feature, save it
# as CSV without the index column, and report the resulting shape.
df_ml = finalize_features(df)
df_ml = df_ml.dropna()
df_ml.to_csv(
    "../data/processed/atp_matches_features_advanced.csv",
    index=False,
)
print(f"Dataset enrichi : {df_ml.shape}")