In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Chargement des données
data = pd.read_csv('./Data/Data_utiles/Data_ML/infos_joueurs_2016.csv')+pd.read_csv('./Data/Data_utiles/Data_ML/infos_joueurs_2017.csv')

columns = [
    "name", "hand", "height", "rang", "matchs", "win", 
    "pourc_return_win_pnt", "pourc_break_games", "pourc_break_point_made", 
    "pourc_break_point_saved", "pourc_serv_games_win", "pourc_serv_in", 
    "mean_ranking_oppo", "pourc_serv_win_pnt", "Return Rating", 
    " % Serve Return Points Won", " % 2nd Serve Return Points Won", 
    " % Return Games Won", " % Break Points Converted", "Under Pressure Rating", 
    " % Break Point Saved", " % Break Points Converted Pressure", 
    " % Deciding Sets Won", " % Tie Breaks Won"
]
 # Complétez avec les autres noms de colonnes
df = pd.DataFrame(data, columns=columns)
print(df.head())
df = df.dropna()
# Mise à jour des caractéristiques en excluant les colonnes spécifiées


# Séparation des caractéristiques et des étiquettes
features = df.drop(['name', 'rang','Under Pressure Rating', ' % Break Point Saved', 
    ' % Break Points Converted Pressure', ' % Deciding Sets Won', 
    ' % Tie Breaks Won'], axis=1)
labels = df['rang']
print(df.head())  # Affiche les premières lignes pour vérifier les données

# Prétraitement : Encodage one-hot pour les variables catégorielles et normalisation pour les numériques
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), ['hand']),
        ('num', StandardScaler(), features.columns.drop('hand'))
    ])

# Pipeline : Prétraitement + modèle
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

# Séparation des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Entraînement du modèle
model.fit(X_train, y_train)

# Évaluation du modèle
y_pred = model.predict(X_test)
print("RMSE:", mean_squared_error(y_test, y_pred, squared=False))


                             name hand  height  rang  matchs   win  \
0        lleyton hewitttommy haas   RR   368.0   NaN    17.0   5.0   
1      roger federerroger federer   RR   370.0   NaN    85.0  73.0   
2  mikhail youzhnymikhail youzhny   RR   366.0   NaN    64.0  26.0   
3      tommy robredotommy robredo   RR   360.0   NaN    23.0   7.0   
4  albert montanesalbert montanes   RR   350.0   NaN    10.0   2.0   

   pourc_return_win_pnt  pourc_break_games  pourc_break_point_made  \
0              0.669596           0.199164                0.394625   
1              0.802120           0.562124                0.818076   
2              0.720210           0.433087                0.818317   
3              0.658001           0.312636                0.670876   
4              0.536491           0.181636                0.440476   

   pourc_break_point_saved  ...  Return Rating   % Serve Return Points Won  \
0                 1.253887  ...            NaN                         NaN   
1 

ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.