In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Chargement des données
data = pd.read_csv('./Data/Data_utiles/Data_ML/infos_joueurs_2016.csv')+pd.read_csv('./Data/Data_utiles/Data_ML/infos_joueurs_2017.csv')

columns = [
    "name", "hand", "height", "rang", "matchs", "win", 
    "pourc_return_win_pnt", "pourc_break_games", "pourc_break_point_made", 
    "pourc_break_point_saved", "pourc_serv_games_win", "pourc_serv_in", 
    "mean_ranking_oppo", "pourc_serv_win_pnt", "Return Rating", 
    " % Serve Return Points Won", " % 2nd Serve Return Points Won", 
    " % Return Games Won", " % Break Points Converted", "Under Pressure Rating", 
    " % Break Point Saved", " % Break Points Converted Pressure", 
    " % Deciding Sets Won", " % Tie Breaks Won"
]
 # Complétez avec les autres noms de colonnes
df = pd.DataFrame(data, columns=columns)
print(df.head())

# Mise à jour des caractéristiques en excluant les colonnes spécifiées


# Séparation des caractéristiques et des étiquettes
features = df.drop(['name', 'rang','Under Pressure Rating', ' % Break Point Saved', 
    ' % Break Points Converted Pressure', ' % Deciding Sets Won', 
    ' % Tie Breaks Won'], axis=1)
labels = df['rang']
  # Affiche les premières lignes pour vérifier les données
features = features.dropna()

# Prétraitement : Encodage one-hot pour les variables catégorielles et normalisation pour les numériques
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), ['hand']),
        ('num', StandardScaler(), features.columns.drop('hand'))
    ])

# Pipeline : Prétraitement + modèle
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

# Séparation des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Entraînement du modèle
model.fit(X_train, y_train)

# Évaluation du modèle
y_pred = model.predict(X_test)
print("RMSE:", mean_squared_error(y_test, y_pred, squared=False))


                                name hand  height   rang  matchs   win  \
0         roger federerroger federer   RR   370.0   18.0    85.0  73.0   
1     mikhail youzhnymikhail youzhny   RR   366.0  141.0    64.0  26.0   
2       tommy robredofeliciano lopez   RL   368.0   79.0    56.0  27.0   
3  feliciano lopezpaul henri mathieu   LR   373.0  101.0    59.0  31.0   
4     paul henri mathieudavid ferrer   RR   360.0  110.0    82.0  42.0   

   pourc_return_win_pnt  pourc_break_games  pourc_break_point_made  \
0              0.802120           0.562124                0.818076   
1              0.720210           0.433087                0.818317   
2              0.630161           0.279107                0.629541   
3              0.654644           0.259124                0.487389   
4              0.762400           0.498125                0.818968   

   pourc_break_point_saved  ...  Return Rating   % Serve Return Points Won  \
0                 1.010020  ...            NaN          

ValueError: Found input variables with inconsistent numbers of samples: [54, 128]

In [8]:
import pandas as pd

# Exemple de données
data = pd.read_csv('./Data/Data_utiles/info_rank.csv')['ranking_date']
df = pd.DataFrame(data)

# Convertir les dates en format datetime



df['ranking_date'] = pd.to_datetime(df['ranking_date'], format='%Y%m%d')

# Trier les dates
df = df.sort_values(by='ranking_date')

# Extraire la dernière date de chaque année
last_dates = df.groupby(df['ranking_date'].dt.year)['ranking_date'].last()

# Convertir les dates dans le format d'origine (AAAAMMJJ)
formatted_dates = [date.strftime('%Y%m%d') for date in last_dates]

print(formatted_dates)

['19901231', '19911230', '19921228', '19931227', '19941226', '19951225', '19961230', '19971229', '19981228', '19991227', '20001225', '20011231', '20021230', '20031229', '20041227', '20051226', '20061225', '20071231', '20081229', '20091228', '20101227', '20111226', '20121231', '20131230', '20141229', '20151228', '20161226', '20171225', '20181231', '20191230', '20201228', '20211227', '20221226']
