In [16]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error, r2_score



## Random Forest

Collecte et traitement des données

In [17]:
# Collecte des données
X = pd.read_csv(r"X_train.csv",index_col=0)
Y = pd.read_csv(r"Y_train.csv",index_col=0)

## Missing Data
data =  pd.merge(X, Y, left_index=True, right_index=True)
data = data.dropna(axis=1, thresh=len(data)*0.25)
data = data.dropna()
#data = data.ffill()
X = data[[col for col in X.columns if col in data.columns]]
Y = data[Y.columns]
## Format
Y=Y.spot_id_delta.to_numpy()
X = pd.get_dummies(X)
## Split
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, Y, test_size=0.2, random_state=42)

Creation et Validation du modèle

In [18]:
scalers = [StandardScaler(), MinMaxScaler(), RobustScaler()]

for scaler in scalers:
    # Appliquer le scaler
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Initialiser le modèle
    model = GradientBoostingRegressor(random_state=42)
    
    # Entraîner le modèle de base
    model.fit(X_train_scaled, y_train)
    
    # Évaluation du modèle
    y_pred = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"Scaler: {scaler.__class__.__name__}")
    print(f"Mean Squared Error: {mse}")
    print(f"R^2 Score: {r2}")
    
    # Validation croisée
    cv_scores = sk.model_selection.cross_val_score(model, X_train_scaled, y_train, cv=10, scoring='neg_mean_squared_error')
    print(f"Cross-Validation Scores (neg MSE): {cv_scores}")
    print(f"Mean Cross-Validation Score (neg MSE): {cv_scores.mean()}\n")


Scaler: StandardScaler
Mean Squared Error: 548.270461372465
R^2 Score: 0.34430872531907697
Cross-Validation Scores (neg MSE): [-3091.38062289  -467.14810509  -454.0677629   -597.10026905
  -693.74061228  -493.65953369  -632.09562987  -507.84349434
  -741.57853089 -3866.29700959]
Mean Cross-Validation Score (neg MSE): -1154.49115705863

Scaler: MinMaxScaler
Mean Squared Error: 548.270461372465
R^2 Score: 0.34430872531907697
Cross-Validation Scores (neg MSE): [-3091.38062289  -467.04147556  -454.0677629   -597.10026905
  -693.74061228  -493.65953369  -632.09562987  -507.86889644
  -741.57853089 -3866.29700959]
Mean Cross-Validation Score (neg MSE): -1154.4830343154595

Scaler: RobustScaler
Mean Squared Error: 548.270461372465
R^2 Score: 0.34430872531907697
Cross-Validation Scores (neg MSE): [-3091.38062289  -467.14810509  -454.0677629   -597.10026905
  -693.74061228  -493.65953369  -632.09562987  -507.84349434
  -741.57853089 -3866.29700959]
Mean Cross-Validation Score (neg MSE): -1154.4

Optimisation des paramètres

In [19]:
param_distributions = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'learning_rate': [0.3],
    'subsample': np.random.uniform(0.5, 1 ,2)
}

# Utiliser RandomizedSearchCV pour une recherche plus large
random_search = sk.model_selection.GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_grid=param_distributions,
    #n_iter=20,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='neg_mean_squared_error',
    #random_state=42
)

random_search.fit(X_train_scaled, y_train)

# Obtenir le meilleur modèle
best_model = random_search.best_estimator_

Fitting 3 folds for each of 162 candidates, totalling 486 fits


In [20]:
y_pred_best = best_model.predict(X_test_scaled)
best_mse = mean_squared_error(y_test, y_pred_best)
best_r2 = r2_score(y_test, y_pred_best)

print(f"Best Mean Squared Error: {best_mse}")
print(f"Best R^2 Score: {best_r2}")

best_cv_scores = sk.model_selection.cross_val_score(best_model, X, Y, cv=5, scoring='neg_mean_squared_error')
print(f"Best Cross-Validation Scores (neg MSE): {best_cv_scores}")
print(f"Mean Best Cross-Validation Score (neg MSE): {best_cv_scores.mean()}")

Best Mean Squared Error: 405.44015152855957
Best R^2 Score: 0.5151233041132446
Best Cross-Validation Scores (neg MSE): [-10046.02595866  -3082.96237759  -2386.27707803  -1315.54942701
   -628.39703919]
Mean Best Cross-Validation Score (neg MSE): -3491.8423760968594


Testing Historic : GBM
* Test 1 : <br>
    * MSE: 502.90465800343037<br>
    * R^2 :  0.5024022334708622<br>
    * Cross-Validation Scores:  [-1581.82536208 -2903.98384436 -3354.44179264 -2584.99329781  -625.70739203]
    * Mean Cross-Validation Score: -2210.190337783685<br>
  
* Test 2 : with data deletion <br>
    Best Mean Squared Error: 405.44015152855957
    Best R^2 Score: 0.5151233041132446
    Best Cross-Validation Scores (neg MSE): [-10046.02595866  -3082.96237759  -2386.27707803  -1315.54942701
   -628.39703919]
    Mean Best Cross-Validation Score (neg MSE): -3491.8423760968594


Le modèle est visiblement moins adapté