In [9]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn as sk
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler


## Random Forest

Collecte et traitement des données

In [54]:
# Load the feature and target tables; the first CSV column becomes the index.
X = pd.read_csv(r"X_train.csv", index_col=0)
Y = pd.read_csv(r"Y_train.csv", index_col=0)

# --- Missing data ---
# Join features and target on their index so that incomplete rows are removed
# from both at the same time.
data = X.merge(Y, left_index=True, right_index=True)
# Keep columns whose non-null count reaches 25% of the rows, then drop every
# row that still contains a missing value.
data = data.dropna(axis=1, thresh=len(data) * 0.25).dropna()
# Alternative (disabled): data = data.ffill()

# Split the cleaned table back into features (only the columns that survived
# the filter above, in their original order) and target columns.
X = data.loc[:, [name for name in X.columns if name in data.columns]]
Y = data.loc[:, Y.columns]

# --- Format ---
# Target as a 1-D NumPy array; non-numeric features are one-hot encoded.
Y = Y["spot_id_delta"].to_numpy()
X = pd.get_dummies(X)

# --- Split ---
# 80/20 shuffled hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

Creation et Validation du modèle

In [55]:
# Scalers to compare. RobustScaler is last, so after the loop `model`,
# `X_train_scaled` and `X_test_scaled` hold the RobustScaler variant.
scalers = [StandardScaler(), MinMaxScaler(), RobustScaler()]

for scaler in scalers:
    # Fit the scaler on the training split only, then apply it to the hold-out split.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Baseline forest with default hyper-parameters and a fixed seed.
    model = RandomForestRegressor(random_state=42)
    model.fit(X_train_scaled, y_train)

    # Hold-out evaluation.
    y_pred = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Scaler: {scaler.__class__.__name__}")
    print(f"Mean Squared Error: {mse}")
    print(f"R^2 Score: {r2}")

    # 10-fold cross-validation on the training split.
    # The scaler is wrapped in a pipeline and given the *unscaled* features so
    # that it is re-fitted on the training part of every fold; cross-validating
    # the pre-scaled matrix would let each validation fold influence the
    # scaling statistics. cross_val_score works on clones, so `scaler` and
    # `model` themselves are left as fitted above.
    cv_scores = sk.model_selection.cross_val_score(
        make_pipeline(scaler, model),
        X_train,
        y_train,
        cv=10,
        scoring='neg_mean_squared_error',
    )
    print(f"Cross-Validation Scores (neg MSE): {cv_scores}")
    print(f"Mean Cross-Validation Score (neg MSE): {cv_scores.mean()}\n")


Scaler: StandardScaler
Mean Squared Error: 322.92423921313986
R^2 Score: 0.6138062854872874
Cross-Validation Scores (neg MSE): [-2419.42128164  -291.31319477  -269.9040082   -354.7541268
  -436.65236134  -311.2060134   -433.18953457  -336.47817708
  -614.77029092 -3427.72849678]
Mean Cross-Validation Score (neg MSE): -889.5417485505026

Scaler: MinMaxScaler
Mean Squared Error: 322.83139408146707
R^2 Score: 0.6139173214577135
Cross-Validation Scores (neg MSE): [-2419.562608    -291.62883076  -270.04750281  -354.67475746
  -436.62250244  -311.28960137  -433.25686998  -336.3569342
  -614.73087811 -3427.96693788]
Mean Cross-Validation Score (neg MSE): -889.6137423017647

Scaler: RobustScaler
Mean Squared Error: 322.9058059042255
R^2 Score: 0.6138283303732883
Cross-Validation Scores (neg MSE): [-2419.50063777  -291.21519406  -270.0424435   -354.82418352
  -436.6973462   -311.24457436  -433.43981072  -336.54493901
  -614.89666856 -3427.4754746 ]
Mean Cross-Validation Score (neg MSE): -889.58

Optimisation des paramètres

In [12]:
# Hyper-parameter grid: 3 * 4 * 3 * 3 = 108 candidate combinations.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Build the tuning inputs explicitly instead of relying on variables left over
# from the scaler-comparison loop. RobustScaler is the scaler that loop ends
# on, so the matrices are the same ones the loop leaves behind, but this cell
# no longer depends on the loop's iteration order.
tuning_scaler = RobustScaler()
X_train_scaled = tuning_scaler.fit_transform(X_train)
X_test_scaled = tuning_scaler.transform(X_test)

# A fresh, unfitted estimator: GridSearchCV clones it for every candidate, so
# only its constructor parameters matter.
grid_search = sk.model_selection.GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train_scaled, y_train)

# Best candidate, refitted on the whole (scaled) training split.
best_model = grid_search.best_estimator_

Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [56]:
# Hold-out evaluation of the tuned forest. X_test_scaled must come from the
# same scaler as the matrix best_model was fitted on.
y_pred_best = best_model.predict(X_test_scaled)
best_mse = mean_squared_error(y_test, y_pred_best)
best_r2 = r2_score(y_test, y_pred_best)

print(f"Best Mean Squared Error: {best_mse}")
print(f"Best R^2 Score: {best_r2}")

# 5-fold cross-validation of the tuned configuration on the full dataset.
# The MinMaxScaler lives inside a pipeline so that it is fitted on the training
# part of each fold only, and `X` stays the original feature DataFrame instead
# of being overwritten by a scaled array (which made this cell change global
# state on every run). cross_val_score clones the pipeline, so `best_model`
# itself is not refitted.
cv_pipeline = make_pipeline(MinMaxScaler(), best_model)
best_cv_scores = sk.model_selection.cross_val_score(
    cv_pipeline,
    X,
    Y,
    cv=5,
    scoring='neg_mean_squared_error',
)
print(f"Best Cross-Validation Scores (neg MSE): {best_cv_scores}")
print(f"Mean Best Cross-Validation Score (neg MSE): {best_cv_scores.mean()}")

Best Mean Squared Error: 333.8829481740046
Best R^2 Score: 0.6007004730212658
Best Cross-Validation Scores (neg MSE): [-1079.36464117 -2868.8780258  -2224.27875941  -919.32166201
  -393.41790543]
Mean Best Cross-Validation Score (neg MSE): -1497.0521987641603


Test history: Random Forest
* Test 1 : with scaled data (StandardScaler) and a random train/test split <br>
    * MSE: 431.5021103227531<br>
    * R^2 : 0.5730513071769024<br>
    * Cross-Validation Scores: [-1080.1947177  -2755.16386623 -3312.69752171  -891.69760984  -406.95658577]<br>
    * Mean Cross-Validation Score: -1689.3420602516937<br>

    * GridSearchCV<br>
        * Best MSE: 443.8337272392575<br>
        * Best R^2: 0.5608498194044355<br>
        * Best Cross-Validation Scores (neg MSE): [-1019.10192921 -2746.71964383 -3328.72865908  -863.18619476  -391.44882472]<br>
        * Mean Best Cross-Validation Score (neg MSE): -1669.83705032049<br>

    * RandomizedSearchCV<br>
        * Best MSE : 475.3314485861939<br>
        * Best R^2 : 0.5296844771401255<br>
        * Best Cross-Validation Scores (neg MSE): [ -815.55988677  -558.34524056 -4556.09133445  -717.02674416  -4689.81266826 -3422.67064047  -702.36233955 -1033.57637428  -283.52683123  -442.0453057 ]
        * Mean Best Cross-Validation Score (neg MSE): -1722.1017365436826

* Test 2 : without scaling and with random split<br>
        Results are pretty similar, so we are going to try a different scaler.

* Test 3 : with MinMaxScaler and random split <br>
        Results are pretty similar, so we are going to try a different scaler.

* Test 4 : with RobustScaler and random split <br>
        Results are pretty similar, so we are going to try a different split.
    
* Test 4b : with RobustScaler and without random split <br>
        Way worse

* Test 5 : dropping the columns that are at least 60% null gives better results <br>
    * Mean Squared Error: 322.83139408146707 <br>
    * *R^2 Score: 0.6139173214577135* <br>
    * Cross-Validation Scores (neg MSE): [-2419.562608    -291.62883076  -270.04750281  -354.67475746  -436.62250244  -311.28960137  -433.25686998  -336.3569342  -614.73087811 -3427.96693788] <br>
    * Mean Cross-Validation Score (neg MSE): -889.6137423017647<br>

* Test 6 : PCA 
        Worse
        

Le modèle n'est visiblement pas vraiment adapté
