# Prédicition de la variable de la consommation d'énergie des bâtiments de SEATTLE

In [1]:
from pycaret.regression import *

import numpy as np
import pandas as pd
from pathlib import Path
import timeit

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from IPython.display import display

import FETCH_LOAD_DATAS
from TUNING_SUPERVISED_MODELS import *

##### Paramètres généraux d'affichage et fonction de sauvegarde des visualisations

In [2]:
SEED = 49

In [3]:
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)

plt.rcParams['figure.figsize'] = [10, 8]

In [4]:
IMAGES_PATH = Path() / "img" / "prediction_conso"
IMAGES_PATH.mkdir(parents=True, exist_ok=True)
ID_FIG = 0

def save_fig(fig_title, resolution=300):
    fig_id = str(globals()['ID_FIG'])
    path = IMAGES_PATH / f"prediction_conso - {fig_id} - {fig_title}.png"
    globals()['ID_FIG']+=1
    plt.savefig(path, format='png', dpi=resolution)

##### Chargements des données et initialisation des variables utiles

In [5]:
data = pd.read_csv("datasets/df_EDA.csv")
df = data.copy()

Nous allons dans un premier temps chercher à optimiser un modèle nous permettant d'obtenir les meilleures performances possibles pour la prédiction de notre valeur cible de consomation d'énergie, sans la variable **ENRGYSTARScore**, puis nous verrons en la rajoutant à notre jeu de données si elle permet d'atteindre de meilleurs résulats.

In [6]:
df.head()

Unnamed: 0,LargestPropertyUseType,Neighborhood,PrimaryPropertyType,SecondLargestPropertyUseType,ThirdLargestPropertyUseType,LargestPropertyUseTypeGFA,PropertyGFATotal,SecondLargestPropertyUseTypeGFA,ThirdLargestPropertyUseTypeGFA,ENERGYSTARScore,Latitude,Longitude,NumberofBuildings,NumberofFloors,PropertyGFABuilding(s),PropertyGFAParking,YearBuilt,SiteEnergyUse(kBtu),TotalGHGEmissions
0,Hotel,DOWNTOWN,Hotel,,,88434.0,88434,0.0,0.0,60.0,47.6122,-122.33799,1.0,12,88434,0,1927,7226362.5,249.98
1,Hotel,DOWNTOWN,Hotel,Parking,Restaurant,83880.0,103566,15064.0,4622.0,61.0,47.61317,-122.33393,1.0,11,88502,15064,1996,8387933.0,295.86
2,Hotel,DOWNTOWN,Hotel,,,756493.0,956110,0.0,0.0,43.0,47.61393,-122.3381,1.0,41,759392,196718,1969,72587024.0,2089.28
3,Hotel,DOWNTOWN,Hotel,,,61320.0,61320,0.0,0.0,56.0,47.61412,-122.33664,1.0,10,61320,0,1926,6794584.0,286.43
4,Hotel,DOWNTOWN,Hotel,Parking,Swimming Pool,123445.0,175580,68009.0,0.0,75.0,47.61375,-122.34047,1.0,18,113580,62000,1980,14172606.0,505.01


In [7]:
df.drop(columns='ENERGYSTARScore', inplace=True)
df.drop(columns='TotalGHGEmissions', inplace=True)

In [8]:
conso = 'SiteEnergyUse(kBtu)'

cat_features = ['LargestPropertyUseType', 'Neighborhood', 'PrimaryPropertyType',
                'SecondLargestPropertyUseType', 'ThirdLargestPropertyUseType']

num_features = ['LargestPropertyUseTypeGFA', 'Latitude', 'Longitude', 'NumberofBuildings', 
                     'NumberofFloors', 'PropertyGFABuilding(s)', 'PropertyGFAParking', 'PropertyGFATotal',
                     'SecondLargestPropertyUseTypeGFA', 'ThirdLargestPropertyUseTypeGFA', 'YearBuilt']

log_features = ['LargestPropertyUseTypeGFA', 'PropertyGFATotal', 'SecondLargestPropertyUseTypeGFA', 
                'ThirdLargestPropertyUseTypeGFA']

std_features = ['Latitude', 'Longitude', 'NumberofBuildings', 'NumberofFloors', 'PropertyGFABuilding(s)',
                'PropertyGFAParking', 'YearBuilt']

In [9]:
df = df[num_features+cat_features+[conso]]

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=conso),df[conso], test_size=0.2, random_state=SEED)

X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [11]:
cat_features_ind = [X_train.columns.get_loc(ft) for ft in cat_features]
numerical_features_ind = [X_train.columns.get_loc(ft) for ft in num_features]
log_features_ind = [X_train.columns.get_loc(ft) for ft in log_features]
std_features_ind = [X_train.columns.get_loc(ft) for ft in std_features]

In [12]:
k = X_train.shape[1]

# Première estimation de performance

### Métriques de performance

Pour notre problématiques, les métriques de performances auxquelles nous allons être attentifs sont la **RMSE** et la **RMSLE** plus que le score **R²**.

En effet ce dernier peut être interprété comme la proportion de variance dans le jeu de données que notre modèle entraîné est capable d'expliquer, et dépend donc beaucoup du jeu de données. Nous lui préférerons les mesures de performances plus neutres citées plus haut, ainsi que le **score R² ajusté** qui permet de quantifier la robustesse de notre modèle.

### Utilisation de la librairie PyCaret pour nos estimations de performance

Nous allons lancé une batterie de modèles standards avec leurs paramètres par défaut sur notre jeu de données brut afin d'avoir une **performance de base** à laquelle nous allons comparer la performance du modèle dont nous ajusterons les hyperparamètres, parallèlement à notre travail sur les variables (**feature engineering**).

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

Pour cela, nous utilisons la librairie **PyCaret** (https://github.com/pycaret/pycaret) qui est la version Python de la librairie **Caret** pour **R** (**C**lassification **A**nd **RE**gression **T**raining) pour gagner du temps dans l'étape fastidieuse d'instanciation et d'entraînement des modèles et de présentation des métriques de performance.

Voici les caractéristiques des entraînements de modèles réalisés par **PyCaret**:

   1. **OneHotEncoding** sur les variables catégorielles
   2. Les variables numériques sont normalisées (les SVM et KNN sont sensibles aux différences d'échelle)
   3. Aucune sélection de variable ni élimination de la multicollinéarité entre elles
   4. Les résultats sont une moyenne suite à une validation croisée simple à 10 passes (K-Fold)

In [13]:
df_train = X_train.copy()
df_train['TARGET'] = y_train

In [14]:
s = setup(df_train, target = 'TARGET', normalize=True, session_id=SEED, silent=True)

best_models = compare_models(sort='RMSE', exclude=['lar', 'llar'])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,3692303.6751,149621325169401.3,10367298.4522,0.4831,1.198,1.1681,0.301
gbr,Gradient Boosting Regressor,4051087.3876,144205840823202.2,10487567.5453,0.5805,1.2524,1.5994,0.095
xgboost,Extreme Gradient Boosting,4509886.45,199067247522611.2,12388643.95,0.4462,1.2481,1.2071,0.223
rf,Random Forest Regressor,4216498.1657,196164869145028.7,12522747.7166,0.3874,1.2153,1.2965,0.277
omp,Orthogonal Matching Pursuit,4573155.4266,196635088055412.25,12676576.1767,0.0885,1.3192,2.095,0.007
ridge,Ridge Regression,5281996.15,211075730086297.6,12867046.45,0.0534,1.549,2.1193,0.009
lightgbm,Light Gradient Boosting Machine,5130933.0208,230360453270760.16,13299335.7723,0.4931,1.3734,1.6178,0.046
lasso,Lasso Regression,5597505.5,221229802428825.6,13596609.6,-0.0465,1.5379,2.1958,0.025
knn,K Neighbors Regressor,4556172.825,257767993507840.0,13988691.4,0.4533,1.3157,1.4313,0.015
en,Elastic Net,5055171.575,258398032284876.8,14269313.5,0.3332,1.4013,2.2118,0.023


# Traitement des valeurs aberrantes

Nous choisissons d'utiliser le critère de l'**inter-quartile** pour traiter les valeurs aberrantes, car nous n'avons pas besoins de faire d'hypothèses sur la distribution des variables.

Nous allons pour cela créer une fonction qui fera partie du processus global de pré-traitement des données avant l'évaluation de modèles pouvant potentiellement répondre à notre problématique, et nous pourrons ainsi observer l'impact de la présence de cette étape de traitement sur les performances des modèles (d'autant plus que cette étape nous prive d'un nombre important d'observations, **plus de 55%**, voir ci-après).

In [15]:
def remove_outliers_IR(data):
    df = data.copy()
    for col in data.select_dtypes(include=np.number).columns:
        Q3 = df.describe()[col]['75%']
        Q1 = df.describe()[col]['25%']
        IR = Q3 - Q1
        df.drop(index=df[df[col]>Q3+1.5*IR].index, inplace=True)
        df.drop(index=df[df[col]<Q1-1.5*IR].index, inplace=True)
    return df

In [16]:
df_train.shape[0], remove_outliers_IR(df_train).shape[0], f"{(df_train.shape[0]-remove_outliers_IR(df_train).shape[0])/df_train.shape[0]}%"

(1249, 523, '0.5812650120096077%')

# Évaluation des différents pré-traitements de données

Maintenant que nous avons une estimation de base de la performance que l'on peut attendre d'un modèle pour répondre à notre problématique, nous allons évaluer quelles combinaisons de transformations lors du pré-traitement des données permettent aux modèles standards (*paramètres par défaut*) d'atteindre les meilleures performances.

A l'issue de cette série d'évalutations, nous choisirons la combinaison de transformation des données donnant les meilleurs résulats et nous entraînerons spécifiquement (ajustement des hyperparamètres) les modèles les plus prometteurs dans cette configuration.

Nous allons ainsi comparer sur chaque combinaison de transformation de pré-traitement les **10 modèles** ayant obtenu les meilleurs résultats lors de la première estimation de performance, cela devrait nous donnner une diversité suffisante de modèles.

Afin de pouvoir plus simplement comparer nos modèles, nous définissons un score **Val_score** comme suit :

                                    Val_score = MAE*RMSE*RMSLE²/|adj. R²|
        
Nous donnons ainsi plus d'importance au score **RMSLE** (élévation au carré) car il pénalise plus les sous-estimations que les sur-estimations, ce que nous voulons car nous préférons que notre modèle surestime légèrement les prévisions (qui risquent d'être prises en compte pour définir des budgets futurs) plutôt qu'il sous-estime les consommations d'énergie des bâtiments, nous donnons la même importance aux scores **MAE**, **RMSE** et **R² ajusté**.

Ainsi, nous choisirons d'entraîner spécifiquement les 2 modèles les plus prometteurs dans leur configuration de pré-traitement leur ayant permis d'obtenir les scores **Val_score** les plus faibles.

In [17]:
from sklearn.linear_model import OrthogonalMatchingPursuit, ElasticNet, Ridge, HuberRegressor, Lasso
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.base import clone

from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score

from sklearn.model_selection import KFold

omp = OrthogonalMatchingPursuit(normalize=False)
el_net = ElasticNet(random_state=SEED)
ridge_reg = Ridge(random_state=SEED)
huber = HuberRegressor(max_iter=200)
las = Lasso(random_state=SEED)
gbr = GradientBoostingRegressor(random_state=SEED)
rfr = RandomForestRegressor(random_state=SEED)
hgbr = HistGradientBoostingRegressor(random_state=SEED)
xgb = XGBRegressor(random_state=SEED)
knr = KNeighborsRegressor()
lgb = LGBMRegressor(random_state=SEED)

models_to_train = {
                   'OrthoMatchingPursuit':omp,
                   'ElasticNet':el_net,
                   'Ridge':ridge_reg,
                   'Huber':huber,
                   'Lasso':las,
                   'GradientBoosting':gbr,
                   'RandomForest': rfr,
                   'HistGradientBoosting':hgbr,
                   'XGBoost':xgb,
                   'KNeighbors':knr,
                   'LightGBM':lgb
                  }
metrics = {
           'MAE':mean_absolute_error,
           'MSE':mean_squared_error,
           'RMSE':mean_squared_error,
           'adj. R²':r2_score,
           'RMSLE':mean_squared_log_error
          }

kfold = KFold(n_splits=10, shuffle=True, random_state=SEED)

## Valeur cible brute

Nous définissons une fonction qui va nous permettre de réaliser une validation croisée (10 passes) sur chacun de nos modèles et de mesurer sur chacune des passes les métriques de performances qui nous intéressent, puis de présenter dans un tableau synthétique la moyenne de ces métriques pour chaque modèle.

In [18]:
def cross_val_models_raw(X_train, y_train):

    df = pd.DataFrame(columns=['MAE', 'MSE', 'RMSE', 'adj. R²', 'RMSLE', 'TT(sec)'],
                      index=[model for model in models_to_train.keys()]
                     )
    df.index.name = 'Model'
    
    for mdl_name, model in models_to_train.items():
        
        dict_results = {
                       'MAE':[],
                       'MSE':[], 
                       'RMSE':[],
                       'adj. R²':[],
                       'RMSLE':[],
                       'TT(sec)':[]
                      }
        
        for train_index, test_index in kfold.split(X_train):
            mdl = clone(model)

            start_fit_time = timeit.default_timer()
            mdl.fit(X_train[train_index], y_train[train_index])
            fit_duration =  np.round(timeit.default_timer() - start_fit_time, 2)
            dict_results['TT(sec)'].append(fit_duration)

            y_pred_std = mdl.predict(X_train[test_index])

            for mtr_name, metric in metrics.items():
                score = metric(y_true=np.abs(y_train)[test_index], 
                               y_pred=np.abs((y_pred_std)))
                if mtr_name in ('RMSE','RMSLE'):
                    score = np.sqrt(score)
                elif mtr_name == 'adj. R²':
                    score = 1-((len(X_train)-1)/(len(X_train)-k-1))*(1-score)
                dict_results[mtr_name].append(score)
        
        for mtr_name, scores in dict_results.items():            
            df.loc[mdl_name, mtr_name] = np.round(np.mean(dict_results[mtr_name]),2)
            
        if df.loc[mdl_name, 'adj. R²'] != 0:
            df.loc[mdl_name, 'Val_score'] = np.round(df.loc[mdl_name, 'MAE']*df.loc[mdl_name, 'RMSE']*df.loc[mdl_name, 'RMSLE']**2/np.abs(df.loc[mdl_name, 'adj. R²']),2)
        else:
            df.loc[mdl_name, 'Val_score'] = np.nan
        
    return df.sort_values(by='Val_score', ascending=True)

#### Pré-traitement des données n°1
   - Passage aux logarithmes de certaines variables numériques pour rapprocher leur distribution d'une distribution normale
   - Normalisation des autres variables numériques
   - Tableau disjonctif complet des variables catégorielles

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler, OrdinalEncoder

In [38]:
def log_tr(X):
    return np.log2(X+1)

def inv_log_tr(Y):
    return (np.exp2(Y)-1)

cat_ft_pipeline = Pipeline([('one_hot_cat_ft', OneHotEncoder(sparse=False))])

ft_prepro_log = ColumnTransformer([('log_num_ft', FunctionTransformer(log_tr), log_features_ind),
                                   ('std_num_ft', StandardScaler(), std_features_ind),
                                   ('one_hot_cat_ft', cat_ft_pipeline, cat_features_ind)
                                  ])

X_train_tr_log = ft_prepro_log.fit_transform(X_train)

ohenc_col_names_log = ft_prepro_log.named_transformers_['one_hot_cat_ft']['one_hot_cat_ft'].get_feature_names()
tr_col_names_log = list(num_features) + list(ohenc_col_names_log)

df_ft_log_ft_raw = cross_val_models_raw(X_train_tr_log, y_train)
df_ft_log_ft_raw

Unnamed: 0_level_0,MAE,MSE,RMSE,adj. R²,RMSLE,TT(sec),Val_score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
RandomForest,3504993.49,129387557445903.8,9886094.66,0.58,1.52,1.91,138029300000000.0
GradientBoosting,3657419.18,117345577536917.16,9794846.42,0.59,1.57,0.6,149664800000000.0
XGBoost,3752121.55,128375200728077.05,10258844.07,0.57,1.54,0.26,160155500000000.0
KNeighbors,3945954.71,188466453856828.8,12077210.89,0.5,1.54,0.04,226042500000000.0
HistGradientBoosting,4376547.68,170015552049591.2,11749372.48,0.49,1.6,6.31,268652100000000.0
LightGBM,4399146.79,174419634165351.06,11926423.51,0.46,1.61,0.14,295646400000000.0
Ridge,4277225.96,139506157401227.55,10819263.66,0.39,1.76,0.01,367553600000000.0
OrthoMatchingPursuit,4353843.21,148299460164733.2,11163883.27,0.4,1.81,0.01,398093600000000.0
Huber,4172158.89,257829079489291.1,14147464.31,0.35,1.63,0.22,448070800000000.0
ElasticNet,4836526.0,210814115608489.4,13010476.86,0.37,1.81,0.01,557162900000000.0


#### Pré-traitement des données n°2
   - Suppression des lignes comportant des valeurs aberrantes au sens du critère de l'inter-quartile
   - Passage aux logarithmes de certaines variables numériques pour rapprocher leur distribution d'une distribution normale
   - Normalisation des autres variables numériques
   - Tableau disjonctif complet des variables catégorielles

In [39]:
X_train_wo_ouliers = remove_outliers_IR(X_train)
y_train_wo_outliers = y_train.loc[X_train_wo_ouliers.index]
y_train_wo_outliers.reset_index(drop=True, inplace=True)

ft_prepro_wo_outliers_log = ColumnTransformer([('log_num_ft', FunctionTransformer(log_tr), log_features_ind),
                                               ('std_num_ft', StandardScaler(), std_features_ind),
                                               ('one_hot_cat_ft', cat_ft_pipeline, cat_features_ind)
                                              ])

X_train_wo_outliers_tr_log = ft_prepro_wo_outliers_log.fit_transform(X_train_wo_ouliers)

df_ft_log_ft_raw_wo_out = cross_val_models_raw(X_train_wo_outliers_tr_log, y_train_wo_outliers)
df_ft_log_ft_raw_wo_out

Unnamed: 0_level_0,MAE,MSE,RMSE,adj. R²,RMSLE,TT(sec),Val_score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GradientBoosting,1277555.9,6616951707013.689,2360437.27,0.36,1.68,0.19,23642230000000.0
Ridge,1335056.32,6543133541514.64,2346897.27,0.36,1.77,0.0,27267020000000.0
Lasso,1359726.73,6600769725594.52,2368804.23,0.35,1.82,0.12,30482850000000.0
RandomForest,1326923.97,7099710743410.08,2467004.14,0.29,1.69,0.67,32239730000000.0
XGBoost,1364952.45,6823016577079.09,2448120.81,0.29,1.7,0.13,33300460000000.0
OrthoMatchingPursuit,1395628.75,6907956442420.71,2406476.89,0.33,1.84,0.0,34456670000000.0
KNeighbors,1396564.2,7505190465834.24,2549410.04,0.24,1.69,0.01,42370420000000.0
ElasticNet,1524862.45,7984969934759.741,2650402.81,0.18,1.78,0.01,71139380000000.0
LightGBM,1576719.24,7862613207435.97,2650229.08,0.16,1.74,0.07,79070830000000.0
Huber,1388019.02,8623394932519.619,2753412.31,0.12,1.7,0.15,92041410000000.0


#### Pré-traitement des données n°3
   - Normalisation de toutes les variables numériques
   - Tableau disjonctif complet des variables catégorielles

In [40]:
ft_prepro_std = ColumnTransformer([('std_num_ft', StandardScaler(), numerical_features_ind),
                                   ('one_hot_cat_ft', cat_ft_pipeline, cat_features_ind)
                                  ])

X_train_tr_std = ft_prepro_std.fit_transform(X_train)

ohenc_col_names_std = ft_prepro_std.named_transformers_['one_hot_cat_ft']['one_hot_cat_ft'].get_feature_names()
tr_col_names_std = list(num_features) + list(ohenc_col_names_std)

df_ft_std_ft_raw = cross_val_models_raw(X_train_tr_std, y_train)
df_ft_std_ft_raw

Unnamed: 0_level_0,MAE,MSE,RMSE,adj. R²,RMSLE,TT(sec),Val_score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GradientBoosting,3608481.85,114093882164347.23,9600547.06,0.6,1.57,0.57,142320900000000.0
RandomForest,3529778.89,133386502044703.84,10082735.09,0.57,1.52,1.82,144257400000000.0
XGBoost,3791559.28,130987620059016.31,10431859.38,0.56,1.55,0.32,169689500000000.0
HistGradientBoosting,4376215.03,170012502422215.2,11749361.13,0.49,1.6,6.32,268631400000000.0
LightGBM,4457792.87,178310098108258.2,12090056.4,0.41,1.59,0.18,332321600000000.0
Huber,4017118.42,237733001394824.7,13639615.59,0.39,1.56,0.05,341901800000000.0
KNeighbors,4240068.83,213858561453318.44,13073535.61,0.4,1.6,0.04,354769200000000.0
Ridge,4254467.66,136234504236982.5,10511383.61,0.42,1.83,0.01,356580800000000.0
OrthoMatchingPursuit,4319982.97,147474128506737.56,10953802.85,0.42,1.8,0.01,365041900000000.0
ElasticNet,4504593.21,199413432700099.56,12657722.34,0.4,1.71,0.02,416815000000000.0


#### Pré-traitement des données n°4
   - Suppression des lignes comportant des valeurs aberrantes au sens du critère de l'inter-quartile
   - Normalisation de toutes les variables numériques
   - Tableau disjonctif complet des variables catégorielles

In [41]:
X_train_tr_wo_outliers_std = ft_prepro_std.fit_transform(X_train_wo_ouliers)

df_ft_std_ft_raw_wo_out = cross_val_models_raw(X_train_tr_wo_outliers_std, y_train_wo_outliers)
df_ft_std_ft_raw_wo_out

Unnamed: 0_level_0,MAE,MSE,RMSE,adj. R²,RMSLE,TT(sec),Val_score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Ridge,1330096.23,6496465314642.67,2339050.75,0.37,1.77,0.0,26343140000000.0
GradientBoosting,1297220.5,6838521709209.91,2415146.3,0.33,1.69,0.26,27115440000000.0
Lasso,1347636.1,6553872340246.9,2356224.79,0.36,1.79,0.14,28261350000000.0
OrthoMatchingPursuit,1395784.44,6802290744028.98,2390716.0,0.34,1.81,0.0,32153230000000.0
XGBoost,1364818.91,6782603403718.67,2441875.5,0.29,1.7,0.18,33212260000000.0
RandomForest,1338936.41,7229311869172.589,2496053.73,0.27,1.69,0.71,35352780000000.0
KNeighbors,1413695.63,7475786185775.699,2564125.03,0.22,1.7,0.01,47617900000000.0
ElasticNet,1508268.56,7920496777287.509,2640459.42,0.18,1.78,0.0,70101240000000.0
Huber,1385938.82,8530987052834.99,2737558.12,0.13,1.71,0.03,85340710000000.0
HistGradientBoosting,1628302.12,8078827536617.269,2684081.67,0.14,1.8,2.01,101145800000000.0


> Nous pouvons observer que le traitement des valeurs aberrantes réduit considérablement la **RMSE** (ratio 1:4 pour le modèle le plus performant).

## Valeur cible passée au logarithme

In [42]:
target_prepro_log = Pipeline([('target_log', FunctionTransformer(log_tr))])

y_train_tr_log = target_prepro_log.fit_transform(y_train)

y_train_wo_outliers_tr_log = target_prepro_log.fit_transform(y_train_wo_outliers)

In [43]:
def cross_val_models_log(X_train, outliers=False):
    if outliers:
        y_train = y_train_wo_outliers_tr_log
    else:
        y_train = y_train_tr_log
    
    df = pd.DataFrame(columns=['MAE', 'MSE', 'RMSE', 'adj. R²', 'RMSLE', 'TT(sec)'],
                      index=[model for model in models_to_train.keys()]
                     )
    df.index.name = 'Model'
    for mdl_name, model in models_to_train.items():
        
        dict_results = {
                       'MAE':[],
                       'MSE':[], 
                       'RMSE':[],
                       'adj. R²':[],
                       'RMSLE':[],
                       'TT(sec)':[]
                      }
        
        for train_index, test_index in kfold.split(X_train):
            mdl = clone(model)

            start_fit_time = timeit.default_timer()
            mdl.fit(X_train[train_index], y_train[train_index])
            fit_duration =  np.round(timeit.default_timer() - start_fit_time, 2)
            dict_results['TT(sec)'].append(fit_duration)

            y_pred_log = mdl.predict(X_train[test_index])

            for mtr_name, metric in metrics.items():
                score = metric(y_true=np.abs(inv_log_tr(y_train)[test_index]), y_pred=np.abs(inv_log_tr(y_pred_log)))
                if mtr_name in ('RMSE','RMSLE'):
                    score = np.sqrt(score)
                elif mtr_name == 'adj. R²':
                    score = 1-((len(X_train)-1)/(len(X_train)-k-1))*(1-score)
                dict_results[mtr_name].append(score)
        
        for mtr_name, scores in dict_results.items():            
            df.loc[mdl_name, mtr_name] = np.round(np.mean(dict_results[mtr_name]),2)
            
        if df.loc[mdl_name, 'adj. R²'] != 0:
            df.loc[mdl_name, 'Val_score'] = np.round(df.loc[mdl_name, 'MAE']*df.loc[mdl_name, 'RMSE']*df.loc[mdl_name, 'RMSLE']**2/np.abs(df.loc[mdl_name, 'adj. R²']),2)
        else:
            df.loc[mdl_name, 'Val_score'] = np.nan
        
    return df.sort_values(by='Val_score', ascending=True)

#### Pré-traitement des données n°5
   - Passage aux logarithmes de certaines variables numériques pour rapprocher leur distribution d'une distribution normale
   - Normalisation des autres variables numériques
   - Tableau disjonctif complet des variables catégorielles

In [44]:
df_ft_log_tgt_log = cross_val_models_log(X_train_tr_log)
df_ft_log_tgt_log

Unnamed: 0_level_0,MAE,MSE,RMSE,adj. R²,RMSLE,TT(sec),Val_score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
OrthoMatchingPursuit,15163927.58,1.4492160293089645e+17,134574532.36,-2130.35,1.57,0.01,2361146000000.0
Ridge,5894554.03,5940948376744959.0,34474991.68,-84.92,1.54,0.02,5675271000000.0
Huber,3255336.1,110343494289244.42,9663883.21,0.51,1.48,0.39,135114100000000.0
XGBoost,3519960.14,112400280427405.0,9627674.66,0.66,1.68,0.26,144921800000000.0
GradientBoosting,3641288.11,144296920448639.7,11037424.06,0.56,1.62,0.65,188349600000000.0
RandomForest,3669189.23,182630623293115.84,11688641.34,0.53,1.57,1.78,199460800000000.0
KNeighbors,4098441.98,276965793896177.1,14323271.36,0.35,1.59,0.04,424020900000000.0
LightGBM,4338932.47,241516943397011.6,13910861.91,0.34,1.57,0.14,437579900000000.0
HistGradientBoosting,4438992.81,241123420010373.2,14068185.09,0.32,1.58,7.06,487176900000000.0
ElasticNet,5018719.77,312624258815764.06,15942840.76,0.14,1.64,0.01,1537157000000000.0


#### Pré-traitement des données n°6
   - Suppression des lignes comportant des valeurs aberrantes au sens du critère de l'inter-quartile
   - Passage aux logarithmes de certaines variables numériques pour rapprocher leur distribution d'une distribution normale
   - Normalisation des autres variables numériques
   - Tableau disjonctif complet des variables catégorielles

In [45]:
df_ft_log_tgt_log_wo_out = cross_val_models_log(X_train_wo_outliers_tr_log, outliers=True)
df_ft_log_tgt_log_wo_out

Unnamed: 0_level_0,MAE,MSE,RMSE,adj. R²,RMSLE,TT(sec),Val_score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Huber,1210052.74,6714513358949.7,2384562.66,0.33,1.65,0.21,23804930000000.0
XGBoost,1325451.78,7050705531345.91,2444482.7,0.29,1.78,0.13,35399160000000.0
RandomForest,1307812.56,7608260965615.299,2530460.6,0.26,1.7,0.61,36784900000000.0
GradientBoosting,1328798.22,7764012430170.679,2526437.97,0.26,1.81,0.24,42301080000000.0
Ridge,1427754.07,7991933013163.878,2626322.87,0.17,1.71,0.0,64497790000000.0
OrthoMatchingPursuit,1448024.66,8390557928049.92,2694885.13,0.14,1.71,0.0,81504280000000.0
Lasso,1634702.9,11048735787811.96,3165026.58,-0.19,1.76,0.0,84350550000000.0
ElasticNet,1635123.65,11070185919562.66,3168510.47,-0.19,1.76,0.0,84465140000000.0
LightGBM,1620409.16,10506275506203.06,3034312.85,-0.1,1.69,0.07,140429500000000.0
KNeighbors,1487022.21,8818784935862.32,2814340.59,0.06,1.81,0.01,228507300000000.0


#### Pré-traitement des données n°7
   - Normalisation de toutes les variables numériques
   - Tableau disjonctif complet des variables catégorielles

In [46]:
df_ft_std_tgt_log = cross_val_models_log(X_train_tr_std)
df_ft_std_tgt_log

Unnamed: 0_level_0,MAE,MSE,RMSE,adj. R²,RMSLE,TT(sec),Val_score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
OrthoMatchingPursuit,32034880.29,3.877791940057964e+17,303187132.68,-5126.4,1.67,0.01,5283897000000.0
Ridge,12116737.46,1.60452848875872e+16,82297541.66,-94.7,1.63,0.01,27976780000000.0
Huber,47548877.42,3.485398291463583e+17,345061581.26,-732.05,1.66,0.33,61760710000000.0
XGBoost,3538577.52,112834136418032.05,9622744.15,0.66,1.67,0.31,143885400000000.0
GradientBoosting,3637236.78,144815390287188.5,11065190.95,0.56,1.62,0.55,188613400000000.0
RandomForest,3697389.84,184467682232644.7,11840636.6,0.51,1.58,1.5,214296100000000.0
LightGBM,4353261.96,248918253605781.0,14059932.5,0.35,1.57,0.14,431051600000000.0
HistGradientBoosting,4440182.51,241133513341064.47,14068681.39,0.32,1.58,5.74,487324700000000.0
KNeighbors,4252065.88,279605128977805.6,14419548.92,0.35,1.7,0.04,506269100000000.0
Lasso,6176042.95,374728244548743.1,17800455.55,-0.12,1.88,0.01,3237993000000000.0


#### Pré-traitement des données n°8
   - Suppression des lignes comportant des valeurs aberrantes au sens du critère de l'inter-quartile
   - Normalisation de toutes les variables numériques
   - Tableau disjonctif complet des variables catégorielles

In [47]:
df_ft_std_tgt_log_wo_out = cross_val_models_log(X_train_tr_wo_outliers_std, outliers=True)
df_ft_std_tgt_log_wo_out

Unnamed: 0_level_0,MAE,MSE,RMSE,adj. R²,RMSLE,TT(sec),Val_score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Huber,1256860.08,6969140161066.62,2407975.31,0.31,1.7,0.18,28214680000000.0
XGBoost,1322892.12,7194290530765.449,2455033.07,0.28,1.77,0.15,36338770000000.0
RandomForest,1311958.92,7573842586181.529,2531526.28,0.25,1.7,0.53,38393750000000.0
GradientBoosting,1324407.59,7744137210451.269,2528394.02,0.26,1.81,0.18,42193950000000.0
Ridge,1425786.28,7726057914547.18,2581827.02,0.2,1.71,0.0,53820010000000.0
OrthoMatchingPursuit,1437495.92,8285947262107.1,2667499.29,0.17,1.71,0.0,65955990000000.0
ElasticNet,1634702.9,11048735787811.96,3165026.58,-0.19,1.76,0.0,84350550000000.0
Lasso,1634702.9,11048735787811.96,3165026.58,-0.19,1.76,0.0,84350550000000.0
KNeighbors,1480956.68,8771194140562.719,2806548.57,0.07,1.8,0.01,192380900000000.0
LightGBM,1619739.33,9837777781258.947,2968357.09,-0.06,1.69,0.07,228867100000000.0


## Valeur cible normalisée

In [48]:
target_prepro_std = Pipeline([('target_std', StandardScaler())])

y_train_tr_std = target_prepro_std.fit_transform(pd.DataFrame(y_train))

y_train_tr_wo_outliers_std = target_prepro_std.fit_transform(pd.DataFrame(y_train_wo_outliers))

In [49]:
def cross_val_models_std(X_train, outliers=False):
    if outliers:
        y_train = y_train_tr_wo_outliers_std
    else:
        y_train = y_train_tr_std
    
    df = pd.DataFrame(columns=['MAE', 'MSE', 'RMSE', 'adj. R²', 'RMSLE', 'TT(sec)'],
                      index=[model for model in models_to_train.keys()]
                     )
    df.index.name = 'Model'
    for mdl_name, model in models_to_train.items():
        
        dict_results = {
                       'MAE':[],
                       'MSE':[], 
                       'RMSE':[],
                       'adj. R²':[],
                       'RMSLE':[],
                       'TT(sec)':[]
                      }
        
        for train_index, test_index in kfold.split(X_train):
            mdl = clone(model)

            start_fit_time = timeit.default_timer()
            mdl.fit(X_train[train_index], y_train[train_index])
            fit_duration =  np.round(timeit.default_timer() - start_fit_time, 2)
            dict_results['TT(sec)'].append(fit_duration)

            y_pred_std = mdl.predict(X_train[test_index])

            for mtr_name, metric in metrics.items():
                score = metric(y_true=np.abs(target_prepro_std.inverse_transform(y_train)[test_index]), 
                               y_pred=np.abs(target_prepro_std.inverse_transform(y_pred_std)))
                if mtr_name in ('RMSE','RMSLE'):
                    score = np.sqrt(score)
                elif mtr_name == 'adj. R²':
                    score = 1-((len(X_train)-1)/(len(X_train)-k-1))*(1-score)
                dict_results[mtr_name].append(score)
        
        for mtr_name, scores in dict_results.items():            
            df.loc[mdl_name, mtr_name] = np.round(np.mean(dict_results[mtr_name]),2)
        
        if df.loc[mdl_name, 'adj. R²'] != 0:
            df.loc[mdl_name, 'Val_score'] = np.round(df.loc[mdl_name, 'MAE']*df.loc[mdl_name, 'RMSE']*df.loc[mdl_name, 'RMSLE']**2/np.abs(df.loc[mdl_name, 'adj. R²']),2)
        else:
            df.loc[mdl_name, 'Val_score'] = np.nan
        
        
    return df.sort_values(by='Val_score', ascending=True)

#### Pré-traitement des données n°9
   - Passage aux logarithmes de certaines variables numériques pour rapprocher leur distribution d'une distribution normale
   - Normalisation des autres variables numériques
   - Tableau disjonctif complet des variables catégorielles

In [50]:
df_ft_log_tgt_std = cross_val_models_std(X_train_tr_log)
df_ft_log_tgt_std

Unnamed: 0_level_0,MAE,MSE,RMSE,adj. R²,RMSLE,TT(sec),Val_score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
RandomForest,587202.19,3677079026284.9697,1677728.63,0.59,0.26,2.76,112876600000.0
GradientBoosting,598430.93,3104661721502.6494,1590686.63,0.61,0.27,0.51,113761800000.0
XGBoost,625374.41,4348375734306.94,1842326.92,0.56,0.27,0.32,149984500000.0
Huber,618472.53,5311789284737.75,2022328.84,0.51,0.28,0.32,192272900000.0
KNeighbors,655339.22,5198310131996.54,2005768.07,0.5,0.29,0.04,221091900000.0
LightGBM,736869.39,4821866483294.38,1983519.4,0.45,0.32,0.17,332594000000.0
HistGradientBoosting,729468.5,4693681070609.68,1952316.88,0.49,0.36,7.22,376674100000.0
OrthoMatchingPursuit,750871.72,4108187867985.25,1859576.2,0.39,0.35,0.0,438582400000.0
Ridge,759067.37,3888455170393.5,1810575.92,0.38,0.37,0.0,495127300000.0
ElasticNet,1075223.92,8380870507864.83,2579209.78,0.21,0.45,0.0,2674184000000.0


#### Pré-traitement des données n°10
   - Suppression des lignes comportant des valeurs aberrantes au sens du critère de l'inter-quartile
   - Passage aux logarithmes de certaines variables numériques pour rapprocher leur distribution d'une distribution normale
   - Normalisation des autres variables numériques
   - Tableau disjonctif complet des variables catégorielles

In [51]:
df_ft_log_tgt_std_wo_out = cross_val_models_std(X_train_wo_outliers_tr_log, outliers=True)
df_ft_log_tgt_std_wo_out

Unnamed: 0_level_0,MAE,MSE,RMSE,adj. R²,RMSLE,TT(sec),Val_score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GradientBoosting,1273955.85,6580584432838.77,2353358.06,0.37,1.68,0.19,22869630000000.0
Huber,1247744.85,6631021704822.51,2351088.38,0.37,1.73,0.21,23729320000000.0
Ridge,1335056.32,6543133541514.63,2346897.27,0.36,1.77,0.0,27267020000000.0
RandomForest,1330560.32,7161045988124.29,2472836.4,0.29,1.69,0.69,32404500000000.0
OrthoMatchingPursuit,1395628.75,6907956442420.71,2406476.89,0.33,1.84,0.0,34456670000000.0
XGBoost,1354807.6,6895748577456.52,2447939.83,0.29,1.74,0.15,34624130000000.0
KNeighbors,1396564.2,7505190465834.24,2549410.04,0.24,1.69,0.01,42370420000000.0
LightGBM,1576719.24,7862613220193.679,2650229.08,0.16,1.74,0.09,79070830000000.0
HistGradientBoosting,1628602.76,8081152429519.36,2684408.63,0.14,1.8,2.0,101176800000000.0
ElasticNet,1816233.04,9715579150849.088,2973188.5,-0.06,1.86,0.0,311364200000000.0


#### Pré-traitement des données n°11
   - Normalisation de toutes les variables numériques
   - Tableau disjonctif complet des variables catégorielles

In [52]:
df_ft_std_tgt_std = cross_val_models_std(X_train_tr_std)
df_ft_std_tgt_std

Unnamed: 0_level_0,MAE,MSE,RMSE,adj. R²,RMSLE,TT(sec),Val_score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Huber,568530.49,3275450728198.6294,1613898.81,0.61,0.27,0.38,109654800000.0
GradientBoosting,597517.1,3138525984947.57,1591615.51,0.61,0.27,0.61,113654400000.0
RandomForest,591334.26,3663948555792.78,1682420.31,0.58,0.26,1.88,115954100000.0
XGBoost,631008.92,4424924287215.13,1872801.3,0.54,0.27,0.33,159536800000.0
OrthoMatchingPursuit,732918.61,4076661164044.58,1822001.8,0.42,0.33,0.01,346244700000.0
HistGradientBoosting,729413.25,4693596955329.12,1952315.05,0.49,0.36,5.79,376645200000.0
KNeighbors,704185.32,5898679070289.67,2171236.43,0.4,0.32,0.04,391411900000.0
LightGBM,747107.23,4930812794380.66,2012101.69,0.41,0.33,0.16,399279400000.0
Ridge,738186.13,3783793868201.96,1754645.92,0.41,0.36,0.0,409427000000.0
ElasticNet,1143847.07,8669412188728.08,2625186.0,0.17,0.48,0.01,4069693000000.0


#### Pré-traitement des données n°12
   - Suppression des lignes comportant des valeurs aberrantes au sens du critère de l'inter-quartile
   - Normalisation de toutes les variables numériques
   - Tableau disjonctif complet des variables catégorielles

In [53]:
df_ft_std_tgt_std_wo_out = cross_val_models_std(X_train_tr_wo_outliers_std, outliers=True)
df_ft_std_tgt_std_wo_out

Unnamed: 0_level_0,MAE,MSE,RMSE,adj. R²,RMSLE,TT(sec),Val_score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GradientBoosting,1291821.04,6761703756536.31,2397592.44,0.34,1.68,0.18,25710900000000.0
Huber,1268805.79,6766971227655.38,2369928.02,0.35,1.73,0.19,25713100000000.0
Ridge,1330096.23,6496465314642.68,2339050.75,0.37,1.77,0.0,26343140000000.0
XGBoost,1340619.37,6690373648755.97,2413023.43,0.31,1.74,0.15,31593940000000.0
OrthoMatchingPursuit,1395784.44,6802290744028.98,2390716.0,0.34,1.81,0.0,32153230000000.0
RandomForest,1333134.82,7150910405417.94,2479518.57,0.28,1.68,0.67,33319770000000.0
KNeighbors,1413695.63,7475786185775.699,2564125.03,0.22,1.7,0.01,47617900000000.0
HistGradientBoosting,1628302.12,8078827533152.11,2684081.67,0.14,1.8,2.0,101145800000000.0
LightGBM,1601732.77,8114402834851.13,2687012.24,0.14,1.83,0.08,102951800000000.0
ElasticNet,1816233.04,9715579150849.088,2973188.5,-0.06,1.86,0.0,311364200000000.0


### Conclusion de l'évalutation des différentes pré-traitement de données

Les meilleurs résultats sont obtenus dans la configuration où **la valeur cible est normalisée**.

Les 2 modèles les plus prometteurs sont alors :

   1. **RandomForest** avec :
       - Passage aux logarithmes de certaines variables numériques pour rapprocher leur distribution d'une distribution normale
       - Normalisation des autres variables numériques
       - Tableau disjonctif complet des variables catégorielles


   2. **GradientBoosting** avec :
       - Normalisation de toutes les variables numériques
       - Tableau disjonctif complet des variables catégorielles

Nous allons désormais ajutser les hyperparamètres de ces 2 modèles dans les configurations décrites précédemment pour obtenir les meilleures performances possibles.

# Entraînement des modèles prometteurs

In [54]:
from sklearn.model_selection import RandomizedSearchCV

In [55]:
def val_score(estimator, X_test, y_test):
    y_pred = estimator.predict(X_test)
    y_true=y_test.copy()
    v = mean_absolute_error(y_true, y_pred)*mean_squared_error(y_true, y_pred)*(mean_squared_log_error(np.abs(y_true), np.abs(y_pred))**2)/np.abs(r2_score(y_true, y_pred))
    return v

In [56]:
params_gbr = {'loss' : ['squared_error', 'absolute_error', 'huber'],
              'learning_rate': list((np.logspace(-2,1,6))),
              'n_estimators' : list(range(100,1000,300)),
              'max_depth' : list(range(1,10)),
              'min_samples_split' : list(range(1,1000,150))
}

params_rfr = {'n_estimators' : list(range(100,1000,300)),
              'max_depth' : list(range(1,1000,150)),
              'min_samples_split' : list(range(1,1000,150))
             }

In [64]:
gbr_search = RandomizedSearchCV(gbr, params_gbr, n_iter=50, scoring='neg_mean_squared_error', random_state=SEED)
rfr_search = RandomizedSearchCV(rfr, params_rfr, n_iter=50, scoring='neg_root_mean_squared_error', random_state=SEED)

In [65]:
gbr_search.fit(X_train_tr_std, y_train_tr_std)
gbr_search.best_params_

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [59]:
from unittest.mock import patch
with patch("sklearn.utils.validation._assert_all_finite"):
    gbr_search.fit(X_train_tr_std, y_train_tr_std)

In [60]:
gbr_search.best_params_

{'n_estimators': 700,
 'min_samples_split': 751,
 'max_depth': 1,
 'loss': 'huber',
 'learning_rate': 0.15848931924611134}

In [61]:
best_gbr = GradientBoostingRegressor(**gbr_search.best_params_, random_state=SEED)

In [None]:
params_gbr = {'loss' : ['squared_error', 'absolute_error', 'huber'],
              'learning_rate': list((np.logspace(-1,0,6))),
              'n_estimators' : list(range(500,1000,100)),
              'max_depth' : list(range(1,10)),
              'min_samples_split' : list(range(300,500,20))
}
gbr_search = RandomizedSearchCV(gbr, params_gbr, n_iter=20, scoring='neg_root_mean_squared_error', random_state=SEED)
gbr_search.fit(X_train_tr_std, y_train_tr_std)

In [None]:
gbr_search.best_params_

In [None]:
params_gbr = {'loss' : ['huber'],
              'learning_rate': list((np.logspace(-1,0,10))),
              'n_estimators' : list(range(700,900,10)),
              'max_depth' : [1],
              'min_samples_split' : list(range(400,500,20))
}
gbr_search = RandomizedSearchCV(gbr, params_gbr, n_iter=20, scoring='neg_root_mean_squared_error', random_state=SEED)
gbr_search.fit(X_train_tr_std, y_train_tr_std)

In [None]:
gbr_search.best_params_

In [None]:
best_gbr = GradientBoostingRegressor(**{'n_estimators': 790,
                                        'min_samples_split': 460,
                                        'max_depth': 1,
                                        'loss': 'huber',
                                         'learning_rate': 0.5994842503189409
                                       },
                                     random_state=SEED)

In [62]:
models_to_train['best_gbr']=best_gbr

In [63]:
cross_val_models_std(X_train_tr_std)

Unnamed: 0_level_0,MAE,MSE,RMSE,adj. R²,RMSLE,TT(sec),Val_score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Huber,568530.49,3275450728198.6294,1613898.81,0.61,0.27,0.51,109654800000.0
GradientBoosting,597517.1,3138525984947.57,1591615.51,0.61,0.27,0.65,113654400000.0
RandomForest,591334.26,3663948555792.78,1682420.31,0.58,0.26,2.19,115954100000.0
best_gbr,579157.88,3821637410166.32,1767200.02,0.56,0.26,2.38,123549600000.0
XGBoost,631008.92,4424924287215.13,1872801.3,0.54,0.27,0.42,159536800000.0
OrthoMatchingPursuit,732918.61,4076661164044.58,1822001.8,0.42,0.33,0.01,346244700000.0
HistGradientBoosting,729413.25,4693596955329.12,1952315.05,0.49,0.36,5.75,376645200000.0
KNeighbors,704185.32,5898679070289.67,2171236.43,0.4,0.32,0.05,391411900000.0
LightGBM,747107.23,4930812794380.66,2012101.69,0.41,0.33,0.22,399279400000.0
Ridge,738186.13,3783793868201.96,1754645.92,0.41,0.36,0.01,409427000000.0


In [None]:
plot_learning_curve(
        best_gbr,
        "learning_curves",
        X_train_tr_std,
        y_train_tr_std,
        #axes=axes[:,i],
        cv=kfold,
        n_jobs=4,
        scoring="neg_root_mean_squared_error",
        train_sizes=np.linspace(0.2, 1.0, 10)
    )

In [None]:
rfr_search.fit(X_train_tr_log, y_train_tr_std)

In [None]:
rfr_search.best_params_

In [None]:
rfr_search.best_score_

In [None]:
params_rfr = {'n_estimators' : list(range(600,1000,100)),
              'max_depth' : list(range(400,600,50)),
              'min_samples_split' : list(range(100,300,50))
             }
rfr_search = RandomizedSearchCV(rfr, params_rfr, n_iter=20, scoring='neg_root_mean_squared_error', random_state=SEED)
rfr_search.fit(X_train_tr_log, y_train_tr_std)

In [None]:
rfr_search.best_params_, rfr_search.best_score_

In [None]:
best_rfr = RandomForestRegressor(**rfr_search.best_params_, random_state=SEED)

In [None]:
models_to_train['best_rfr']=best_rfr

In [None]:
cross_val_models_std(X_train_tr_log)

In [None]:
plot_learning_curve(
        best_rfr,
        "learning_curves",
        X_train_tr_std,
        y_train_tr_std,
        #axes=axes[:,i],
        cv=kfold,
        n_jobs=4,
        scoring="neg_root_mean_squared_error",
        train_sizes=np.linspace(0.2, 1.0, 10)
    )

In [None]:
def learning_curves_models(X_train, y_train):
    for mdl_name, model in models_to_train.items():

        N, train_score, val_score = learning_curve(model, X_train, y_train,
                                                   cv=kfold, scoring='neg_root_mean_squared_error',
                                                   train_sizes=np.linspace(0.1, 1, 10))
        
        plt.suptitle(f"Courbes d'apprentissage de {mdl_name}")
        plt.plot(N, train_score.mean(axis=1), label='train score')
        plt.plot(N, val_score.mean(axis=1), label='validation score')
        plt.legend()
        plt.show()  

In [None]:
#gbr
learning_curves_models(X_train_tr_log, y_train_tr_std)

In [None]:
#rfr
learning_curves_models(X_train_tr_std, y_train_tr_std)

In [None]:
params_huber = {'epsilon':np.logspace(0,1,10),
                'max_iter':list(range(10,500,50)),
                'alpha':np.logspace(-3,1,10),
                'warm_start':[True,False]
}

hub_search = RandomizedSearchCV(huber, params_huber, n_iter=50, scoring='neg_root_mean_squared_error', random_state=SEED)

hub_search.fit(X_train_tr_std, y_train_tr_std)

hub_search.best_params_, hub_search.best_score_

In [None]:
params_huber = {'epsilon':np.logspace(0,1,20),
                'max_iter':list(range(10,500,10)),
                'alpha':np.logspace(-3,-2,10)
}

hub_search = RandomizedSearchCV(huber, params_huber, n_iter=50, scoring='neg_root_mean_squared_error', random_state=SEED)

hub_search.fit(X_train_tr_std, y_train_tr_std)

hub_search.best_params_, hub_search.best_score_

In [None]:
best_hub = HuberRegressor(**hub_search.best_params_)

In [None]:
models_to_train['best_hub'] = best_hub

In [None]:
cross_val_models_std(X_train_tr_std)

In [None]:
plot_learning_curve(
        best_hub,
        "learning_curves",
        X_train_tr_std,
        y_train_tr_std,
        #axes=axes[:,i],
        cv=kfold,
        n_jobs=4,
        scoring="neg_root_mean_squared_error",
        train_sizes=np.linspace(0.2, 1.0, 10)
    )

In [None]:
#rfr, huber
learning_curves_models(X_train_tr_std, y_train_tr_std)

In [None]:
opt_rfr = clone(best_rfr)

In [None]:
opt_rfr.fit(X_train_tr_std, y_train_tr_std)

In [None]:
from sklearn.inspection import permutation_importance
def plot_features_importance(estimator, name_model, X_train, y_train, scoring=None):

    results = permutation_importance(estimator, X_train, y_train, scoring=scoring)
    
    df_importance = pd.DataFrame({
                        "Feature" : X_train.columns,
                        "Importance" : results.importances_mean
                    })
    
    df_importance = df_importance.sort_values("Importance").iloc[:20,:]
      
    fig = plt.subplots(figsize=(10, 8))
    
    plot = sns.barplot(data=df_importance, y=df_importance["Feature"], x=df_importance["Importance"])
    
    plt.title(name_model + " Features Importance", fontdict={ "fontsize": 16, "fontweight": "normal" })
    plt.xlabel("Importance")
    plt.ylabel("Features")
    plt.tight_layout()
    plt.savefig("img/" + name_model + "-feature-importance.png")
    plt.show()

In [None]:
opt_gbr = clone(best_gbr)
opt_gbr.fit(X_train_tr_log, y_train_tr_std)
plot_features_importance(opt_gbr, "gradient_boosting", pd.DataFrame(X_train_tr_log, columns=tr_col_names), y_train_tr_std, scoring='neg_root_mean_squared_error')

In [None]:
sns.histplot(X_train_tr_std[:, np.argmax(opt_rfr.feature_importances_)])

In [None]:
for col in df.drop(columns=conso).select_dtypes(include=np.number).columns:
    plt.figure()
    plt.title(col)
    sns.histplot(StandardScaler().fit_transform(pd.DataFrame(df[col])))

In [None]:
feature_importance = opt_rfr.feature_importances_
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + 0.5
fig = plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.barh(pos, feature_importance[sorted_idx], align="center")
plt.yticks(pos, np.array(X_train_tr_std.columns)[sorted_idx])
plt.title("Feature Importance (MDI)")

result = permutation_importance(
    reg, ft_prepro_std.fit_transform(X_test), target_prepro_std.fit_transform(y_test), n_repeats=10, random_state=42, n_jobs=2
)
sorted_idx = result.importances_mean.argsort()
plt.subplot(1, 2, 2)
plt.boxplot(
    result.importances[sorted_idx].T,
    vert=False,
    labels=np.array(X_train_tr_std.columns)[sorted_idx],
)
plt.title("Permutation Importance (test set)")
fig.tight_layout()
plt.show()