In [50]:
import os
import pandas as pd

pd.options.plotting.backend = 'plotly'
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn import metrics
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, \
                             GradientBoostingRegressor

from Pélec_04_fonctions import reg_modelGrid, visuRMSEGrid, compareGridModels


In [51]:
write_data = True

if write_data is True:
    try:
        os.mkdir("./Figures/")
    except OSError as error:
        print(error)
    try:
        os.mkdir("./Tableaux/")
    except OSError as error:
        print(error)
else:
    print("""Visualisation uniquement dans le notebook
    pas de création de figures ni de tableaux""")


[Errno 17] File exists: './Figures/'
[Errno 17] File exists: './Tableaux/'


In [52]:
BEBNum = pd.read_csv('BEBNum.csv')

BEBNumM = BEBNum.drop(columns=['SiteEnergyUse(kBtu)', 'TotalGHGEmissions'])
SiteEnergyUse = np.array(BEBNum['SiteEnergyUse(kBtu)']).reshape(-1, 1)
TotalGHGEmissions = np.array(BEBNum.TotalGHGEmissions).reshape(-1, 1)

BEBNumM_train, BEBNumM_test, TotalGHGEmissions_train, TotalGHGEmissions_test = train_test_split(
    BEBNumM, TotalGHGEmissions, test_size=.2)

score = 'neg_root_mean_squared_error'


In [53]:
# Scaler moins sensible aux outlier d'après la doc
scaler = RobustScaler(quantile_range=(10, 90))


# 1. Modèle de prédiction sur les émissions (TotalGHGEmissions)
## 1.1 Avec les données numériques uniquement
### 1.1.1 Émissions brutes

#### 1.1.1.1 Modèle LinearRegression

In [54]:
# modèle régression linéaire
pipeLR = make_pipeline(scaler, LinearRegression())

pipeLR.fit(BEBNumM_train, TotalGHGEmissions_train)

TotalGHGEmissions_predLR = pipeLR.predict(BEBNumM_test)

LRr2 = metrics.r2_score(TotalGHGEmissions_test, TotalGHGEmissions_predLR)
print("r2 :", LRr2)
LRrmse = metrics.mean_squared_error(TotalGHGEmissions_test,
                                    TotalGHGEmissions_predLR,
                                    squared=False)
print("rmse :", LRrmse)

fig = px.scatter(
    x=TotalGHGEmissions_predLR.squeeze(),
    y=TotalGHGEmissions_test.squeeze(),
    labels={
        'x': f'{TotalGHGEmissions_predLR=}'.partition('=')[0],
        'y': f'{TotalGHGEmissions_test=}'.partition('=')[0]
    },
    title=
    "Visualisation des données d'émissions prédites par le modèle de régression linéaire<br>vs les données test"
)
fig.show()


r2 : 0.4146166994852526
rmse : 438.4765696853244


#### 1.1.1.2 Comparaison de différents modèles sur les émissions brutes

In [55]:
paramlistEmissions = [{
    'ridge__alpha': np.logspace(1, 5, 100)
}, {
    'lasso__alpha': np.logspace(1, 3, 100)
}, {
    'elasticnet__alpha': np.logspace(0, 3, 100),
    'elasticnet__l1_ratio': np.linspace(0.1, 1, 6)
}, {
    'kneighborsregressor__n_neighbors':
    np.linspace(1, 100, dtype=int)
}, {
    'randomforestregressor__n_estimators':
    np.logspace(0, 3, 10, dtype=int),
    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
}, {
    'adaboostregressor__n_estimators':
    np.logspace(0, 2, 30, dtype=int),
    'adaboostregressor__loss': ['linear', 'square', 'exponential']
}, {
    'gradientboostingregressor__n_estimators':
    np.logspace(2, 4, 5, dtype=int),
    'gradientboostingregressor__loss':
    ['squared_error', 'absolute_error', 'huber', 'quantile']
}]

ResultEmissions = compareGridModels([
    Ridge(),
    Lasso(),
    ElasticNet(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor()
], scaler, BEBNumM_train, BEBNumM_test, TotalGHGEmissions_train,
                                    TotalGHGEmissions_test,
                                    'TotalGHGEmissions', paramlistEmissions,
                                    score, write_data, 'Emissions')


      paramètre      Ridge()
0  ridge__alpha  2420.128265
               R²        RMSE         MAE      MAE%  FitTime(s)
Ridge()  0.240059  499.592779  150.208727  6.008821    1.612584


      paramètre    Lasso()
0  lasso__alpha  84.975344
               R²        RMSE         MAE      MAE%  FitTime(s)
Lasso()  0.284396  484.799883  148.842087  5.377628    1.659963


              paramètre  ElasticNet()
0     elasticnet__alpha       86.9749
1  elasticnet__l1_ratio        1.0000
                    R²        RMSE         MAE      MAE%  FitTime(s)
ElasticNet()  0.283837  484.989191  148.903674  5.399393    9.643551


                          paramètre  KNeighborsRegressor()
0  kneighborsregressor__n_neighbors                      5
                             R²        RMSE        MAE      MAE%  FitTime(s)
KNeighborsRegressor()  0.435979  430.401429  124.66919  2.964036    0.760583


                             paramètre RandomForestRegressor()
0  randomforestregressor__n_estimators                     215
1  randomforestregressor__max_features                    sqrt
                               R²        RMSE        MAE      MAE%  FitTime(s)
RandomForestRegressor()  0.658131  335.085852  84.134856  1.682706   43.441333


                         paramètre AdaBoostRegressor()
0  adaboostregressor__n_estimators                   3
1          adaboostregressor__loss         exponential
                           R²        RMSE         MAE      MAE%  FitTime(s)
AdaBoostRegressor()  0.459726  421.243493  144.901397  5.333968    7.961578


                                 paramètre GradientBoostingRegressor()
0  gradientboostingregressor__n_estimators                        3162
1          gradientboostingregressor__loss                       huber
                                   R²        RMSE        MAE      MAE%  \
GradientBoostingRegressor()  0.289153  483.185849  80.406485  1.238636   

                             FitTime(s)  
GradientBoostingRegressor()  290.032203  


### 1.1.2 Émissions au log

In [56]:
TotalGHGEmissions_train_log = np.log(TotalGHGEmissions_train)
TotalGHGEmissions_test_log = np.log(TotalGHGEmissions_test)


#### 1.1.2.1 Modèle LinearRegression

In [57]:
# modèle régression linéaire
pipeLR = make_pipeline(scaler, LinearRegression())

pipeLR.fit(BEBNumM_train, TotalGHGEmissions_train_log)

TotalGHGEmissions_pred_logLR = pipeLR.predict(BEBNumM_test)

LRr2_log = metrics.r2_score(TotalGHGEmissions_test_log,
                            TotalGHGEmissions_pred_logLR)
print("r2 :", LRr2)
LRrmse_log = metrics.mean_squared_error(TotalGHGEmissions_test_log,
                                        TotalGHGEmissions_pred_logLR,
                                        squared=False)
print("rmse :", LRrmse)

fig = px.scatter(
    x=TotalGHGEmissions_pred_logLR.squeeze(),
    y=TotalGHGEmissions_test_log.squeeze(),
    labels={
        'x': f'{TotalGHGEmissions_pred_logLR=}'.partition('=')[0],
        'y': f'{TotalGHGEmissions_test_log=}'.partition('=')[0]
    },
    title=
    "Visualisation des données d'émissions prédites par le modèle de régression linéaire<br>vs les données test"
)
fig.show()

r2 : 0.4146166994852526
rmse : 438.4765696853244


#### 1.1.2.2 Comparaison des modèles sur les émissions au log

In [58]:
paramlistEmissions_log = [{
    'ridge__alpha': np.logspace(3, 5, 100)
}, {
    'lasso__alpha': np.logspace(-2, 0, 100)
}, {
    'elasticnet__alpha': np.logspace(-1, 1, 10),
    'elasticnet__l1_ratio': np.linspace(0.1, 1, 6)
}, {
    'kneighborsregressor__n_neighbors':
    np.linspace(1, 100, dtype=int)
}, {
    'randomforestregressor__n_estimators':
    np.logspace(0, 3, 10, dtype=int),
    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
}, {
    'adaboostregressor__n_estimators':
    np.logspace(0, 2, 30, dtype=int),
    'adaboostregressor__loss': ['linear', 'square', 'exponential']
}, {
    'gradientboostingregressor__n_estimators':
    np.logspace(3, 4, 5, dtype=int),
    'gradientboostingregressor__loss':
    ['squared_error', 'absolute_error', 'huber', 'quantile']
}]

ResultEmissions_log = compareGridModels([
    Ridge(),
    Lasso(),
    ElasticNet(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor()
], scaler, BEBNumM_train, BEBNumM_test, TotalGHGEmissions_train_log,
                                        TotalGHGEmissions_test_log,
                                        'TotalGHGEmissions_log',
                                        paramlistEmissions_log, score,
                                        write_data, 'Emissions', '_log')


      paramètre      Ridge()
0  ridge__alpha  5590.810183
               R²        RMSE         MAE      MAE%  FitTime(s)
Ridge()  0.174313  578.897735  130.845493  2.322326    1.492748


      paramètre   Lasso()
0  lasso__alpha  0.312572
               R²        RMSE         MAE    MAE%  FitTime(s)
Lasso()  0.141368  573.351293  129.818125  2.4286    1.614154


              paramètre  ElasticNet()
0     elasticnet__alpha       1.29155
1  elasticnet__l1_ratio       0.10000
                   R²        RMSE         MAE      MAE%  FitTime(s)
ElasticNet()  0.16403  573.991775  129.854984  2.350116    0.849378


                          paramètre  KNeighborsRegressor()
0  kneighborsregressor__n_neighbors                      1
                             R²        RMSE        MAE      MAE%  FitTime(s)
KNeighborsRegressor()  0.630331  588.693105  77.713396  0.689444    0.730538


                             paramètre RandomForestRegressor()
0  randomforestregressor__n_estimators                    1000
1  randomforestregressor__max_features                    log2
                               R²        RMSE        MAE      MAE%  FitTime(s)
RandomForestRegressor()  0.712429  455.410308  80.984653  0.668744   37.086066


                         paramètre AdaBoostRegressor()
0  adaboostregressor__n_estimators                  12
1          adaboostregressor__loss         exponential
                           R²       RMSE         MAE      MAE%  FitTime(s)
AdaBoostRegressor()  0.326553  473.73503  119.147633  1.434593    7.989286


                                 paramètre GradientBoostingRegressor()
0  gradientboostingregressor__n_estimators                        3162
1          gradientboostingregressor__loss                       huber
                                   R²        RMSE        MAE    MAE%  \
GradientBoostingRegressor()  0.700418  377.264666  63.273009  0.6509   

                             FitTime(s)  
GradientBoostingRegressor()  439.731457  


In [59]:
EmissionsScores = pd.DataFrame().append(
    [val for key, val in ResultEmissions.items() if key.startswith('Score')])


In [60]:
EmissionsScoresLog = pd.DataFrame().append([
    val for key, val in ResultEmissions_log.items() if key.startswith('Score')
]).rename('{}_log'.format)


In [61]:
EmissionsCompareScores = EmissionsScores.append(EmissionsScoresLog)
if write_data is True:
    EmissionsCompareScores.to_latex('./Tableaux/EmissionsScoresModèles.tex')
EmissionsCompareScores


Unnamed: 0,R²,RMSE,MAE,MAE%,FitTime(s)
Ridge(),0.240059,499.592779,150.208727,6.008821,1.612584
Lasso(),0.284396,484.799883,148.842087,5.377628,1.659963
ElasticNet(),0.283837,484.989191,148.903674,5.399393,9.643551
KNeighborsRegressor(),0.435979,430.401429,124.66919,2.964036,0.760583
RandomForestRegressor(),0.658131,335.085852,84.134856,1.682706,43.441333
AdaBoostRegressor(),0.459726,421.243493,144.901397,5.333968,7.961578
GradientBoostingRegressor(),0.289153,483.185849,80.406485,1.238636,290.032203
Ridge()_log,0.174313,578.897735,130.845493,2.322326,1.492748
Lasso()_log,0.141368,573.351293,129.818125,2.4286,1.614154
ElasticNet()_log,0.16403,573.991775,129.854984,2.350116,0.849378


In [62]:
fig = make_subplots(len(EmissionsScores.columns),
                    2,
                    column_titles=("Émissions brutes", "Émissions log"),
                    row_titles=(EmissionsScores.columns.to_list()),
                    shared_xaxes=True)
for r, c in enumerate(EmissionsScores):
    fig.add_trace(go.Bar(x=EmissionsScores.index, y=EmissionsScores[c]),
                  row=r + 1,
                  col=1)
    fig.add_trace(go.Bar(x=EmissionsScoresLog.index, y=EmissionsScoresLog[c]),
                  row=r + 1,
                  col=2)
fig.update_layout(title_text="Comparaison des scores des modèles d'émissions",
                  showlegend=False,
                  height=700)
fig.show()
if write_data is True:
    fig.write_image('./Figures/EmissionsCompareScores.pdf', height=700)


Afin de voir si l'energy star score permet d'améliorer le modèle nous allons
voir si le meilleurs modèle est amélioré avec cette variable.
Je choisi d'utiliser le modèle RandomForestRegressor avec la variable brute
car c'est le modèle ayant le rapport erreur / temps de calcul le plus intéressant

In [77]:
BEBESSNum = pd.read_csv('BEBESSNum.csv')

BEBESSNumM = BEBESSNum.drop(
    columns={'SiteEnergyUse(kBtu)', 'TotalGHGEmissions'})
SiteEnergyUseESS = np.array(BEBESSNum['SiteEnergyUse(kBtu)']).reshape(-1, 1)
TotalGHGEmissionsESS = np.array(BEBESSNum.TotalGHGEmissions).reshape(-1, 1)

BEBESSNumM_train, BEBESSNumM_test, TotalGHGEmissionsESS_train, TotalGHGEmissionsESS_test = train_test_split(
    BEBESSNumM, TotalGHGEmissionsESS, test_size=.2)


In [78]:
BestParamEmissionsRF = ResultEmissions[
    'BestParamRandomForestRegressor'].set_index('paramètre')
paramlistEmissionsESS = [{
    'randomforestregressor__n_estimators': [
        int(BestParamEmissionsRF.
            loc['randomforestregressor__n_estimators'].values)
    ],
    'randomforestregressor__max_features': [
        *BestParamEmissionsRF.loc[
            'randomforestregressor__max_features', :].values
    ]
}]
ResultEmissionsESS = compareGridModels([RandomForestRegressor()],
                                           scaler,
                                           BEBESSNumM_train,
                                           BEBESSNumM_test,
                                           TotalGHGEmissionsESS_train,
                                           TotalGHGEmissionsESS_test,
                                           'TotalGHGEmissionsESS',
                                           paramlistEmissionsESS,
                                           score,
                                           write_data=write_data,
                                           prefix='EmissionsESS',
                                           suffix='_ESS',
                                           plotfigRMSE=False)


                             paramètre RandomForestRegressor()
0  randomforestregressor__n_estimators                     215
1  randomforestregressor__max_features                    sqrt
                               R²        RMSE        MAE      MAE%  FitTime(s)
RandomForestRegressor()  0.833797  278.067684  84.239689  1.316669    0.761941


In [79]:
EmissionsScoresESS = pd.DataFrame().append([
    val for key, val in ResultEmissionsESS.items()
    if key.startswith('Score')
]).rename('{}_ESS'.format)
CompareScoresESS = EmissionsScores.append(EmissionsScoresESS).drop(
    columns=('FitTime(s)')).loc[[
        'RandomForestRegressor()', 'RandomForestRegressor()_ESS'
    ]]


In [80]:
fig = make_subplots(1,
                    len(CompareScoresESS.columns),
                    column_titles=(CompareScoresESS.columns.to_list()))
for c, col in enumerate(CompareScoresESS.columns):
    fig.add_trace(go.Bar(x=CompareScoresESS.index, y=CompareScoresESS[col]),
                  row=1,
                  col=c + 1)
fig.update_layout(
    title_text="Comparaison avec et sans ajout de l'energy score stars",
    showlegend=False)
fig.show()
if write_data is True:
    fig.write_image('./Figures/EmissionsCompareScoresESS.pdf')


# 2. Modèle de prédiction sur la consommation énergétique (SiteEnergyUse)
## 2.1 Avec les données numériques uniquement
### 2.1.1 Consommation énergétique brute

In [67]:
BEBNumM_train, BEBNumM_test, SiteEnergyUse_train, SiteEnergyUse_test = train_test_split(
    BEBNumM, SiteEnergyUse, test_size=.2)


#### 2.1.1.1 Modèle LinearRegression

In [68]:
#modèle régression linéaire
pipeLR = make_pipeline(scaler, LinearRegression())

pipeLR.fit(BEBNumM_train, SiteEnergyUse_train)

SiteEnergyUse_predLR = pipeLR.predict(BEBNumM_test)

LRr2 = metrics.r2_score(SiteEnergyUse_test, SiteEnergyUse_predLR)
print("r2 :", LRr2)
LRrmse = metrics.mean_squared_error(SiteEnergyUse_test,
                                    SiteEnergyUse_predLR,
                                    squared=False)
print("rmse :", LRrmse)

fig = px.scatter(
    x=SiteEnergyUse_predLR.squeeze(),
    y=SiteEnergyUse_test.squeeze(),
    labels={
        'x': f'{SiteEnergyUse_predLR=}'.partition('=')[0],
        'y': f'{SiteEnergyUse_test=}'.partition('=')[0]
    },
    title=
    'Visualisation des données de consommation prédites par le modèle de régression linéaire<br>vs les données test'
)
fig.show()

r2 : 0.4015468209143027
rmse : 8996830.185284536


#### 2.1.1.2 Comparaison des modèles sur la consommation

In [69]:
paramlistConso = [{
    'ridge__alpha': np.logspace(-3, 5, 100)
}, {
    'lasso__alpha': np.logspace(0.1, 3, 100)
}, {
    'elasticnet__alpha': np.logspace(-3, 3, 200),
    'elasticnet__l1_ratio': np.linspace(0.1, 1, 6)
}, {
    'kneighborsregressor__n_neighbors':
    np.linspace(1, 100, dtype=int)
}, {
    'randomforestregressor__n_estimators':
    np.logspace(0, 3, 10, dtype=int),
    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
}, {
    'adaboostregressor__n_estimators':
    np.logspace(0, 2, 30, dtype=int),
    'adaboostregressor__loss': ['linear', 'square', 'exponential']
}, {
    'gradientboostingregressor__n_estimators':
    np.logspace(2, 3, 5, dtype=int),
    'gradientboostingregressor__loss':
    ['squared_error', 'absolute_error', 'huber', 'quantile']
}]

ResultConso = compareGridModels([
    Ridge(),
    Lasso(),
    ElasticNet(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor()
], scaler, BEBNumM_train, BEBNumM_test, SiteEnergyUse_train,
                                SiteEnergyUse_test, 'SiteEnergyUse',
                                paramlistConso, score, write_data, 'Conso')


      paramètre     Ridge()
0  ridge__alpha  453.487851
               R²          RMSE           MAE     MAE%  FitTime(s)
Ridge()  0.553741  7.769051e+06  3.995052e+06  2.56022      1.4399


      paramètre  Lasso()
0  lasso__alpha   1000.0
               R²          RMSE           MAE      MAE%  FitTime(s)
Lasso()  0.402305  8.991128e+06  4.454530e+06  2.589305    2.114775


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

              paramètre  ElasticNet()
0     elasticnet__alpha      0.258262
1  elasticnet__l1_ratio      0.100000
                    R²          RMSE           MAE      MAE%  FitTime(s)
ElasticNet()  0.560727  7.708001e+06  3.987924e+06  2.584162    30.30994


                          paramètre  KNeighborsRegressor()
0  kneighborsregressor__n_neighbors                      3
                             R²          RMSE           MAE      MAE%  \
KNeighborsRegressor()  0.581117  7.526982e+06  3.529971e+06  1.867113   

                       FitTime(s)  
KNeighborsRegressor()    0.780642  


                             paramètre RandomForestRegressor()
0  randomforestregressor__n_estimators                     100
1  randomforestregressor__max_features                    log2
                             R²          RMSE           MAE      MAE%  \
RandomForestRegressor()  0.6108  7.255391e+06  2.598061e+06  1.316996   

                         FitTime(s)  
RandomForestRegressor()   38.388691  


                         paramètre AdaBoostRegressor()
0  adaboostregressor__n_estimators                   6
1          adaboostregressor__loss              square
                           R²          RMSE           MAE    MAE%  FitTime(s)
AdaBoostRegressor()  0.314499  9.628939e+06  5.797225e+06  5.1992    7.691021


                                 paramètre GradientBoostingRegressor()
0  gradientboostingregressor__n_estimators                        1000
1          gradientboostingregressor__loss               squared_error
                                   R²          RMSE           MAE      MAE%  \
GradientBoostingRegressor()  0.529428  7.977880e+06  2.620681e+06  1.439122   

                             FitTime(s)  
GradientBoostingRegressor()   47.078152  


### 2.1.2 Consommation énergétique au log

In [70]:
SiteEnergyUse_train_log = np.log(SiteEnergyUse_train)
SiteEnergyUse_test_log = np.log(SiteEnergyUse_test)


#### 2.1.2.1 Modèle LinearRegression

In [71]:
# modèle régression linéaire
pipeLR = make_pipeline(scaler, LinearRegression())

pipeLR.fit(BEBNumM_train, SiteEnergyUse_train_log)

SiteEnergyUse_pred_logLR = pipeLR.predict(BEBNumM_test)

LRr2_log = metrics.r2_score(SiteEnergyUse_test_log, SiteEnergyUse_pred_logLR)
print("r2 :", LRr2)
LRrmse_log = metrics.mean_squared_error(SiteEnergyUse_test_log,
                                        SiteEnergyUse_pred_logLR,
                                        squared=False)
print("rmse :", LRrmse)

fig = px.scatter(
    x=SiteEnergyUse_pred_logLR.squeeze(),
    y=SiteEnergyUse_test_log.squeeze(),
    labels={
        'x': f'{SiteEnergyUse_pred_logLR=}'.partition('=')[0],
        'y': f'{SiteEnergyUse_test_log=}'.partition('=')[0]
    },
    title=
    'Visualisation des données de consommation prédites par le modèle de régression linéaire<br>vs les données test'
)
fig.show()

r2 : 0.4015468209143027
rmse : 8996830.185284536


#### 2.1.2.2 Comparaison des modèles sur la consommation au log

In [72]:
paramlistConso_log = [{
    'ridge__alpha': np.logspace(1, 4, 100)
}, {
    'lasso__alpha': np.logspace(-3, 0, 100)
}, {
    'elasticnet__alpha': np.logspace(-3, 1, 100),
    'elasticnet__l1_ratio': np.linspace(0.1, 1, 6)
}, {
    'kneighborsregressor__n_neighbors':
    np.linspace(1, 100, dtype=int)
}, {
    'randomforestregressor__n_estimators':
    np.logspace(0, 3, 10, dtype=int),
    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
}, {
    'adaboostregressor__n_estimators':
    np.logspace(0, 2, 30, dtype=int),
    'adaboostregressor__loss': ['linear', 'square', 'exponential']
}, {
    'gradientboostingregressor__n_estimators':
    np.logspace(1, 4, 5, dtype=int),
    'gradientboostingregressor__loss':
    ['squared_error', 'absolute_error', 'huber', 'quantile']
}]

ResultConso_log = compareGridModels([
    Ridge(),
    Lasso(),
    ElasticNet(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor()
], scaler, BEBNumM_train, BEBNumM_test, SiteEnergyUse_train_log,
                                    SiteEnergyUse_test_log,
                                    'SiteEnergyUse_log', paramlistConso_log,
                                    score, write_data, 'Conso', '_log')


      paramètre      Ridge()
0  ridge__alpha  4037.017259
               R²          RMSE           MAE      MAE%  FitTime(s)
Ridge()  0.278128  2.916855e+07  5.687962e+06  1.713915    1.527384


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


      paramètre   Lasso()
0  lasso__alpha  0.132194
               R²          RMSE           MAE      MAE%  FitTime(s)
Lasso()  0.294138  5.359543e+07  6.719967e+06  1.682276    2.339123


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


              paramètre  ElasticNet()
0     elasticnet__alpha      0.890215
1  elasticnet__l1_ratio      0.100000
                    R²          RMSE           MAE     MAE%  FitTime(s)
ElasticNet()  0.280229  2.826691e+07  5.637489e+06  1.71525   13.486726


                          paramètre  KNeighborsRegressor()
0  kneighborsregressor__n_neighbors                      1
                             R²          RMSE           MAE      MAE%  \
KNeighborsRegressor()  0.741688  7.027762e+06  1.937477e+06  0.563715   

                       FitTime(s)  
KNeighborsRegressor()     0.79535  


                             paramètre RandomForestRegressor()
0  randomforestregressor__n_estimators                     464
1  randomforestregressor__max_features                    log2
                               R²          RMSE           MAE      MAE%  \
RandomForestRegressor()  0.763197  5.567256e+06  2.241155e+06  0.680038   

                         FitTime(s)  
RandomForestRegressor()   35.556921  


                         paramètre AdaBoostRegressor()
0  adaboostregressor__n_estimators                  23
1          adaboostregressor__loss              linear
                           R²          RMSE           MAE      MAE%  \
AdaBoostRegressor()  0.548577  8.636250e+06  3.734695e+06  1.207142   

                     FitTime(s)  
AdaBoostRegressor()    8.367555  


                                 paramètre GradientBoostingRegressor()
0  gradientboostingregressor__n_estimators                        1778
1          gradientboostingregressor__loss               squared_error
                                  R²          RMSE           MAE      MAE%  \
GradientBoostingRegressor()  0.75444  9.090850e+06  2.573856e+06  0.721289   

                             FitTime(s)  
GradientBoostingRegressor()  261.813331  


In [73]:
ConsoScores = pd.DataFrame().append(
    [val for key, val in ResultConso.items() if key.startswith('Score')])


In [74]:
ConsoScoresLog = pd.DataFrame().append([
    val for key, val in ResultConso_log.items() if key.startswith('Score')
]).rename('{}_log'.format)


In [75]:
ConsoCompareScores = ConsoScores.append(ConsoScoresLog)
if write_data is True:
    ConsoCompareScores.to_latex('./Tableaux/ConsoScoresModèles.tex')
ConsoCompareScores


Unnamed: 0,R²,RMSE,MAE,MAE%,FitTime(s)
Ridge(),0.553741,7769051.0,3995052.0,2.56022,1.4399
Lasso(),0.402305,8991128.0,4454530.0,2.589305,2.114775
ElasticNet(),0.560727,7708001.0,3987924.0,2.584162,30.30994
KNeighborsRegressor(),0.581117,7526982.0,3529971.0,1.867113,0.780642
RandomForestRegressor(),0.6108,7255391.0,2598061.0,1.316996,38.388691
AdaBoostRegressor(),0.314499,9628939.0,5797225.0,5.1992,7.691021
GradientBoostingRegressor(),0.529428,7977880.0,2620681.0,1.439122,47.078152
Ridge()_log,0.278128,29168550.0,5687962.0,1.713915,1.527384
Lasso()_log,0.294138,53595430.0,6719967.0,1.682276,2.339123
ElasticNet()_log,0.280229,28266910.0,5637489.0,1.71525,13.486726


In [76]:
fig = make_subplots(len(ConsoScores.columns),
                    2,
                    column_titles=("Consommation brute", "Consommation log"),
                    row_titles=(ConsoScores.columns.to_list()),
                    shared_xaxes=True)
for r, c in enumerate(ConsoScores):
    fig.add_trace(go.Bar(x=ConsoScores.index, y=ConsoScores[c]),
                  row=r + 1,
                  col=1)
    fig.add_trace(go.Bar(x=ConsoScoresLog.index, y=ConsoScoresLog[c]),
                  row=r + 1,
                  col=2)
fig.update_layout(
    title_text="Comparaison des scores des modèles de consommation",
    showlegend=False,
    height=700)
fig.show()
if write_data is True:
    fig.write_image('./Figures/ConsoCompareScores.pdf', height=700)
