In [1]:
import os
import pandas as pd

pd.options.plotting.backend = 'plotly'
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn import metrics
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, \
                             GradientBoostingRegressor

from Pélec_04_fonctions import reg_modelGrid, visuRMSEGrid, compareGridModels


In [2]:
write_data = True

if write_data is True:
    try:
        os.mkdir("./Figures/")
    except OSError as error:
        print(error)
    try:
        os.mkdir("./Tableaux/")
    except OSError as error:
        print(error)
else:
    print("""Visualisation uniquement dans le notebook
    pas de création de figures ni de tableaux""")


[Errno 17] File exists: './Figures/'
[Errno 17] File exists: './Tableaux/'


In [3]:
BEBNum = pd.read_csv('BEBNum.csv')

BEBNumM = BEBNum.drop(columns=['SiteEnergyUse(kBtu)', 'TotalGHGEmissions'])
SiteEnergyUse = np.array(BEBNum['SiteEnergyUse(kBtu)']).reshape(-1, 1)
TotalGHGEmissions = np.array(BEBNum.TotalGHGEmissions).reshape(-1, 1)

BEBNumM_train, BEBNumM_test, TotalGHGEmissions_train, TotalGHGEmissions_test = train_test_split(
    BEBNumM, TotalGHGEmissions, test_size=.2)

score = 'neg_root_mean_squared_error'


In [4]:
# Scaler moins sensible aux outlier d'après la doc
scaler = RobustScaler(quantile_range=(10, 90))


# 1. Modèle de prédiction sur les émissions (TotalGHGEmissions)
## 1.1 Avec les données numériques uniquement
### 1.1.1 Émissions brutes

#### 1.1.1.1 Modèle LinearRegression

In [5]:
# modèle régression linéaire
pipeLR = make_pipeline(scaler, LinearRegression())

pipeLR.fit(BEBNumM_train, TotalGHGEmissions_train)

TotalGHGEmissions_predLR = pipeLR.predict(BEBNumM_test)

LRr2 = metrics.r2_score(TotalGHGEmissions_test, TotalGHGEmissions_predLR)
print("r2 :", LRr2)
LRrmse = metrics.mean_squared_error(TotalGHGEmissions_test,
                                    TotalGHGEmissions_predLR,
                                    squared=False)
print("rmse :", LRrmse)

fig = px.scatter(
    x=TotalGHGEmissions_predLR.squeeze(),
    y=TotalGHGEmissions_test.squeeze(),
    labels={
        'x': f'{TotalGHGEmissions_predLR=}'.partition('=')[0],
        'y': f'{TotalGHGEmissions_test=}'.partition('=')[0]
    },
    title=
    "Visualisation des données d'émissions prédites par le modèle de régression linéaire<br>vs les données test"
)
fig.show()
if write_data is True:
    fig.write_image('./Figures/EmissionsLR.pdf')


r2 : 0.37316476657598696
rmse : 717.3827369672941


#### 1.1.1.2 Comparaison de différents modèles sur les émissions brutes

In [6]:
paramlistEmissions = [{
    'ridge__alpha': np.logspace(1, 5, 100)
}, {
    'lasso__alpha': np.logspace(1, 3, 100)
}, {
    'elasticnet__alpha': np.logspace(0, 3, 100),
    'elasticnet__l1_ratio': np.linspace(0.1, 1, 6)
}, {
    'kneighborsregressor__n_neighbors':
    np.linspace(1, 100, dtype=int)
}, {
    'randomforestregressor__n_estimators':
    np.logspace(0, 3, 10, dtype=int),
    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
}, {
    'adaboostregressor__n_estimators':
    np.linspace(1, 100, dtype=int),
    'adaboostregressor__loss': ['linear', 'square', 'exponential']
}, {
    'gradientboostingregressor__n_estimators':
    np.logspace(2, 4, 5, dtype=int),
    'gradientboostingregressor__loss':
    ['squared_error', 'absolute_error', 'huber', 'quantile']
}]

ResultEmissions = compareGridModels([
    Ridge(),
    Lasso(),
    ElasticNet(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor()
], scaler, BEBNumM_train, BEBNumM_test, TotalGHGEmissions_train,
                                    TotalGHGEmissions_test,
                                    'TotalGHGEmissions', paramlistEmissions,
                                    score, write_data, 'Emissions')


      paramètre      Ridge()
0  ridge__alpha  1668.100537
               R²        RMSE         MAE     MAE%  FitTime(s)
Ridge()  0.250334  784.527445  192.353257  6.71731    0.013076


      paramètre    Lasso()
0  lasso__alpha  64.280731
               R²        RMSE         MAE      MAE%  FitTime(s)
Lasso()  0.269245  774.568874  193.789913  6.031395    0.013693


              paramètre  ElasticNet()
0     elasticnet__alpha     65.793322
1  elasticnet__l1_ratio      1.000000
                    R²        RMSE         MAE      MAE%  FitTime(s)
ElasticNet()  0.268499  774.964307  193.830826  6.051734    0.023589


                          paramètre  KNeighborsRegressor()
0  kneighborsregressor__n_neighbors                      3
                             R²        RMSE         MAE      MAE%  FitTime(s)
KNeighborsRegressor()  0.315976  749.393604  175.542269  2.973326    0.020369


                             paramètre RandomForestRegressor()
0  randomforestregressor__n_estimators                      46
1  randomforestregressor__max_features                    log2
                               R²        RMSE         MAE      MAE%  \
RandomForestRegressor()  0.424608  687.315539  135.009202  2.237234   

                         FitTime(s)  
RandomForestRegressor()    0.251053  


                         paramètre AdaBoostRegressor()
0  adaboostregressor__n_estimators                  17
1          adaboostregressor__loss         exponential
                           R²        RMSE         MAE      MAE%  FitTime(s)
AdaBoostRegressor()  0.437154  679.780713  210.345997  6.896707    0.106484


                                 paramètre GradientBoostingRegressor()
0  gradientboostingregressor__n_estimators                         316
1          gradientboostingregressor__loss                       huber
                                   R²        RMSE         MAE      MAE%  \
GradientBoostingRegressor()  0.411904  694.861843  153.056354  3.024407   

                             FitTime(s)  
GradientBoostingRegressor()    2.809264  


### 1.1.2 Émissions au log

In [7]:
TotalGHGEmissions_train_log = np.log(TotalGHGEmissions_train)
TotalGHGEmissions_test_log = np.log(TotalGHGEmissions_test)


#### 1.1.2.1 Modèle LinearRegression

In [8]:
# modèle régression linéaire
pipeLR = make_pipeline(scaler, LinearRegression())

pipeLR.fit(BEBNumM_train, TotalGHGEmissions_train_log)

TotalGHGEmissions_pred_logLR = pipeLR.predict(BEBNumM_test)

LRr2_log = metrics.r2_score(TotalGHGEmissions_test_log,
                            TotalGHGEmissions_pred_logLR)
print("r2 :", LRr2)
LRrmse_log = metrics.mean_squared_error(TotalGHGEmissions_test_log,
                                        TotalGHGEmissions_pred_logLR,
                                        squared=False)
print("rmse :", LRrmse)

fig = px.scatter(
    x=TotalGHGEmissions_pred_logLR.squeeze(),
    y=TotalGHGEmissions_test_log.squeeze(),
    labels={
        'x': f'{TotalGHGEmissions_pred_logLR=}'.partition('=')[0],
        'y': f'{TotalGHGEmissions_test_log=}'.partition('=')[0]
    },
    title=
    "Visualisation des données d'émissions prédites par le modèle de régression linéaire<br>vs les données test"
)
fig.show()
if write_data is True:
    fig.write_image('./Figures/EmissionsLR_log.pdf')


r2 : 0.37316476657598696
rmse : 717.3827369672941


#### 1.1.2.2 Comparaison des modèles sur les émissions au log

In [9]:
paramlistEmissions_log = [{
    'ridge__alpha': np.logspace(3, 5, 100)
}, {
    'lasso__alpha': np.logspace(-2, 0, 100)
}, {
    'elasticnet__alpha': np.logspace(-1, 1, 10),
    'elasticnet__l1_ratio': np.linspace(0.1, 1, 6)
}, {
    'kneighborsregressor__n_neighbors':
    np.linspace(1, 100, dtype=int)
}, {
    'randomforestregressor__n_estimators':
    np.logspace(0, 3, 10, dtype=int),
    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
}, {
    'adaboostregressor__n_estimators':
    np.linspace(1, 100, dtype=int),
    'adaboostregressor__loss': ['linear', 'square', 'exponential']
}, {
    'gradientboostingregressor__n_estimators':
    np.logspace(3, 4, 5, dtype=int),
    'gradientboostingregressor__loss':
    ['squared_error', 'absolute_error', 'huber', 'quantile']
}]

ResultEmissions_log = compareGridModels([
    Ridge(),
    Lasso(),
    ElasticNet(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor()
], scaler, BEBNumM_train, BEBNumM_test, TotalGHGEmissions_train_log,
                                        TotalGHGEmissions_test_log,
                                        'TotalGHGEmissions_log',
                                        paramlistEmissions_log, score,
                                        write_data, 'Emissions', '_log')


      paramètre      Ridge()
0  ridge__alpha  5590.810183
               R²        RMSE         MAE      MAE%  FitTime(s)
Ridge()  0.173793  888.306568  182.935063  2.805038    0.016896


      paramètre   Lasso()
0  lasso__alpha  0.343047
               R²        RMSE         MAE      MAE%  FitTime(s)
Lasso()  0.131883  900.773739  182.601502  2.985967      0.0168


              paramètre  ElasticNet()
0     elasticnet__alpha       1.29155
1  elasticnet__l1_ratio       0.10000
                  R²        RMSE         MAE      MAE%  FitTime(s)
ElasticNet()  0.1659  893.724948  180.935718  2.842884    0.020944


                          paramètre  KNeighborsRegressor()
0  kneighborsregressor__n_neighbors                      1
                             R²        RMSE         MAE      MAE%  FitTime(s)
KNeighborsRegressor()  0.591311  662.797302  118.435981  1.260544     0.01537


                             paramètre RandomForestRegressor()
0  randomforestregressor__n_estimators                    1000
1  randomforestregressor__max_features                    log2
                               R²        RMSE         MAE      MAE%  \
RandomForestRegressor()  0.657585  738.017201  132.494709  1.039446   

                         FitTime(s)  
RandomForestRegressor()    4.418134  


                         paramètre AdaBoostRegressor()
0  adaboostregressor__n_estimators                  11
1          adaboostregressor__loss         exponential
                           R²        RMSE         MAE      MAE%  FitTime(s)
AdaBoostRegressor()  0.342788  793.852722  168.492969  1.642511    0.090472


                                 paramètre GradientBoostingRegressor()
0  gradientboostingregressor__n_estimators                        3162
1          gradientboostingregressor__loss               squared_error
                                   R²        RMSE         MAE      MAE%  \
GradientBoostingRegressor()  0.680218  650.169627  117.394626  0.954003   

                             FitTime(s)  
GradientBoostingRegressor()   11.088456  


In [10]:
EmissionsScores = pd.DataFrame().append(
    [val for key, val in ResultEmissions.items() if key.startswith('Score')])


In [11]:
EmissionsScoresLog = pd.DataFrame().append([
    val for key, val in ResultEmissions_log.items() if key.startswith('Score')
]).rename('{}_log'.format)


In [12]:
EmissionsCompareScores = EmissionsScores.append(EmissionsScoresLog)
if write_data is True:
    EmissionsCompareScores.to_latex('./Tableaux/EmissionsScoresModèles.tex')
EmissionsCompareScores


Unnamed: 0,R²,RMSE,MAE,MAE%,FitTime(s)
Ridge(),0.250334,784.527445,192.353257,6.71731,0.013076
Lasso(),0.269245,774.568874,193.789913,6.031395,0.013693
ElasticNet(),0.268499,774.964307,193.830826,6.051734,0.023589
KNeighborsRegressor(),0.315976,749.393604,175.542269,2.973326,0.020369
RandomForestRegressor(),0.424608,687.315539,135.009202,2.237234,0.251053
AdaBoostRegressor(),0.437154,679.780713,210.345997,6.896707,0.106484
GradientBoostingRegressor(),0.411904,694.861843,153.056354,3.024407,2.809264
Ridge()_log,0.173793,888.306568,182.935063,2.805038,0.016896
Lasso()_log,0.131883,900.773739,182.601502,2.985967,0.0168
ElasticNet()_log,0.1659,893.724948,180.935718,2.842884,0.020944


In [13]:
fig = make_subplots(len(EmissionsScores.columns),
                    2,
                    column_titles=("Émissions brutes", "Émissions log"),
                    row_titles=(EmissionsScores.columns.to_list()),
                    shared_xaxes=True)
for r, c in enumerate(EmissionsScores):
    fig.add_trace(go.Bar(x=EmissionsScores.index, y=EmissionsScores[c]),
                  row=r + 1,
                  col=1)
    fig.add_trace(go.Bar(x=EmissionsScoresLog.index, y=EmissionsScoresLog[c]),
                  row=r + 1,
                  col=2)
fig.update_layout(title_text="Comparaison des scores des modèles d'émissions",
                  showlegend=False,
                  height=700)
fig.show()
if write_data is True:
    fig.write_image('./Figures/EmissionsCompareScores.pdf', height=700)


Afin de voir si l'energy star score permet d'améliorer le modèle nous allons
voir si le meilleurs modèle est amélioré avec cette variable.
Je choisi d'utiliser le modèle GradientBoosting avec la variable au log
car c'est le modèle ayant la RMSE la plus faible

In [28]:
BEBESSNum = pd.read_csv('BEBESSNum.csv')

BEBESSNumM = BEBESSNum.drop(
    columns={'SiteEnergyUse(kBtu)', 'TotalGHGEmissions'})
SiteEnergyUseESS = np.array(BEBESSNum['SiteEnergyUse(kBtu)']).reshape(-1, 1)
TotalGHGEmissionsESS = np.array(BEBESSNum.TotalGHGEmissions).reshape(-1, 1)

BEBESSNumM_train, BEBESSNumM_test, TotalGHGEmissionsESS_train, TotalGHGEmissionsESS_test = train_test_split(
    BEBESSNumM, TotalGHGEmissionsESS, test_size=.2)

TotalGHGEmissionsESS_train_log = np.log(TotalGHGEmissionsESS_train)
TotalGHGEmissionsESS_test_log = np.log(TotalGHGEmissionsESS_test)


In [38]:
BestParamEmissionsGB = ResultEmissions[
    'BestParamGradientBoostingRegressor'].set_index('paramètre')
paramlistEmissionsESS = [{
    'gradientboostingregressor__n_estimators': [
        int(BestParamEmissionsGB.
            loc['gradientboostingregressor__n_estimators'].values)
    ],
    'gradientboostingregressor__loss': [
        *BestParamEmissionsGB.loc[
            'gradientboostingregressor__loss', :].values
    ]
}]
ResultEmissionsESS = compareGridModels([GradientBoostingRegressor()],
                                           scaler,
                                           BEBESSNumM_train,
                                           BEBESSNumM_test,
                                           TotalGHGEmissionsESS_train_log,
                                           TotalGHGEmissionsESS_test_log,
                                           'TotalGHGEmissionsESS_log',
                                           paramlistEmissionsESS,
                                           score,
                                           write_data=write_data,
                                           prefix='EmissionsESS',
                                           suffix='_log',
                                           plotfigRMSE=False)


                                 paramètre GradientBoostingRegressor()
0  gradientboostingregressor__n_estimators                         316
1          gradientboostingregressor__loss                       huber
                                   R²       RMSE        MAE     MAE%  \
GradientBoostingRegressor()  0.628744  304.87616  74.723065  1.11451   

                             FitTime(s)  
GradientBoostingRegressor()    1.742993  


In [39]:
EmissionsScoresESS = pd.DataFrame().append([
    val for key, val in ResultEmissionsESS.items()
    if key.startswith('Score')
]).rename('{}_ESS'.format)
CompareScoresESS = EmissionsScores.append(EmissionsScoresESS).drop(
    columns=('FitTime(s)')).loc[[
        'GradientBoostingRegressor()', 'GradientBoostingRegressor()_ESS'
    ]]


In [40]:
fig = make_subplots(1,
                    len(CompareScoresESS.columns),
                    column_titles=(CompareScoresESS.columns.to_list()))
for c, col in enumerate(CompareScoresESS.columns):
    fig.add_trace(go.Bar(x=CompareScoresESS.index, y=CompareScoresESS[col]),
                  row=1,
                  col=c + 1)
fig.update_layout(
    title_text="Comparaison avec et sans ajout de l'energy score stars",
    showlegend=False)
fig.show()
if write_data is True:
    fig.write_image('./Figures/EmissionsCompareScoresESS.pdf')


# 2. Modèle de prédiction sur la consommation énergétique (SiteEnergyUse)
## 2.1 Avec les données numériques uniquement
### 2.1.1 Consommation énergétique brute

In [18]:
BEBNumM_train, BEBNumM_test, SiteEnergyUse_train, SiteEnergyUse_test = train_test_split(
    BEBNumM, SiteEnergyUse, test_size=.2)


#### 2.1.1.1 Modèle LinearRegression

In [19]:
#modèle régression linéaire
pipeLR = make_pipeline(scaler, LinearRegression())

pipeLR.fit(BEBNumM_train, SiteEnergyUse_train)

SiteEnergyUse_predLR = pipeLR.predict(BEBNumM_test)

LRr2 = metrics.r2_score(SiteEnergyUse_test, SiteEnergyUse_predLR)
print("r2 :", LRr2)
LRrmse = metrics.mean_squared_error(SiteEnergyUse_test,
                                    SiteEnergyUse_predLR,
                                    squared=False)
print("rmse :", LRrmse)

fig = px.scatter(
    x=SiteEnergyUse_predLR.squeeze(),
    y=SiteEnergyUse_test.squeeze(),
    labels={
        'x': f'{SiteEnergyUse_predLR=}'.partition('=')[0],
        'y': f'{SiteEnergyUse_test=}'.partition('=')[0]
    },
    title=
    'Visualisation des données de consommation prédites par le modèle de régression linéaire<br>vs les données test'
)
fig.show()
if write_data is True:
    fig.write_image('./Figures/ConsoLR.pdf')


r2 : 0.6431524795929406
rmse : 9741752.355479544


#### 2.1.1.2 Comparaison des modèles sur la consommation

In [20]:
paramlistConso = [{
    'ridge__alpha': np.logspace(-3, 5, 100)
}, {
    'lasso__alpha': np.logspace(0.1, 3, 100)
}, {
    'elasticnet__alpha': np.logspace(-3, 3, 200),
    'elasticnet__l1_ratio': np.linspace(0.1, 1, 6)
}, {
    'kneighborsregressor__n_neighbors':
    np.linspace(1, 100, dtype=int)
}, {
    'randomforestregressor__n_estimators':
    np.logspace(0, 3, 10, dtype=int),
    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
}, {
    'adaboostregressor__n_estimators':
    np.linspace(1, 100, dtype=int),
    'adaboostregressor__loss': ['linear', 'square', 'exponential']
}, {
    'gradientboostingregressor__n_estimators':
    np.logspace(2, 3, 5, dtype=int),
    'gradientboostingregressor__loss':
    ['squared_error', 'absolute_error', 'huber', 'quantile']
}]

ResultConso = compareGridModels([
    Ridge(),
    Lasso(),
    ElasticNet(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor()
], scaler, BEBNumM_train, BEBNumM_test, SiteEnergyUse_train,
                                SiteEnergyUse_test, 'SiteEnergyUse',
                                paramlistConso, score, write_data, 'Conso')


      paramètre     Ridge()
0  ridge__alpha  215.443469
               R²          RMSE           MAE     MAE%  FitTime(s)
Ridge()  0.660724  9.498872e+06  4.216743e+06  1.66218    0.012963


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

      paramètre  Lasso()
0  lasso__alpha   1000.0
               R²          RMSE           MAE      MAE%  FitTime(s)
Lasso()  0.643247  9.740467e+06  4.523528e+06  1.741807    0.029424


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

              paramètre  ElasticNet()
0     elasticnet__alpha      0.120338
1  elasticnet__l1_ratio      0.100000
                    R²          RMSE           MAE      MAE%  FitTime(s)
ElasticNet()  0.660489  9.502169e+06  4.200469e+06  1.664405    0.031856


                          paramètre  KNeighborsRegressor()
0  kneighborsregressor__n_neighbors                      3
                             R²          RMSE           MAE      MAE%  \
KNeighborsRegressor()  0.581805  1.054593e+07  3.852872e+06  0.964003   

                       FitTime(s)  
KNeighborsRegressor()    0.014934  


                             paramètre RandomForestRegressor()
0  randomforestregressor__n_estimators                     464
1  randomforestregressor__max_features                    auto
                              R²          RMSE           MAE      MAE%  \
RandomForestRegressor()  0.76549  7.897253e+06  2.655420e+06  0.646587   

                         FitTime(s)  
RandomForestRegressor()    5.498544  


                         paramètre AdaBoostRegressor()
0  adaboostregressor__n_estimators                   9
1          adaboostregressor__loss         exponential
                           R²          RMSE           MAE      MAE%  \
AdaBoostRegressor()  0.655085  9.577498e+06  4.797313e+06  2.144432   

                     FitTime(s)  
AdaBoostRegressor()    0.057592  


                                 paramètre GradientBoostingRegressor()
0  gradientboostingregressor__n_estimators                        1000
1          gradientboostingregressor__loss               squared_error
                                   R²          RMSE           MAE      MAE%  \
GradientBoostingRegressor()  0.621066  1.003871e+07  2.819213e+06  0.891096   

                             FitTime(s)  
GradientBoostingRegressor()    3.271127  


### 2.1.2 Consommation énergétique au log

In [21]:
SiteEnergyUse_train_log = np.log(SiteEnergyUse_train)
SiteEnergyUse_test_log = np.log(SiteEnergyUse_test)


#### 2.1.2.1 Modèle LinearRegression

In [22]:
# modèle régression linéaire
pipeLR = make_pipeline(scaler, LinearRegression())

pipeLR.fit(BEBNumM_train, SiteEnergyUse_train_log)

SiteEnergyUse_pred_logLR = pipeLR.predict(BEBNumM_test)

LRr2_log = metrics.r2_score(SiteEnergyUse_test_log, SiteEnergyUse_pred_logLR)
print("r2 :", LRr2)
LRrmse_log = metrics.mean_squared_error(SiteEnergyUse_test_log,
                                        SiteEnergyUse_pred_logLR,
                                        squared=False)
print("rmse :", LRrmse)

fig = px.scatter(
    x=SiteEnergyUse_pred_logLR.squeeze(),
    y=SiteEnergyUse_test_log.squeeze(),
    labels={
        'x': f'{SiteEnergyUse_pred_logLR=}'.partition('=')[0],
        'y': f'{SiteEnergyUse_test_log=}'.partition('=')[0]
    },
    title=
    'Visualisation des données de consommation prédites par le modèle de régression linéaire<br>vs les données test'
)
fig.show()
if write_data is True:
    fig.write_image('./Figures/ConsoLR_log.pdf')

r2 : 0.6431524795929406
rmse : 9741752.355479544


#### 2.1.2.2 Comparaison des modèles sur la consommation au log

In [23]:
paramlistConso_log = [{
    'ridge__alpha': np.logspace(1, 4, 100)
}, {
    'lasso__alpha': np.logspace(-3, 0, 100)
}, {
    'elasticnet__alpha': np.logspace(-3, 1, 100),
    'elasticnet__l1_ratio': np.linspace(0.1, 1, 6)
}, {
    'kneighborsregressor__n_neighbors':
    np.linspace(1, 100, dtype=int)
}, {
    'randomforestregressor__n_estimators':
    np.logspace(0, 3, 10, dtype=int),
    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
}, {
    'adaboostregressor__n_estimators':
    np.linspace(1, 100, dtype=int),
    'adaboostregressor__loss': ['linear', 'square', 'exponential']
}, {
    'gradientboostingregressor__n_estimators':
    np.logspace(1, 4, 5, dtype=int),
    'gradientboostingregressor__loss':
    ['squared_error', 'absolute_error', 'huber', 'quantile']
}]

ResultConso_log = compareGridModels([
    Ridge(),
    Lasso(),
    ElasticNet(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor()
], scaler, BEBNumM_train, BEBNumM_test, SiteEnergyUse_train_log,
                                    SiteEnergyUse_test_log,
                                    'SiteEnergyUse_log', paramlistConso_log,
                                    score, write_data, 'Conso', '_log')


      paramètre      Ridge()
0  ridge__alpha  3764.935807
               R²          RMSE           MAE      MAE%  FitTime(s)
Ridge()  0.302363  3.131493e+07  6.430475e+06  1.276291     0.01452


      paramètre   Lasso()
0  lasso__alpha  0.114976
              R²          RMSE           MAE      MAE%  FitTime(s)
Lasso()  0.31829  7.063674e+07  8.470586e+06  1.257347    0.016107


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


              paramètre  ElasticNet()
0     elasticnet__alpha      0.890215
1  elasticnet__l1_ratio      0.100000
                   R²          RMSE           MAE      MAE%  FitTime(s)
ElasticNet()  0.29911  2.783176e+07  6.264807e+06  1.280845    0.018144


                          paramètre  KNeighborsRegressor()
0  kneighborsregressor__n_neighbors                      1
                             R²          RMSE           MAE      MAE%  \
KNeighborsRegressor()  0.763973  6.523064e+06  2.125361e+06  0.403827   

                       FitTime(s)  
KNeighborsRegressor()    0.016894  


                             paramètre RandomForestRegressor()
0  randomforestregressor__n_estimators                    1000
1  randomforestregressor__max_features                    sqrt
                               R²          RMSE           MAE      MAE%  \
RandomForestRegressor()  0.806324  6.699738e+06  2.444802e+06  0.429887   

                         FitTime(s)  
RandomForestRegressor()    5.902729  


                         paramètre AdaBoostRegressor()
0  adaboostregressor__n_estimators                   9
1          adaboostregressor__loss         exponential
                           R²          RMSE           MAE      MAE%  \
AdaBoostRegressor()  0.543412  1.194648e+07  4.171711e+06  0.802644   

                     FitTime(s)  
AdaBoostRegressor()    0.056481  


                                 paramètre GradientBoostingRegressor()
0  gradientboostingregressor__n_estimators                       10000
1          gradientboostingregressor__loss                       huber
                                   R²          RMSE           MAE      MAE%  \
GradientBoostingRegressor()  0.792071  5.284323e+06  1.713174e+06  0.368268   

                             FitTime(s)  
GradientBoostingRegressor()   77.929193  


In [24]:
ConsoScores = pd.DataFrame().append(
    [val for key, val in ResultConso.items() if key.startswith('Score')])


In [25]:
ConsoScoresLog = pd.DataFrame().append([
    val for key, val in ResultConso_log.items() if key.startswith('Score')
]).rename('{}_log'.format)


In [26]:
ConsoCompareScores = ConsoScores.append(ConsoScoresLog)
if write_data is True:
    ConsoCompareScores.to_latex('./Tableaux/ConsoScoresModèles.tex')
ConsoCompareScores


Unnamed: 0,R²,RMSE,MAE,MAE%,FitTime(s)
Ridge(),0.660724,9498872.0,4216743.0,1.66218,0.012963
Lasso(),0.643247,9740467.0,4523528.0,1.741807,0.029424
ElasticNet(),0.660489,9502169.0,4200469.0,1.664405,0.031856
KNeighborsRegressor(),0.581805,10545930.0,3852872.0,0.964003,0.014934
RandomForestRegressor(),0.76549,7897253.0,2655420.0,0.646587,5.498544
AdaBoostRegressor(),0.655085,9577498.0,4797313.0,2.144432,0.057592
GradientBoostingRegressor(),0.621066,10038710.0,2819213.0,0.891096,3.271127
Ridge()_log,0.302363,31314930.0,6430475.0,1.276291,0.01452
Lasso()_log,0.31829,70636740.0,8470586.0,1.257347,0.016107
ElasticNet()_log,0.29911,27831760.0,6264807.0,1.280845,0.018144


In [27]:
fig = make_subplots(len(ConsoScores.columns),
                    2,
                    column_titles=("Consommation brute", "Consommation log"),
                    row_titles=(ConsoScores.columns.to_list()),
                    shared_xaxes=True)
for r, c in enumerate(ConsoScores):
    fig.add_trace(go.Bar(x=ConsoScores.index, y=ConsoScores[c]),
                  row=r + 1,
                  col=1)
    fig.add_trace(go.Bar(x=ConsoScoresLog.index, y=ConsoScoresLog[c]),
                  row=r + 1,
                  col=2)
fig.update_layout(
    title_text="Comparaison des scores des modèles de consommation",
    showlegend=False,
    height=700)
fig.show()
if write_data is True:
    fig.write_image('./Figures/ConsoCompareScores.pdf', height=700)
