In [54]:
import os
import pandas as pd

pd.options.plotting.backend = 'plotly'
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn import metrics
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, \
                             GradientBoostingRegressor

from Pélec_04_fonctions import reg_modelGrid, visuRMSEGrid, compareModels


In [55]:
write_data = True

if write_data is True:
    try:
        os.mkdir("./Figures/")
    except OSError as error:
        print(error)
    try:
        os.mkdir("./Tableaux/")
    except OSError as error:
        print(error)
else:
    print("""Visualisation uniquement dans le notebook
    pas de création de figures ni de tableaux""")


[Errno 17] File exists: './Figures/'
[Errno 17] File exists: './Tableaux/'


In [56]:
BEBNum = pd.read_csv('BEBNum.csv')

BEBNumM = BEBNum.drop(columns=['SiteEnergyUse(kBtu)', 'TotalGHGEmissions'])
SiteEnergyUse = np.array(BEBNum['SiteEnergyUse(kBtu)']).reshape(-1, 1)
TotalGHGEmissions = np.array(BEBNum.TotalGHGEmissions).reshape(-1, 1)

BEBNumM_train, BEBNumM_test, TotalGHGEmissions_train, TotalGHGEmissions_test = train_test_split(
    BEBNumM, TotalGHGEmissions, test_size=.2)

score = 'neg_root_mean_squared_error'


In [57]:
# Scaler moins sensible aux outlier d'après la doc
scaler = RobustScaler(quantile_range=(10, 90))


# 1. Modèle de prédiction sur les émissions (TotalGHGEmissions)
## 1.1 Avec les données numériques uniquement
### 1.1.1 Émissions brutes

#### 1.1.1.1 Modèle LinearRegression

In [58]:
# modèle régression linéaire
pipeLR = make_pipeline(scaler, LinearRegression())

pipeLR.fit(BEBNumM_train, TotalGHGEmissions_train)

TotalGHGEmissions_predLR = pipeLR.predict(BEBNumM_test)

LRr2 = metrics.r2_score(TotalGHGEmissions_test, TotalGHGEmissions_predLR)
print("r2 :", LRr2)
LRrmse = metrics.mean_squared_error(TotalGHGEmissions_test,
                                    TotalGHGEmissions_predLR,
                                    squared=False)
print("rmse :", LRrmse)

fig = px.scatter(
    x=TotalGHGEmissions_predLR.squeeze(),
    y=TotalGHGEmissions_test.squeeze(),
    labels={
        'x': f'{TotalGHGEmissions_predLR=}'.partition('=')[0],
        'y': f'{TotalGHGEmissions_test=}'.partition('=')[0]
    },
    title=
    "Visualisation des données d'émissions prédites par le modèle de régression linéaire<br>vs les données test"
)
fig.show()


r2 : 0.37519125628182937
rmse : 510.6736711343238


#### 1.1.1.2 Comparaison de différents modèles sur les émissions brutes

In [59]:
paramlistEmissions = [{
    'ridge__alpha': np.logspace(1, 5, 100)
}, {
    'lasso__alpha': np.logspace(0, 3, 100)
}, {
    'elasticnet__alpha': np.logspace(0, 3, 100),
    'elasticnet__l1_ratio': np.linspace(0.1, 1, 6)
}, {
    'kneighborsregressor__n_neighbors':
    np.linspace(1, 100, dtype=int)
}, {
    'randomforestregressor__n_estimators':
    np.logspace(0, 3, 10, dtype=int),
    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
}, {
    'adaboostregressor__n_estimators':
    np.logspace(0, 2, 30, dtype=int),
    'adaboostregressor__loss': ['linear', 'square', 'exponential']
}, {
    'gradientboostingregressor__n_estimators':
    np.logspace(0, 3, 5, dtype=int),
    'gradientboostingregressor__loss':
    ['squared_error', 'absolute_error', 'huber', 'quantile']
}]
ResultEmissions = compareModels([
    Ridge(),
    Lasso(),
    ElasticNet(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor()
], RobustScaler(), BEBNumM_train, BEBNumM_test, TotalGHGEmissions_train,
                                TotalGHGEmissions_test, 'TotalGHGEmissions',
                                paramlistEmissions, score, write_data,
                                'Emissions')


      paramètre      Ridge()
0  ridge__alpha  8111.308308
               R²        RMSE         MAE      MAE%
Ridge()  0.313165  535.422041  112.413325  4.334449


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

      paramètre    Lasso()
0  lasso__alpha  53.366992
               R²        RMSE         MAE      MAE%
Lasso()  0.317954  533.551918  111.919887  4.258501


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

              paramètre  ElasticNet()
0     elasticnet__alpha     53.366992
1  elasticnet__l1_ratio      1.000000
                    R²        RMSE         MAE      MAE%
ElasticNet()  0.317954  533.551918  111.919887  4.258501


                          paramètre  KNeighborsRegressor()
0  kneighborsregressor__n_neighbors                      9
                            R²       RMSE        MAE      MAE%
KNeighborsRegressor()  0.36223  515.94321  112.98753  3.126285


                             paramètre RandomForestRegressor()
0  randomforestregressor__n_estimators                      46
1  randomforestregressor__max_features                    log2
                               R²        RMSE        MAE      MAE%
RandomForestRegressor()  0.505988  454.086762  77.991666  1.714609


                         paramètre AdaBoostRegressor()
0  adaboostregressor__n_estimators                   1
1          adaboostregressor__loss              square
                           R²       RMSE         MAE      MAE%
AdaBoostRegressor()  0.334371  527.09152  108.876341  3.578896


                                 paramètre GradientBoostingRegressor()
0  gradientboostingregressor__n_estimators                        1000
1          gradientboostingregressor__loss                       huber
                                   R²        RMSE        MAE      MAE%
GradientBoostingRegressor()  0.525415  445.068624  82.429281  2.401603


### 1.1.2 Émissions au log

In [60]:
TotalGHGEmissions_train_log = np.log(TotalGHGEmissions_train)
TotalGHGEmissions_test_log = np.log(TotalGHGEmissions_test)


#### 1.1.2.1 Modèle LinearRegression

In [61]:
# modèle régression linéaire
pipeLR = make_pipeline(scaler, LinearRegression())

pipeLR.fit(BEBNumM_train, TotalGHGEmissions_train_log)

TotalGHGEmissions_pred_logLR = pipeLR.predict(BEBNumM_test)

LRr2_log = metrics.r2_score(TotalGHGEmissions_test_log,
                            TotalGHGEmissions_pred_logLR)
print("r2 :", LRr2)
LRrmse_log = metrics.mean_squared_error(TotalGHGEmissions_test_log,
                                        TotalGHGEmissions_pred_logLR,
                                        squared=False)
print("rmse :", LRrmse)

fig = px.scatter(
    x=TotalGHGEmissions_pred_logLR.squeeze(),
    y=TotalGHGEmissions_test_log.squeeze(),
    labels={
        'x': f'{TotalGHGEmissions_pred_logLR=}'.partition('=')[0],
        'y': f'{TotalGHGEmissions_test_log=}'.partition('=')[0]
    },
    title=
    "Visualisation des données d'émissions prédites par le modèle de régression linéaire<br>vs les données test"
)
fig.show()

r2 : 0.37519125628182937
rmse : 510.6736711343238


#### 1.1.2.2 Comparaison des modèles sur les émissions au log

In [62]:
paramlistEmissions_log = [{
    'ridge__alpha': np.logspace(3, 6, 100)
}, {
    'lasso__alpha': np.logspace(-2, 1, 100)
}, {
    'elasticnet__alpha': np.logspace(-1, 2, 10),
    'elasticnet__l1_ratio': np.linspace(0.1, 1, 6)
}, {
    'kneighborsregressor__n_neighbors':
    np.linspace(1, 100, dtype=int)
}, {
    'randomforestregressor__n_estimators':
    np.logspace(0, 3, 10, dtype=int),
    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
}, {
    'adaboostregressor__n_estimators':
    np.logspace(0, 2, 30, dtype=int),
    'adaboostregressor__loss': ['linear', 'square', 'exponential']
}, {
    'gradientboostingregressor__n_estimators':
    np.logspace(0, 4, 5, dtype=int),
    'gradientboostingregressor__loss':
    ['squared_error', 'absolute_error', 'huber', 'quantile']
}]

ResultEmissions_log = compareModels([
    Ridge(),
    Lasso(),
    ElasticNet(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor()
], RobustScaler(), BEBNumM_train, BEBNumM_test, TotalGHGEmissions_train_log,
                                    TotalGHGEmissions_test_log,
                                    'TotalGHGEmissions_log',
                                    paramlistEmissions_log, score, write_data,
                                    'Emissions', '_log')


      paramètre       Ridge()
0  ridge__alpha  75646.332755
               R²       RMSE         MAE      MAE%
Ridge()  0.194301  758.22876  131.725315  1.988766


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

      paramètre   Lasso()
0  lasso__alpha  0.811131
               R²        RMSE         MAE      MAE%
Lasso()  0.160301  672.553499  124.999033  2.029269


              paramètre  ElasticNet()
0     elasticnet__alpha          1.00
1  elasticnet__l1_ratio          0.82
                    R²        RMSE         MAE      MAE%
ElasticNet()  0.158133  671.637992  124.863299  2.031804


                          paramètre  KNeighborsRegressor()
0  kneighborsregressor__n_neighbors                     31
                             R²        RMSE         MAE      MAE%
KNeighborsRegressor()  0.411621  585.654232  102.718057  1.902883


                             paramètre RandomForestRegressor()
0  randomforestregressor__n_estimators                     464
1  randomforestregressor__max_features                    sqrt
                               R²        RMSE        MAE      MAE%
RandomForestRegressor()  0.674328  525.497063  78.598913  0.924951


                         paramètre AdaBoostRegressor()
0  adaboostregressor__n_estimators                  10
1          adaboostregressor__loss              linear
                           R²      RMSE         MAE      MAE%
AdaBoostRegressor()  0.430742  578.4609  100.610781  1.669767


                                 paramètre GradientBoostingRegressor()
0  gradientboostingregressor__n_estimators                       10000
1          gradientboostingregressor__loss               squared_error
                                   R²        RMSE        MAE      MAE%
GradientBoostingRegressor()  0.729427  442.416959  57.343628  0.852787


In [63]:
EmissionsScores = pd.DataFrame().append(
    [val for key, val in ResultEmissions.items() if key.startswith('Score')])


In [64]:
EmissionsScoresLog = pd.DataFrame().append([
    val for key, val in ResultEmissions_log.items() if key.startswith('Score')
]).rename('{}_log'.format)


In [65]:
EmissionsCompareScores = EmissionsScores.append(EmissionsScoresLog)
if write_data is True:
    EmissionsCompareScores.to_latex('./Tableaux/EmissionsScoresModèles.tex')
EmissionsCompareScores


Unnamed: 0,R²,RMSE,MAE,MAE%
Ridge(),0.313165,535.422041,112.413325,4.334449
Lasso(),0.317954,533.551918,111.919887,4.258501
ElasticNet(),0.317954,533.551918,111.919887,4.258501
KNeighborsRegressor(),0.36223,515.94321,112.98753,3.126285
RandomForestRegressor(),0.505988,454.086762,77.991666,1.714609
AdaBoostRegressor(),0.334371,527.09152,108.876341,3.578896
GradientBoostingRegressor(),0.525415,445.068624,82.429281,2.401603
Ridge()_log,0.194301,758.22876,131.725315,1.988766
Lasso()_log,0.160301,672.553499,124.999033,2.029269
ElasticNet()_log,0.158133,671.637992,124.863299,2.031804


In [67]:
fig = make_subplots(4,
                    2,
                    column_titles=("Émissions brutes", "Émissions log"),
                    row_titles=('R²', 'RMSE', 'MAE', 'MAE%'),
                    shared_xaxes=True)
fig.add_trace(go.Bar(x=EmissionsScores.index, y=EmissionsScores['R²']),
              row=1,
              col=1)
fig.add_trace(go.Bar(x=EmissionsScores.index, y=EmissionsScores['RMSE']),
              row=2,
              col=1)
fig.add_trace(go.Bar(x=EmissionsScores.index, y=EmissionsScores['MAE']),
              row=3,
              col=1)
fig.add_trace(go.Bar(x=EmissionsScores.index, y=EmissionsScores['MAE%']),
              row=4,
              col=1)
fig.add_trace(go.Bar(x=EmissionsScoresLog.index, y=EmissionsScoresLog['R²']),
              row=1,
              col=2)
fig.add_trace(go.Bar(x=EmissionsScoresLog.index, y=EmissionsScoresLog['RMSE']),
              row=2,
              col=2)
fig.add_trace(go.Bar(x=EmissionsScoresLog.index, y=EmissionsScoresLog['MAE']),
              row=3,
              col=2)
fig.add_trace(go.Bar(x=EmissionsScoresLog.index, y=EmissionsScoresLog['MAE%']),
              row=4,
              col=2)
fig.update_layout(title_text="Comparaison des scores des modèles d'émissions",
                  showlegend=False)
fig.show()
if write_data is True:
    fig.write_image('./Figures/EmissionsCompareScores.pdf', height=600)
    

## 1.2 Avec les données catégorielles

In [68]:
BEBCat = pd.read_csv('BEBCat.csv')

BEBCatM = BEBCat.drop(columns=['SiteEnergyUse(kBtu)', 'TotalGHGEmissions'])
SiteEnergyUse = np.array(BEBCat['SiteEnergyUse(kBtu)']).reshape(-1, 1)
TotalGHGEmissions = np.array(BEBCat.TotalGHGEmissions).reshape(-1, 1)

BEBCatM_train, BEBCatM_test, TotalGHGEmissionsCat_train, TotalGHGEmissionsCat_test = train_test_split(
    BEBCatM, TotalGHGEmissions, test_size=.2)

score = 'neg_root_mean_squared_error'


# 2. Modèle de prédiction sur la consommation énergétique (SiteEnergyUse)
## 2.1 Avec les données numériques uniquement
### 2.1.1 Consommation énergétique brute

In [69]:
BEBNumM_train, BEBNumM_test, SiteEnergyUse_train, SiteEnergyUse_test = train_test_split(
    BEBNumM, SiteEnergyUse, test_size=.2)


#### 2.1.1.1 Modèle LinearRegression

In [70]:
#modèle régression linéaire
pipeLR = make_pipeline(scaler, LinearRegression())

pipeLR.fit(BEBNumM_train, SiteEnergyUse_train)

SiteEnergyUse_predLR = pipeLR.predict(BEBNumM_test)

LRr2 = metrics.r2_score(SiteEnergyUse_test, SiteEnergyUse_predLR)
print("r2 :", LRr2)
LRrmse = metrics.mean_squared_error(SiteEnergyUse_test,
                                    SiteEnergyUse_predLR,
                                    squared=False)
print("rmse :", LRrmse)

fig = px.scatter(
    x=SiteEnergyUse_predLR.squeeze(),
    y=SiteEnergyUse_test.squeeze(),
    labels={
        'x': f'{SiteEnergyUse_predLR=}'.partition('=')[0],
        'y': f'{SiteEnergyUse_test=}'.partition('=')[0]
    },
    title=
    'Visualisation des données de consommation prédites par le modèle de régression linéaire<br>vs les données test'
)
fig.show()

r2 : 0.41882572445745614
rmse : 17978301.101778716


#### 2.1.1.2 Comparaison des modèles sur la consommation

In [71]:
paramlistConso = [{
    'ridge__alpha': np.logspace(-3, 5, 100)
}, {
    'lasso__alpha': np.logspace(0.1, 3, 100)
}, {
    'elasticnet__alpha': np.logspace(-3, 3, 200),
    'elasticnet__l1_ratio': np.linspace(0.1, 1, 6)
}, {
    'kneighborsregressor__n_neighbors':
    np.linspace(1, 100, dtype=int)
}, {
    'randomforestregressor__n_estimators':
    np.logspace(0, 3, 10, dtype=int),
    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
}, {
    'adaboostregressor__n_estimators':
    np.logspace(0, 2, 30, dtype=int),
    'adaboostregressor__loss': ['linear', 'square', 'exponential']
}, {
    'gradientboostingregressor__n_estimators':
    np.logspace(0, 3, 5, dtype=int),
    'gradientboostingregressor__loss':
    ['squared_error', 'absolute_error', 'huber', 'quantile']
}]

ResultConso = compareModels([
    Ridge(),
    Lasso(),
    ElasticNet(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor()
], RobustScaler(), BEBNumM_train, BEBNumM_test, SiteEnergyUse_train,
                            SiteEnergyUse_test, 'SiteEnergyUse',
                            paramlistConso, score, write_data, 'Conso')


      paramètre  Ridge()
0  ridge__alpha    0.001
               R²          RMSE           MAE      MAE%
Ridge()  0.418812  1.797852e+07  3.841380e+06  1.094225


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

      paramètre   Lasso()
0  lasso__alpha  1.258925
               R²          RMSE           MAE      MAE%
Lasso()  0.418812  1.797852e+07  3.841380e+06  1.094225


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

              paramètre  ElasticNet()
0     elasticnet__alpha         0.001
1  elasticnet__l1_ratio         1.000
                    R²          RMSE           MAE      MAE%
ElasticNet()  0.418812  1.797852e+07  3.841380e+06  1.094225


                          paramètre  KNeighborsRegressor()
0  kneighborsregressor__n_neighbors                      9
                             R²          RMSE           MAE      MAE%
KNeighborsRegressor()  0.394685  1.834790e+07  4.085776e+06  1.218962


                             paramètre RandomForestRegressor()
0  randomforestregressor__n_estimators                      46
1  randomforestregressor__max_features                    sqrt
                               R²          RMSE           MAE      MAE%
RandomForestRegressor()  0.565365  1.554740e+07  2.738405e+06  0.746521


                         paramètre AdaBoostRegressor()
0  adaboostregressor__n_estimators                   1
1          adaboostregressor__loss              linear
                          R²          RMSE           MAE     MAE%
AdaBoostRegressor()  0.30436  1.966925e+07  4.362719e+06  1.58314


                                 paramètre GradientBoostingRegressor()
0  gradientboostingregressor__n_estimators                          31
1          gradientboostingregressor__loss               squared_error
                                   R²          RMSE           MAE      MAE%
GradientBoostingRegressor()  0.496507  1.673370e+07  3.636989e+06  1.296282


### 2.1.2 Consommation énergétique au log

In [72]:
SiteEnergyUse_train_log = np.log(SiteEnergyUse_train)
SiteEnergyUse_test_log = np.log(SiteEnergyUse_test)


#### 2.1.2.1 Modèle LinearRegression

In [73]:
# modèle régression linéaire
pipeLR = make_pipeline(scaler, LinearRegression())

pipeLR.fit(BEBNumM_train, SiteEnergyUse_train_log)

SiteEnergyUse_pred_logLR = pipeLR.predict(BEBNumM_test)

LRr2_log = metrics.r2_score(SiteEnergyUse_test_log, SiteEnergyUse_pred_logLR)
print("r2 :", LRr2)
LRrmse_log = metrics.mean_squared_error(SiteEnergyUse_test_log,
                                        SiteEnergyUse_pred_logLR,
                                        squared=False)
print("rmse :", LRrmse)

fig = px.scatter(
    x=SiteEnergyUse_pred_logLR.squeeze(),
    y=SiteEnergyUse_test_log.squeeze(),
    labels={
        'x': f'{SiteEnergyUse_pred_logLR=}'.partition('=')[0],
        'y': f'{SiteEnergyUse_test_log=}'.partition('=')[0]
    },
    title=
    'Visualisation des données de consommation prédites par le modèle de régression linéaire<br>vs les données test'
)
fig.show()

r2 : 0.41882572445745614
rmse : 17978301.101778716


#### 2.1.2.2 Comparaison des modèles sur la consommation au log

In [77]:
paramlistConso_log = [{
    'ridge__alpha': np.logspace(3, 6, 100)
}, {
    'lasso__alpha': np.logspace(-1, 1, 20)
}, {
    'elasticnet__alpha': np.logspace(-1, 3, 200),
    'elasticnet__l1_ratio': np.linspace(0.1, 1, 6)
}, {
    'kneighborsregressor__n_neighbors':
    np.linspace(1, 100, dtype=int)
}, {
    'randomforestregressor__n_estimators':
    np.logspace(0, 3, 10, dtype=int),
    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
}, {
    'adaboostregressor__n_estimators':
    np.logspace(0, 2, 30, dtype=int),
    'adaboostregressor__loss': ['linear', 'square', 'exponential']
}, {
    'gradientboostingregressor__n_estimators':
    np.logspace(1, 4, 5, dtype=int),
    'gradientboostingregressor__loss':
    ['squared_error', 'absolute_error', 'huber', 'quantile']
}]

ResultConso_log = compareModels([
    Ridge(),
    Lasso(),
    ElasticNet(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor()
], RobustScaler(), BEBNumM_train, BEBNumM_test, SiteEnergyUse_train_log,
                                SiteEnergyUse_test_log, 'SiteEnergyUse_log',
                                paramlistConso_log, score, write_data, 'Conso',
                                '_log')


      paramètre       Ridge()
0  ridge__alpha  70548.023107
               R²          RMSE           MAE      MAE%
Ridge()  0.325426  1.162957e+08  1.041828e+07  1.149308


      paramètre   Lasso()
0  lasso__alpha  0.695193
              R²          RMSE           MAE      MAE%
Lasso()  0.28252  1.129078e+08  1.029650e+07  1.178272


              paramètre  ElasticNet()
0     elasticnet__alpha      0.698588
1  elasticnet__l1_ratio      1.000000
                    R²          RMSE           MAE      MAE%
ElasticNet()  0.281995  1.128540e+08  1.029483e+07  1.178556


                          paramètre  KNeighborsRegressor()
0  kneighborsregressor__n_neighbors                     29
                             R²          RMSE           MAE      MAE%
KNeighborsRegressor()  0.625568  2.010944e+07  3.967146e+06  0.825588


                             paramètre RandomForestRegressor()
0  randomforestregressor__n_estimators                    1000
1  randomforestregressor__max_features                    log2
                               R²          RMSE           MAE      MAE%
RandomForestRegressor()  0.831411  1.719745e+07  2.690644e+06  0.461309


                         paramètre AdaBoostRegressor()
0  adaboostregressor__n_estimators                   6
1          adaboostregressor__loss              square
                          R²          RMSE           MAE      MAE%
AdaBoostRegressor()  0.64748  1.956969e+07  3.705047e+06  0.756764


                                 paramètre GradientBoostingRegressor()
0  gradientboostingregressor__n_estimators                       10000
1          gradientboostingregressor__loss               squared_error
                                  R²          RMSE           MAE      MAE%
GradientBoostingRegressor()  0.82318  2.419918e+07  2.629798e+06  0.363627


In [78]:
ConsoScores = pd.DataFrame().append(
    [val for key, val in ResultConso.items() if key.startswith('Score')])


In [79]:
ConsoScoresLog = pd.DataFrame().append([
    val for key, val in ResultConso_log.items() if key.startswith('Score')
]).rename('{}_log'.format)


In [80]:
ConsoCompareScores = ConsoScores.append(ConsoScoresLog)
if write_data is True:
    ConsoCompareScores.to_latex('./Tableaux/ConsoScoresModèles.tex')
ConsoCompareScores

Unnamed: 0,R²,RMSE,MAE,MAE%
Ridge(),0.418812,17978520.0,3841380.0,1.094225
Lasso(),0.418812,17978520.0,3841380.0,1.094225
ElasticNet(),0.418812,17978520.0,3841380.0,1.094225
KNeighborsRegressor(),0.394685,18347900.0,4085776.0,1.218962
RandomForestRegressor(),0.565365,15547400.0,2738405.0,0.746521
AdaBoostRegressor(),0.30436,19669250.0,4362719.0,1.58314
GradientBoostingRegressor(),0.496507,16733700.0,3636989.0,1.296282
Ridge()_log,0.325426,116295700.0,10418280.0,1.149308
Lasso()_log,0.28252,112907800.0,10296500.0,1.178272
ElasticNet()_log,0.281995,112854000.0,10294830.0,1.178556


In [82]:
fig = make_subplots(4,
                    2,
                    column_titles=("Consommation brute", "Consommation log2"),
                    row_titles=('R²', 'RMSE', 'MAE', 'MAE%'),
                    shared_xaxes=True)
fig.add_trace(go.Bar(x=ConsoScores.index, y=ConsoScores['R²']), row=1, col=1)
fig.add_trace(go.Bar(x=ConsoScores.index, y=ConsoScores['RMSE']), row=2, col=1)
fig.add_trace(go.Bar(x=ConsoScores.index, y=ConsoScores['MAE']), row=3, col=1)
fig.add_trace(go.Bar(x=ConsoScores.index, y=ConsoScores['MAE%']), row=4, col=1)
fig.add_trace(go.Bar(x=ConsoScoresLog.index, y=ConsoScoresLog['R²']),
              row=1,
              col=2)
fig.add_trace(go.Bar(x=ConsoScoresLog.index, y=ConsoScoresLog['RMSE']),
              row=2,
              col=2)
fig.add_trace(go.Bar(x=ConsoScoresLog.index, y=ConsoScoresLog['MAE']),
              row=3,
              col=2)
fig.add_trace(go.Bar(x=ConsoScoresLog.index, y=ConsoScoresLog['MAE%']),
              row=4,
              col=2)
fig.update_layout(
    title_text="Comparaison des scores des modèles de consommation",
    showlegend=False)
fig.show()
if write_data is True:
    fig.write_image('./Figures/ConsoCompareScores.pdf', height=600)