In [1]:
import os
import pandas as pd
pd.options.plotting.backend = 'plotly'
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn import metrics
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, \
                             GradientBoostingRegressor

from Pélec_03_fonctions import reg_modelGrid, visuRMSEGrid, compareGridModels


In [19]:
write_data = True

if write_data is True:
    try:
        os.mkdir("./Figures/")
    except OSError as error:
        print(error)
    try:
        os.mkdir("./Tableaux/")
    except OSError as error:
        print(error)
else:
    print("""Visualisation uniquement dans le notebook
    pas de création de figures ni de tableaux""")


[Errno 17] File exists: './Figures/'
[Errno 17] File exists: './Tableaux/'


In [20]:
BEBNum = pd.read_csv('BEBNum.csv')

BEBNumM = BEBNum.drop(columns=['SiteEnergyUse(kBtu)', 'TotalGHGEmissions'])
SiteEnergyUse = np.array(BEBNum['SiteEnergyUse(kBtu)']).reshape(-1, 1)
TotalGHGEmissions = np.array(BEBNum.TotalGHGEmissions).reshape(-1, 1)

BEBNumM_train, BEBNumM_test, TotalGHGEmissions_train, TotalGHGEmissions_test = train_test_split(
    BEBNumM, TotalGHGEmissions, test_size=.2)

score = 'neg_root_mean_squared_error'


In [21]:
# Scaler moins sensible aux outlier d'après la doc
scaler = RobustScaler(quantile_range=(10, 90))


# 1. Modèle de prédiction sur les émissions (TotalGHGEmissions)
## 1.1 Avec les données numériques uniquement
### 1.1.1 Émissions brutes

#### 1.1.1.1 Modèle LinearRegression

In [22]:
# modèle régression linéaire
pipeLR = make_pipeline(scaler, LinearRegression())

pipeLR.fit(BEBNumM_train, TotalGHGEmissions_train)

TotalGHGEmissions_predLR = pipeLR.predict(BEBNumM_test)

LRr2 = metrics.r2_score(TotalGHGEmissions_test, TotalGHGEmissions_predLR)
print("r2 :", LRr2)
LRrmse = metrics.mean_squared_error(TotalGHGEmissions_test,
                                    TotalGHGEmissions_predLR,
                                    squared=False)
print("rmse :", LRrmse)

fig = px.scatter(
    x=TotalGHGEmissions_predLR.squeeze(),
    y=TotalGHGEmissions_test.squeeze(),
    labels={
        'x': f'{TotalGHGEmissions_predLR=}'.partition('=')[0],
        'y': f'{TotalGHGEmissions_test=}'.partition('=')[0]
    },
    title=
    "Visualisation des données d'émissions prédites par le modèle de régression linéaire<br>vs les données test"
)
fig.show()
if write_data is True:
    fig.write_image('./Figures/EmissionsLR.pdf')


r2 : 0.3293463112796977
rmse : 398.4149982475936


#### 1.1.1.2 Comparaison de différents modèles sur les émissions brutes

In [23]:
paramlistEmissions = [{
    'ridge__alpha': np.logspace(1, 5, 100)
}, {
    'lasso__alpha': np.logspace(1, 3, 100)
}, {
    'elasticnet__alpha': np.logspace(0, 3, 100),
    'elasticnet__l1_ratio': np.linspace(0.1, 1, 6)
}, {
    'kneighborsregressor__n_neighbors':
    np.linspace(1, 100, dtype=int)
}, {
    'randomforestregressor__n_estimators':
    np.logspace(0, 3, 10, dtype=int),
    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
}, {
    'adaboostregressor__n_estimators':
    np.linspace(1, 100, dtype=int),
    'adaboostregressor__loss': ['linear', 'square', 'exponential']
}, {
    'gradientboostingregressor__n_estimators':
    np.logspace(2, 4, 5, dtype=int),
    'gradientboostingregressor__loss':
    ['squared_error', 'absolute_error', 'huber', 'quantile']
}]

ResultEmissions = compareGridModels([
    Ridge(),
    Lasso(),
    ElasticNet(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor()
], scaler, BEBNumM_train, BEBNumM_test, TotalGHGEmissions_train,
                                    TotalGHGEmissions_test,
                                    'TotalGHGEmissions', paramlistEmissions,
                                    score, write_data, 'Emissions')


      paramètre      Ridge()
0  ridge__alpha  5094.138015
               R²        RMSE         MAE      MAE%  FitTime(s)
Ridge()  0.241172  423.797291  150.951785  5.719825    0.013083




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



      paramètre     Lasso()
0  lasso__alpha  178.864953
               R²       RMSE         MAE      MAE%  FitTime(s)
Lasso()  0.261973  417.94834  150.970888  5.521323    0.021757




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


              paramètre  ElasticNet()
0     elasticnet__alpha     174.75284
1  elasticnet__l1_ratio       1.00000
                    R²        RMSE        MAE      MAE%  FitTime(s)
ElasticNet()  0.263465  417.525822  150.73011  5.482311    0.013345




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



                          paramètre  KNeighborsRegressor()
0  kneighborsregressor__n_neighbors                      3
                             R²        RMSE         MAE      MAE%  FitTime(s)
KNeighborsRegressor()  0.260219  418.444809  119.521937  1.990176    0.016364




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



                             paramètre RandomForestRegressor()
0  randomforestregressor__n_estimators                    1000
1  randomforestregressor__max_features                    auto
                               R²        RMSE        MAE      MAE%  FitTime(s)
RandomForestRegressor()  0.416827  371.522582  89.729861  1.442676   11.481881


                         paramètre AdaBoostRegressor()
0  adaboostregressor__n_estimators                  19
1          adaboostregressor__loss              square
                          R²        RMSE        MAE      MAE%  FitTime(s)
AdaBoostRegressor()  0.47719  351.769756  136.67489  4.990736     0.09266


                                 paramètre GradientBoostingRegressor()
0  gradientboostingregressor__n_estimators                        3162
1          gradientboostingregressor__loss               squared_error
                                   R²        RMSE       MAE      MAE%  \
GradientBoostingRegressor()  0.465023  355.839509  74.99128  1.337692   

                             FitTime(s)  
GradientBoostingRegressor()    10.36789  


### 1.1.2 Émissions au log

In [24]:
TotalGHGEmissions_train_log = np.log(TotalGHGEmissions_train)
TotalGHGEmissions_test_log = np.log(TotalGHGEmissions_test)


#### 1.1.2.1 Modèle LinearRegression

In [25]:
# modèle régression linéaire
pipeLR = make_pipeline(scaler, LinearRegression())

pipeLR.fit(BEBNumM_train, TotalGHGEmissions_train_log)

TotalGHGEmissions_pred_logLR = pipeLR.predict(BEBNumM_test)

LRr2_log = metrics.r2_score(TotalGHGEmissions_test_log,
                            TotalGHGEmissions_pred_logLR)
print("r2 :", LRr2)
LRrmse_log = metrics.mean_squared_error(TotalGHGEmissions_test_log,
                                        TotalGHGEmissions_pred_logLR,
                                        squared=False)
print("rmse :", LRrmse)

fig = px.scatter(
    x=TotalGHGEmissions_pred_logLR.squeeze(),
    y=TotalGHGEmissions_test_log.squeeze(),
    labels={
        'x': f'{TotalGHGEmissions_pred_logLR=}'.partition('=')[0],
        'y': f'{TotalGHGEmissions_test_log=}'.partition('=')[0]
    },
    title=
    "Visualisation des données d'émissions prédites par le modèle de régression linéaire<br>vs les données test"
)
fig.show()
if write_data is True:
    fig.write_image('./Figures/EmissionsLR_log.pdf')


r2 : 0.3293463112796977
rmse : 398.4149982475936


#### 1.1.2.2 Comparaison des modèles sur les émissions au log

In [26]:
paramlistEmissions_log = [{
    'ridge__alpha': np.logspace(3, 5, 100)
}, {
    'lasso__alpha': np.logspace(-2, 0, 100)
}, {
    'elasticnet__alpha': np.logspace(-1, 1, 10),
    'elasticnet__l1_ratio': np.linspace(0.1, 1, 6)
}, {
    'kneighborsregressor__n_neighbors':
    np.linspace(1, 100, dtype=int)
}, {
    'randomforestregressor__n_estimators':
    np.logspace(0, 3, 10, dtype=int),
    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
}, {
    'adaboostregressor__n_estimators':
    np.linspace(1, 100, dtype=int),
    'adaboostregressor__loss': ['linear', 'square', 'exponential']
}, {
    'gradientboostingregressor__n_estimators':
    np.logspace(3, 4, 5, dtype=int),
    'gradientboostingregressor__loss':
    ['squared_error', 'absolute_error', 'huber', 'quantile']
}]

ResultEmissions_log = compareGridModels([
    Ridge(),
    Lasso(),
    ElasticNet(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor()
], scaler, BEBNumM_train, BEBNumM_test, TotalGHGEmissions_train_log,
                                        TotalGHGEmissions_test_log,
                                        'TotalGHGEmissions_log',
                                        paramlistEmissions_log, score,
                                        write_data, 'Emissions', '_log')


      paramètre      Ridge()
0  ridge__alpha  6428.073117
               R²        RMSE         MAE      MAE%  FitTime(s)
Ridge()  0.163857  487.855569  135.348434  2.120725    0.017445




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



      paramètre   Lasso()
0  lasso__alpha  0.343047
               R²        RMSE         MAE      MAE%  FitTime(s)
Lasso()  0.122273  490.727366  136.128617  2.247022    0.018773




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



              paramètre  ElasticNet()
0     elasticnet__alpha       1.29155
1  elasticnet__l1_ratio       0.10000
                    R²        RMSE         MAE      MAE%  FitTime(s)
ElasticNet()  0.159221  487.751698  134.576962  2.134541    0.016376




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



                          paramètre  KNeighborsRegressor()
0  kneighborsregressor__n_neighbors                      1
                             R²        RMSE        MAE     MAE%  FitTime(s)
KNeighborsRegressor()  0.517261  401.168762  73.267492  0.75131    0.015549




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



                             paramètre RandomForestRegressor()
0  randomforestregressor__n_estimators                     464
1  randomforestregressor__max_features                    sqrt
                               R²        RMSE        MAE      MAE%  FitTime(s)
RandomForestRegressor()  0.680977  381.250173  85.761009  0.715415    3.009714


                         paramètre AdaBoostRegressor()
0  adaboostregressor__n_estimators                  15
1          adaboostregressor__loss              linear
                           R²        RMSE         MAE      MAE%  FitTime(s)
AdaBoostRegressor()  0.357582  404.355947  118.820779  1.268086    0.087605


                                 paramètre GradientBoostingRegressor()
0  gradientboostingregressor__n_estimators                        5623
1          gradientboostingregressor__loss                       huber
                                   R²        RMSE        MAE      MAE%  \
GradientBoostingRegressor()  0.633756  340.243282  71.595212  0.803876   

                             FitTime(s)  
GradientBoostingRegressor()   55.912648  


In [27]:
EmissionsScores = pd.DataFrame().append(
    [val for key, val in ResultEmissions.items() if key.startswith('Score')])


In [28]:
EmissionsScoresLog = pd.DataFrame().append([
    val for key, val in ResultEmissions_log.items() if key.startswith('Score')
]).rename('{}_log'.format)


In [29]:
EmissionsCompareScores = EmissionsScores.append(EmissionsScoresLog)
if write_data is True:
    EmissionsCompareScores.to_latex('./Tableaux/EmissionsScoresModèles.tex')
EmissionsCompareScores


Unnamed: 0,R²,RMSE,MAE,MAE%,FitTime(s)
Ridge(),0.241172,423.797291,150.951785,5.719825,0.013083
Lasso(),0.261973,417.94834,150.970888,5.521323,0.021757
ElasticNet(),0.263465,417.525822,150.73011,5.482311,0.013345
KNeighborsRegressor(),0.260219,418.444809,119.521937,1.990176,0.016364
RandomForestRegressor(),0.416827,371.522582,89.729861,1.442676,11.481881
AdaBoostRegressor(),0.47719,351.769756,136.67489,4.990736,0.09266
GradientBoostingRegressor(),0.465023,355.839509,74.99128,1.337692,10.36789
Ridge()_log,0.163857,487.855569,135.348434,2.120725,0.017445
Lasso()_log,0.122273,490.727366,136.128617,2.247022,0.018773
ElasticNet()_log,0.159221,487.751698,134.576962,2.134541,0.016376


In [30]:
fig = make_subplots(len(EmissionsScores.columns),
                    2,
                    column_titles=("Émissions brutes", "Émissions log"),
                    row_titles=(EmissionsScores.columns.to_list()),
                    shared_xaxes=True)
for r, c in enumerate(EmissionsScores):
    fig.add_trace(go.Bar(x=EmissionsScores.index, y=EmissionsScores[c]),
                  row=r + 1,
                  col=1)
    fig.add_trace(go.Bar(x=EmissionsScoresLog.index, y=EmissionsScoresLog[c]),
                  row=r + 1,
                  col=2)
fig.update_layout(title_text="Comparaison des scores des modèles d'émissions",
                  showlegend=False,
                  height=700)
fig.show()
if write_data is True:
    fig.write_image('./Figures/EmissionsCompareScores.pdf', height=700)


Afin de voir si l'energy star score permet d'améliorer le modèle nous allons
voir si le meilleurs modèle est amélioré avec cette variable.
Je choisi d'utiliser le modèle GradientBoosting avec la variable au log
car c'est le modèle ayant la RMSE la plus faible

In [31]:
BEBESSNum = pd.read_csv('BEBESSNum.csv')

BEBESSNumM = BEBESSNum.drop(
    columns={'SiteEnergyUse(kBtu)', 'TotalGHGEmissions'})
SiteEnergyUseESS = np.array(BEBESSNum['SiteEnergyUse(kBtu)']).reshape(-1, 1)
TotalGHGEmissionsESS = np.array(BEBESSNum.TotalGHGEmissions).reshape(-1, 1)

BEBESSNumM_train, BEBESSNumM_test, TotalGHGEmissionsESS_train, TotalGHGEmissionsESS_test = train_test_split(
    BEBESSNumM, TotalGHGEmissionsESS, test_size=.2)

TotalGHGEmissionsESS_train_log = np.log(TotalGHGEmissionsESS_train)
TotalGHGEmissionsESS_test_log = np.log(TotalGHGEmissionsESS_test)


In [32]:
BestParamEmissionsGB = ResultEmissions[
    'BestParamGradientBoostingRegressor'].set_index('paramètre')
paramlistEmissionsESS = [{
    'gradientboostingregressor__n_estimators': [
        int(BestParamEmissionsGB.
            loc['n_estimators'].values)
    ],
    'gradientboostingregressor__loss': [
        *BestParamEmissionsGB.loc[
            'loss', :].values
    ]
}]
ResultEmissionsESS = compareGridModels([GradientBoostingRegressor()],
                                           scaler,
                                           BEBESSNumM_train,
                                           BEBESSNumM_test,
                                           TotalGHGEmissionsESS_train_log,
                                           TotalGHGEmissionsESS_test_log,
                                           'TotalGHGEmissionsESS_log',
                                           paramlistEmissionsESS,
                                           score,
                                           write_data=write_data,
                                           prefix='EmissionsESS',
                                           suffix='_log',
                                           plotfigRMSE=False)


                                 paramètre GradientBoostingRegressor()
0  gradientboostingregressor__n_estimators                        3162
1          gradientboostingregressor__loss               squared_error
                                   R²        RMSE        MAE      MAE%  \
GradientBoostingRegressor()  0.821292  288.408182  65.056698  0.524198   

                             FitTime(s)  
GradientBoostingRegressor()    6.755357  


In [33]:
EmissionsScoresESS = pd.DataFrame().append([
    val for key, val in ResultEmissionsESS.items()
    if key.startswith('Score')
]).rename('{}_ESS'.format)
CompareScoresESS = EmissionsScores.append(EmissionsScoresESS).drop(
    columns=('FitTime(s)')).loc[[
        'GradientBoostingRegressor()', 'GradientBoostingRegressor()_ESS'
    ]]


In [34]:
fig = make_subplots(1,
                    len(CompareScoresESS.columns),
                    column_titles=(CompareScoresESS.columns.to_list()))
for c, col in enumerate(CompareScoresESS.columns):
    fig.add_trace(go.Bar(x=CompareScoresESS.index, y=CompareScoresESS[col]),
                  row=1,
                  col=c + 1)
fig.update_layout(
    title_text="Comparaison avec et sans ajout de l'energy score stars",
    showlegend=False)
fig.show()
if write_data is True:
    fig.write_image('./Figures/EmissionsCompareScoresESS.pdf')


# 2. Modèle de prédiction sur la consommation énergétique (SiteEnergyUse)
## 2.1 Avec les données numériques uniquement
### 2.1.1 Consommation énergétique brute

In [35]:
BEBNumM_train, BEBNumM_test, SiteEnergyUse_train, SiteEnergyUse_test = train_test_split(
    BEBNumM, SiteEnergyUse, test_size=.2)


#### 2.1.1.1 Modèle LinearRegression

In [36]:
#modèle régression linéaire
pipeLR = make_pipeline(scaler, LinearRegression())

pipeLR.fit(BEBNumM_train, SiteEnergyUse_train)

SiteEnergyUse_predLR = pipeLR.predict(BEBNumM_test)

LRr2 = metrics.r2_score(SiteEnergyUse_test, SiteEnergyUse_predLR)
print("r2 :", LRr2)
LRrmse = metrics.mean_squared_error(SiteEnergyUse_test,
                                    SiteEnergyUse_predLR,
                                    squared=False)
print("rmse :", LRrmse)

fig = px.scatter(
    x=SiteEnergyUse_predLR.squeeze(),
    y=SiteEnergyUse_test.squeeze(),
    labels={
        'x': f'{SiteEnergyUse_predLR=}'.partition('=')[0],
        'y': f'{SiteEnergyUse_test=}'.partition('=')[0]
    },
    title=
    'Visualisation des données de consommation prédites par le modèle de régression linéaire<br>vs les données test'
)
fig.show()
if write_data is True:
    fig.write_image('./Figures/ConsoLR.pdf')


r2 : 0.33918817116102795
rmse : 17526278.50572787


#### 2.1.1.2 Comparaison des modèles sur la consommation

In [37]:
paramlistConso = [{
    'ridge__alpha': np.logspace(-3, 5, 100)
}, {
    'lasso__alpha': np.logspace(0.1, 3, 100)
}, {
    'elasticnet__alpha': np.logspace(-3, 3, 200),
    'elasticnet__l1_ratio': np.linspace(0.1, 1, 6)
}, {
    'kneighborsregressor__n_neighbors':
    np.linspace(1, 100, dtype=int)
}, {
    'randomforestregressor__n_estimators':
    np.logspace(0, 3, 10, dtype=int),
    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
}, {
    'adaboostregressor__n_estimators':
    np.linspace(1, 100, dtype=int),
    'adaboostregressor__loss': ['linear', 'square', 'exponential']
}, {
    'gradientboostingregressor__n_estimators':
    np.logspace(2, 3, 5, dtype=int),
    'gradientboostingregressor__loss':
    ['squared_error', 'absolute_error', 'huber', 'quantile']
}]

ResultConso = compareGridModels([
    Ridge(),
    Lasso(),
    ElasticNet(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor()
], scaler, BEBNumM_train, BEBNumM_test, SiteEnergyUse_train,
                                SiteEnergyUse_test, 'SiteEnergyUse',
                                paramlistConso, score, write_data, 'Conso')


      paramètre     Ridge()
0  ridge__alpha  102.353102
              R²          RMSE           MAE      MAE%  FitTime(s)
Ridge()  0.32906  1.766008e+07  5.153567e+06  1.851842    0.012975




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

      paramètre  Lasso()
0  lasso__alpha   1000.0
               R²          RMSE           MAE      MAE%  FitTime(s)
Lasso()  0.341221  1.749930e+07  5.269886e+06  1.884583    0.043704




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

              paramètre  ElasticNet()
0     elasticnet__alpha      0.091159
1  elasticnet__l1_ratio      0.460000
                    R²          RMSE           MAE      MAE%  FitTime(s)
ElasticNet()  0.328318  1.766984e+07  5.135486e+06  1.852292    0.026708




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



                          paramètre  KNeighborsRegressor()
0  kneighborsregressor__n_neighbors                      3
                             R²          RMSE           MAE      MAE%  \
KNeighborsRegressor()  0.148773  1.989178e+07  4.958197e+06  1.136623   

                       FitTime(s)  
KNeighborsRegressor()    0.021139  




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



                             paramètre RandomForestRegressor()
0  randomforestregressor__n_estimators                      10
1  randomforestregressor__max_features                    log2
                               R²          RMSE           MAE      MAE%  \
RandomForestRegressor()  0.431541  1.625550e+07  3.079266e+06  0.847042   

                         FitTime(s)  
RandomForestRegressor()    0.091739  


                         paramètre AdaBoostRegressor()
0  adaboostregressor__n_estimators                   3
1          adaboostregressor__loss              linear
                           R²          RMSE           MAE      MAE%  \
AdaBoostRegressor()  0.284296  1.823969e+07  5.482795e+06  2.414335   

                     FitTime(s)  
AdaBoostRegressor()    0.049556  


                                 paramètre GradientBoostingRegressor()
0  gradientboostingregressor__n_estimators                        1000
1          gradientboostingregressor__loss                       huber
                                   R²          RMSE           MAE      MAE%  \
GradientBoostingRegressor()  0.428919  1.629295e+07  2.980172e+06  0.897218   

                             FitTime(s)  
GradientBoostingRegressor()    7.990187  


### 2.1.2 Consommation énergétique au log

In [38]:
SiteEnergyUse_train_log = np.log(SiteEnergyUse_train)
SiteEnergyUse_test_log = np.log(SiteEnergyUse_test)


#### 2.1.2.1 Modèle LinearRegression

In [39]:
# modèle régression linéaire
pipeLR = make_pipeline(scaler, LinearRegression())

pipeLR.fit(BEBNumM_train, SiteEnergyUse_train_log)

SiteEnergyUse_pred_logLR = pipeLR.predict(BEBNumM_test)

LRr2_log = metrics.r2_score(SiteEnergyUse_test_log, SiteEnergyUse_pred_logLR)
print("r2 :", LRr2)
LRrmse_log = metrics.mean_squared_error(SiteEnergyUse_test_log,
                                        SiteEnergyUse_pred_logLR,
                                        squared=False)
print("rmse :", LRrmse)

fig = px.scatter(
    x=SiteEnergyUse_pred_logLR.squeeze(),
    y=SiteEnergyUse_test_log.squeeze(),
    labels={
        'x': f'{SiteEnergyUse_pred_logLR=}'.partition('=')[0],
        'y': f'{SiteEnergyUse_test_log=}'.partition('=')[0]
    },
    title=
    'Visualisation des données de consommation prédites par le modèle de régression linéaire<br>vs les données test'
)
fig.show()
if write_data is True:
    fig.write_image('./Figures/ConsoLR_log.pdf')

r2 : 0.33918817116102795
rmse : 17526278.50572787


#### 2.1.2.2 Comparaison des modèles sur la consommation au log

In [40]:
paramlistConso_log = [{
    'ridge__alpha': np.logspace(1, 4, 100)
}, {
    'lasso__alpha': np.logspace(-3, 0, 100)
}, {
    'elasticnet__alpha': np.logspace(-3, 1, 100),
    'elasticnet__l1_ratio': np.linspace(0.1, 1, 6)
}, {
    'kneighborsregressor__n_neighbors':
    np.linspace(1, 100, dtype=int)
}, {
    'randomforestregressor__n_estimators':
    np.logspace(0, 3, 10, dtype=int),
    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
}, {
    'adaboostregressor__n_estimators':
    np.linspace(1, 100, dtype=int),
    'adaboostregressor__loss': ['linear', 'square', 'exponential']
}, {
    'gradientboostingregressor__n_estimators':
    np.logspace(1, 4, 5, dtype=int),
    'gradientboostingregressor__loss':
    ['squared_error', 'absolute_error', 'huber', 'quantile']
}]

ResultConso_log = compareGridModels([
    Ridge(),
    Lasso(),
    ElasticNet(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor()
], scaler, BEBNumM_train, BEBNumM_test, SiteEnergyUse_train_log,
                                    SiteEnergyUse_test_log,
                                    'SiteEnergyUse_log', paramlistConso_log,
                                    score, write_data, 'Conso', '_log')


      paramètre      Ridge()
0  ridge__alpha  3511.191734
               R²          RMSE           MAE      MAE%  FitTime(s)
Ridge()  0.305996  2.104369e+07  5.666821e+06  1.397365    0.017396




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


      paramètre   Lasso()
0  lasso__alpha  0.123285
               R²          RMSE           MAE      MAE%  FitTime(s)
Lasso()  0.317754  2.349626e+07  6.175023e+06  1.376827    0.022548




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


              paramètre  ElasticNet()
0     elasticnet__alpha      0.890215
1  elasticnet__l1_ratio      0.100000
                    R²          RMSE           MAE      MAE%  FitTime(s)
ElasticNet()  0.302803  2.073456e+07  5.593977e+06  1.410332     0.01661




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



                          paramètre  KNeighborsRegressor()
0  kneighborsregressor__n_neighbors                      1
                             R²          RMSE           MAE      MAE%  \
KNeighborsRegressor()  0.751385  1.512579e+07  2.521110e+06  0.550406   

                       FitTime(s)  
KNeighborsRegressor()    0.014299  




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



                             paramètre RandomForestRegressor()
0  randomforestregressor__n_estimators                     464
1  randomforestregressor__max_features                    sqrt
                               R²          RMSE           MAE      MAE%  \
RandomForestRegressor()  0.797079  1.653380e+07  2.771108e+06  0.510425   

                         FitTime(s)  
RandomForestRegressor()    2.718596  


                         paramètre AdaBoostRegressor()
0  adaboostregressor__n_estimators                  21
1          adaboostregressor__loss         exponential
                           R²          RMSE           MAE      MAE%  \
AdaBoostRegressor()  0.568228  1.710136e+07  4.203073e+06  0.826133   

                     FitTime(s)  
AdaBoostRegressor()    0.131977  


                                 paramètre GradientBoostingRegressor()
0  gradientboostingregressor__n_estimators                       10000
1          gradientboostingregressor__loss                       huber
                                   R²          RMSE           MAE      MAE%  \
GradientBoostingRegressor()  0.832052  1.503803e+07  2.135409e+06  0.393559   

                             FitTime(s)  
GradientBoostingRegressor()  107.329609  


In [41]:
ConsoScores = pd.DataFrame().append(
    [val for key, val in ResultConso.items() if key.startswith('Score')])


In [42]:
ConsoScoresLog = pd.DataFrame().append([
    val for key, val in ResultConso_log.items() if key.startswith('Score')
]).rename('{}_log'.format)


In [43]:
ConsoCompareScores = ConsoScores.append(ConsoScoresLog)
if write_data is True:
    ConsoCompareScores.to_latex('./Tableaux/ConsoScoresModèles.tex')
ConsoCompareScores


Unnamed: 0,R²,RMSE,MAE,MAE%,FitTime(s)
Ridge(),0.32906,17660080.0,5153567.0,1.851842,0.012975
Lasso(),0.341221,17499300.0,5269886.0,1.884583,0.043704
ElasticNet(),0.328318,17669840.0,5135486.0,1.852292,0.026708
KNeighborsRegressor(),0.148773,19891780.0,4958197.0,1.136623,0.021139
RandomForestRegressor(),0.431541,16255500.0,3079266.0,0.847042,0.091739
AdaBoostRegressor(),0.284296,18239690.0,5482795.0,2.414335,0.049556
GradientBoostingRegressor(),0.428919,16292950.0,2980172.0,0.897218,7.990187
Ridge()_log,0.305996,21043690.0,5666821.0,1.397365,0.017396
Lasso()_log,0.317754,23496260.0,6175023.0,1.376827,0.022548
ElasticNet()_log,0.302803,20734560.0,5593977.0,1.410332,0.01661


In [44]:
fig = make_subplots(len(ConsoScores.columns),
                    2,
                    column_titles=("Consommation brute", "Consommation log"),
                    row_titles=(ConsoScores.columns.to_list()),
                    shared_xaxes=True)
for r, c in enumerate(ConsoScores):
    fig.add_trace(go.Bar(x=ConsoScores.index, y=ConsoScores[c]),
                  row=r + 1,
                  col=1)
    fig.add_trace(go.Bar(x=ConsoScoresLog.index, y=ConsoScoresLog[c]),
                  row=r + 1,
                  col=2)
fig.update_layout(
    title_text="Comparaison des scores des modèles de consommation",
    showlegend=False,
    height=700)
fig.show()
if write_data is True:
    fig.write_image('./Figures/ConsoCompareScores.pdf', height=700)
