In [37]:
import os
import pandas as pd

pd.options.plotting.backend = 'plotly'
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn import metrics
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, \
                             GradientBoostingRegressor

from Pélec_04_fonctions import reg_modelGrid, visuRMSEGrid, compareGridModels


In [38]:
write_data = True

if write_data is True:
    try:
        os.mkdir("./Figures/")
    except OSError as error:
        print(error)
    try:
        os.mkdir("./Tableaux/")
    except OSError as error:
        print(error)
else:
    print("""Visualisation uniquement dans le notebook
    pas de création de figures ni de tableaux""")


[Errno 17] File exists: './Figures/'
[Errno 17] File exists: './Tableaux/'


In [39]:
BEBNum = pd.read_csv('BEBNum.csv')

BEBNumM = BEBNum.drop(columns=['SiteEnergyUse(kBtu)', 'TotalGHGEmissions'])
SiteEnergyUse = np.array(BEBNum['SiteEnergyUse(kBtu)']).reshape(-1, 1)
TotalGHGEmissions = np.array(BEBNum.TotalGHGEmissions).reshape(-1, 1)

BEBNumM_train, BEBNumM_test, TotalGHGEmissions_train, TotalGHGEmissions_test = train_test_split(
    BEBNumM, TotalGHGEmissions, test_size=.2)

score = 'neg_root_mean_squared_error'


In [40]:
# Scaler moins sensible aux outlier d'après la doc
scaler = RobustScaler(quantile_range=(10, 90))


# 1. Modèle de prédiction sur les émissions (TotalGHGEmissions)
## 1.1 Avec les données numériques uniquement
### 1.1.1 Émissions brutes

#### 1.1.1.1 Modèle LinearRegression

In [41]:
# modèle régression linéaire
pipeLR = make_pipeline(scaler, LinearRegression())

pipeLR.fit(BEBNumM_train, TotalGHGEmissions_train)

TotalGHGEmissions_predLR = pipeLR.predict(BEBNumM_test)

LRr2 = metrics.r2_score(TotalGHGEmissions_test, TotalGHGEmissions_predLR)
print("r2 :", LRr2)
LRrmse = metrics.mean_squared_error(TotalGHGEmissions_test,
                                    TotalGHGEmissions_predLR,
                                    squared=False)
print("rmse :", LRrmse)

fig = px.scatter(
    x=TotalGHGEmissions_predLR.squeeze(),
    y=TotalGHGEmissions_test.squeeze(),
    labels={
        'x': f'{TotalGHGEmissions_predLR=}'.partition('=')[0],
        'y': f'{TotalGHGEmissions_test=}'.partition('=')[0]
    },
    title=
    "Visualisation des données d'émissions prédites par le modèle de régression linéaire<br>vs les données test"
)
fig.show()


r2 : 0.3436931006381113
rmse : 408.1999910653216


#### 1.1.1.2 Comparaison de différents modèles sur les émissions brutes

In [42]:
paramlistEmissions = [{
    'ridge__alpha': np.logspace(1, 5, 100)
}, {
    'lasso__alpha': np.logspace(1, 3, 100)
}, {
    'elasticnet__alpha': np.logspace(0, 3, 100),
    'elasticnet__l1_ratio': np.linspace(0.1, 1, 6)
}, {
    'kneighborsregressor__n_neighbors':
    np.linspace(1, 100, dtype=int)
}, {
    'randomforestregressor__n_estimators':
    np.logspace(0, 3, 10, dtype=int),
    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
}, {
    'adaboostregressor__n_estimators':
    np.logspace(0, 2, 30, dtype=int),
    'adaboostregressor__loss': ['linear', 'square', 'exponential']
}, {
    'gradientboostingregressor__n_estimators':
    np.logspace(1, 3, 5, dtype=int),
    'gradientboostingregressor__loss':
    ['squared_error', 'absolute_error', 'huber', 'quantile']
}]

ResultEmissions = compareGridModels([
    Ridge(),
    Lasso(),
    ElasticNet(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor()
], scaler, BEBNumM_train, BEBNumM_test, TotalGHGEmissions_train,
                                    TotalGHGEmissions_test,
                                    'TotalGHGEmissions', paramlistEmissions,
                                    score, write_data, 'Emissions')


      paramètre      Ridge()
0  ridge__alpha  1668.100537
               R²       RMSE         MAE      MAE%  FitTime(s)
Ridge()  0.295493  422.92387  102.118478  3.521498    1.479223


      paramètre    Lasso()
0  lasso__alpha  55.908102
               R²        RMSE         MAE      MAE%  FitTime(s)
Lasso()  0.279366  427.736951  102.370419  3.714799    1.686229


              paramètre  ElasticNet()
0     elasticnet__alpha     57.223677
1  elasticnet__l1_ratio      1.000000
                    R²        RMSE         MAE      MAE%  FitTime(s)
ElasticNet()  0.278567  427.974012  102.418997  3.733564    11.13161


                          paramètre  KNeighborsRegressor()
0  kneighborsregressor__n_neighbors                      5
                             R²        RMSE        MAE      MAE%  FitTime(s)
KNeighborsRegressor()  0.455498  371.808447  93.292871  2.247549    0.848904


                             paramètre RandomForestRegressor()
0  randomforestregressor__n_estimators                     464
1  randomforestregressor__max_features                    log2
                               R²       RMSE        MAE      MAE%  FitTime(s)
RandomForestRegressor()  0.619089  310.97899  62.600899  1.598604   76.026145


                         paramètre AdaBoostRegressor()
0  adaboostregressor__n_estimators                   5
1          adaboostregressor__loss         exponential
                           R²        RMSE       MAE      MAE%  FitTime(s)
AdaBoostRegressor()  0.571762  329.732804  96.96846  4.540329   10.608624


                                 paramètre GradientBoostingRegressor()
0  gradientboostingregressor__n_estimators                         100
1          gradientboostingregressor__loss                       huber
                                  R²        RMSE        MAE      MAE%  \
GradientBoostingRegressor()  0.26244  432.731204  86.727787  2.199931   

                             FitTime(s)  
GradientBoostingRegressor()   47.235661  


### 1.1.2 Émissions au log

In [43]:
TotalGHGEmissions_train_log = np.log(TotalGHGEmissions_train)
TotalGHGEmissions_test_log = np.log(TotalGHGEmissions_test)


#### 1.1.2.1 Modèle LinearRegression

In [44]:
# modèle régression linéaire
pipeLR = make_pipeline(scaler, LinearRegression())

pipeLR.fit(BEBNumM_train, TotalGHGEmissions_train_log)

TotalGHGEmissions_pred_logLR = pipeLR.predict(BEBNumM_test)

LRr2_log = metrics.r2_score(TotalGHGEmissions_test_log,
                            TotalGHGEmissions_pred_logLR)
print("r2 :", LRr2)
LRrmse_log = metrics.mean_squared_error(TotalGHGEmissions_test_log,
                                        TotalGHGEmissions_pred_logLR,
                                        squared=False)
print("rmse :", LRrmse)

fig = px.scatter(
    x=TotalGHGEmissions_pred_logLR.squeeze(),
    y=TotalGHGEmissions_test_log.squeeze(),
    labels={
        'x': f'{TotalGHGEmissions_pred_logLR=}'.partition('=')[0],
        'y': f'{TotalGHGEmissions_test_log=}'.partition('=')[0]
    },
    title=
    "Visualisation des données d'émissions prédites par le modèle de régression linéaire<br>vs les données test"
)
fig.show()

r2 : 0.3436931006381113
rmse : 408.1999910653216


#### 1.1.2.2 Comparaison des modèles sur les émissions au log

In [45]:
paramlistEmissions_log = [{
    'ridge__alpha': np.logspace(3, 6, 100)
}, {
    'lasso__alpha': np.logspace(-2, 1, 100)
}, {
    'elasticnet__alpha': np.logspace(-1, 2, 10),
    'elasticnet__l1_ratio': np.linspace(0.1, 1, 6)
}, {
    'kneighborsregressor__n_neighbors':
    np.linspace(1, 100, dtype=int)
}, {
    'randomforestregressor__n_estimators':
    np.logspace(0, 3, 10, dtype=int),
    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
}, {
    'adaboostregressor__n_estimators':
    np.logspace(0, 2, 30, dtype=int),
    'adaboostregressor__loss': ['linear', 'square', 'exponential']
}, {
    'gradientboostingregressor__n_estimators':
    np.logspace(2, 4, 5, dtype=int),
    'gradientboostingregressor__loss':
    ['squared_error', 'absolute_error', 'huber', 'quantile']
}]

ResultEmissions_log = compareGridModels([
    Ridge(),
    Lasso(),
    ElasticNet(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor()
], scaler, BEBNumM_train, BEBNumM_test, TotalGHGEmissions_train_log,
                                        TotalGHGEmissions_test_log,
                                        'TotalGHGEmissions_log',
                                        paramlistEmissions_log, score,
                                        write_data, 'Emissions', '_log')


      paramètre      Ridge()
0  ridge__alpha  8697.490026
               R²        RMSE         MAE      MAE%  FitTime(s)
Ridge()  0.187929  510.775884  106.296686  1.885345    1.424896


      paramètre   Lasso()
0  lasso__alpha  0.327455
               R²        RMSE         MAE      MAE%  FitTime(s)
Lasso()  0.143055  503.699244  104.901725  1.994817    2.709946


              paramètre  ElasticNet()
0     elasticnet__alpha           1.0
1  elasticnet__l1_ratio           0.1
                    R²        RMSE         MAE      MAE%  FitTime(s)
ElasticNet()  0.182967  504.076517  105.305878  1.900049    0.961438


                          paramètre  KNeighborsRegressor()
0  kneighborsregressor__n_neighbors                      1
                             R²        RMSE        MAE      MAE%  FitTime(s)
KNeighborsRegressor()  0.594067  308.498237  56.671158  0.951037    0.818843


                             paramètre RandomForestRegressor()
0  randomforestregressor__n_estimators                     464
1  randomforestregressor__max_features                    log2
                               R²        RMSE        MAE      MAE%  FitTime(s)
RandomForestRegressor()  0.682226  383.943359  65.911979  0.855877   62.820486


                         paramètre AdaBoostRegressor()
0  adaboostregressor__n_estimators                   3
1          adaboostregressor__loss         exponential
                          R²        RMSE        MAE      MAE%  FitTime(s)
AdaBoostRegressor()  0.43159  438.303702  89.287461  1.407277   12.058541


                                 paramètre GradientBoostingRegressor()
0  gradientboostingregressor__n_estimators                       10000
1          gradientboostingregressor__loss               squared_error
                                   R²        RMSE        MAE      MAE%  \
GradientBoostingRegressor()  0.719778  341.359857  50.224669  0.908426   

                             FitTime(s)  
GradientBoostingRegressor()  497.820865  


In [46]:
EmissionsScores = pd.DataFrame().append(
    [val for key, val in ResultEmissions.items() if key.startswith('Score')])


In [47]:
EmissionsScoresLog = pd.DataFrame().append([
    val for key, val in ResultEmissions_log.items() if key.startswith('Score')
]).rename('{}_log'.format)


In [48]:
EmissionsCompareScores = EmissionsScores.append(EmissionsScoresLog)
if write_data is True:
    EmissionsCompareScores.to_latex('./Tableaux/EmissionsScoresModèles.tex')
EmissionsCompareScores


Unnamed: 0,R²,RMSE,MAE,MAE%,FitTime(s)
Ridge(),0.295493,422.92387,102.118478,3.521498,1.479223
Lasso(),0.279366,427.736951,102.370419,3.714799,1.686229
ElasticNet(),0.278567,427.974012,102.418997,3.733564,11.13161
KNeighborsRegressor(),0.455498,371.808447,93.292871,2.247549,0.848904
RandomForestRegressor(),0.619089,310.97899,62.600899,1.598604,76.026145
AdaBoostRegressor(),0.571762,329.732804,96.96846,4.540329,10.608624
GradientBoostingRegressor(),0.26244,432.731204,86.727787,2.199931,47.235661
Ridge()_log,0.187929,510.775884,106.296686,1.885345,1.424896
Lasso()_log,0.143055,503.699244,104.901725,1.994817,2.709946
ElasticNet()_log,0.182967,504.076517,105.305878,1.900049,0.961438


In [49]:
fig = make_subplots(len(EmissionsScores.columns),
                    2,
                    column_titles=("Émissions brutes", "Émissions log"),
                    row_titles=(EmissionsScores.columns.to_list()),
                    shared_xaxes=True)
for r, c in enumerate(EmissionsScores):
    fig.add_trace(go.Bar(x=EmissionsScores.index, y=EmissionsScores[c]),
                  row=r + 1,
                  col=1)
    fig.add_trace(go.Bar(x=EmissionsScoresLog.index, y=EmissionsScoresLog[c]),
                  row=r + 1,
                  col=2)
fig.update_layout(title_text="Comparaison des scores des modèles d'émissions",
                  showlegend=False,
                  height=700)
fig.show()
if write_data is True:
    fig.write_image('./Figures/EmissionsCompareScores.pdf', height=700)


Afin de voir si l'energy star score permet d'améliorer le modèle nous allons
voir si le meilleurs modèle est amélioré avec cette variable.
Je choisi d'utiliser le modèle RandomForestRegressor avec la variable au log
car c'est le modèle ayant le rapport erreur / temps de calcul le plus intéressant

In [50]:
BEBESSNum = pd.read_csv('BEBESSNum.csv')

BEBESSNumM = BEBESSNum.drop(
    columns={'SiteEnergyUse(kBtu)', 'TotalGHGEmissions'})
SiteEnergyUseESS = np.array(BEBESSNum['SiteEnergyUse(kBtu)']).reshape(-1, 1)
TotalGHGEmissionsESS = np.array(BEBESSNum.TotalGHGEmissions).reshape(-1, 1)

BEBESSNumM_train, BEBESSNumM_test, TotalGHGEmissionsESS_train, TotalGHGEmissionsESS_test = train_test_split(
    BEBESSNumM, TotalGHGEmissionsESS, test_size=.2)

TotalGHGEmissionsESS_train_log = np.log(TotalGHGEmissionsESS_train)
TotalGHGEmissionsESS_test_log = np.log(TotalGHGEmissionsESS_test)


In [51]:
BestParamEmissionsRF_log = ResultEmissions_log[
    'BestParamRandomForestRegressor'].set_index('paramètre')
paramlistEmissionsESS_log = [{
    'randomforestregressor__n_estimators': [
        int(BestParamEmissionsRF_log.
            loc['randomforestregressor__n_estimators'].values)
    ],
    'randomforestregressor__max_features': [
        *BestParamEmissionsRF_log.loc[
            'randomforestregressor__max_features', :].values
    ]
}]
ResultEmissionsESS_log = compareGridModels([RandomForestRegressor()],
                                           scaler,
                                           BEBESSNumM_train,
                                           BEBESSNumM_test,
                                           TotalGHGEmissionsESS_train_log,
                                           TotalGHGEmissionsESS_test_log,
                                           'TotalGHGEmissionsESS_log',
                                           paramlistEmissionsESS_log,
                                           score,
                                           write_data=write_data,
                                           prefix='EmissionsESS',
                                           suffix='_log',
                                           plotfigRMSE=False)


                             paramètre RandomForestRegressor()
0  randomforestregressor__n_estimators                     464
1  randomforestregressor__max_features                    log2
                              R²        RMSE        MAE      MAE%  FitTime(s)
RandomForestRegressor()  0.69585  214.547426  46.659052  0.757669    2.429149


In [52]:
EmissionsScoresLogESS = pd.DataFrame().append([
    val for key, val in ResultEmissionsESS_log.items()
    if key.startswith('Score')
]).rename('{}_logESS'.format)
CompareScoresESS = EmissionsScoresLog.append(EmissionsScoresLogESS).drop(
    columns=('FitTime(s)')).loc[[
        'RandomForestRegressor()_log', 'RandomForestRegressor()_logESS'
    ]]


In [53]:
fig = make_subplots(1,
                    len(CompareScoresESS.columns),
                    column_titles=(CompareScoresESS.columns.to_list()))
for c, col in enumerate(CompareScoresESS.columns):
    fig.add_trace(go.Bar(x=CompareScoresESS.index, y=CompareScoresESS[col]),
                  row=1,
                  col=c + 1)
fig.update_layout(
    title_text="Comparaison avec et sans ajout de l'energy score stars",
    showlegend=False)
fig.show()
if write_data is True:
    fig.write_image('./Figures/EmissionsCompareScoresESS.pdf')


# 2. Modèle de prédiction sur la consommation énergétique (SiteEnergyUse)
## 2.1 Avec les données numériques uniquement
### 2.1.1 Consommation énergétique brute

In [54]:
BEBNumM_train, BEBNumM_test, SiteEnergyUse_train, SiteEnergyUse_test = train_test_split(
    BEBNumM, SiteEnergyUse, test_size=.2)


#### 2.1.1.1 Modèle LinearRegression

In [55]:
#modèle régression linéaire
pipeLR = make_pipeline(scaler, LinearRegression())

pipeLR.fit(BEBNumM_train, SiteEnergyUse_train)

SiteEnergyUse_predLR = pipeLR.predict(BEBNumM_test)

LRr2 = metrics.r2_score(SiteEnergyUse_test, SiteEnergyUse_predLR)
print("r2 :", LRr2)
LRrmse = metrics.mean_squared_error(SiteEnergyUse_test,
                                    SiteEnergyUse_predLR,
                                    squared=False)
print("rmse :", LRrmse)

fig = px.scatter(
    x=SiteEnergyUse_predLR.squeeze(),
    y=SiteEnergyUse_test.squeeze(),
    labels={
        'x': f'{SiteEnergyUse_predLR=}'.partition('=')[0],
        'y': f'{SiteEnergyUse_test=}'.partition('=')[0]
    },
    title=
    'Visualisation des données de consommation prédites par le modèle de régression linéaire<br>vs les données test'
)
fig.show()

r2 : 0.34562089403357044
rmse : 11516445.994541584


#### 2.1.1.2 Comparaison des modèles sur la consommation

In [56]:
paramlistConso = [{
    'ridge__alpha': np.logspace(-3, 5, 100)
}, {
    'lasso__alpha': np.logspace(0.1, 3, 100)
}, {
    'elasticnet__alpha': np.logspace(-3, 3, 200),
    'elasticnet__l1_ratio': np.linspace(0.1, 1, 6)
}, {
    'kneighborsregressor__n_neighbors':
    np.linspace(1, 100, dtype=int)
}, {
    'randomforestregressor__n_estimators':
    np.logspace(0, 3, 10, dtype=int),
    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
}, {
    'adaboostregressor__n_estimators':
    np.logspace(0, 2, 30, dtype=int),
    'adaboostregressor__loss': ['linear', 'square', 'exponential']
}, {
    'gradientboostingregressor__n_estimators':
    np.logspace(2, 4, 5, dtype=int),
    'gradientboostingregressor__loss':
    ['squared_error', 'absolute_error', 'huber', 'quantile']
}]

ResultConso = compareGridModels([
    Ridge(),
    Lasso(),
    ElasticNet(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor()
], scaler, BEBNumM_train, BEBNumM_test, SiteEnergyUse_train,
                                SiteEnergyUse_test, 'SiteEnergyUse',
                                paramlistConso, score, write_data, 'Conso')


      paramètre     Ridge()
0  ridge__alpha  453.487851
               R²          RMSE           MAE     MAE%  FitTime(s)
Ridge()  0.397428  1.105117e+07  3.065690e+06  0.93686    1.510738


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

      paramètre  Lasso()
0  lasso__alpha   1000.0
               R²          RMSE           MAE      MAE%  FitTime(s)
Lasso()  0.345757  1.151525e+07  3.368768e+06  1.032608    3.927762


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

              paramètre  ElasticNet()
0     elasticnet__alpha      0.128989
1  elasticnet__l1_ratio      0.100000
                    R²          RMSE           MAE      MAE%  FitTime(s)
ElasticNet()  0.401728  1.101167e+07  3.031642e+06  0.927355   46.135672


                          paramètre  KNeighborsRegressor()
0  kneighborsregressor__n_neighbors                     11
                             R²          RMSE           MAE      MAE%  \
KNeighborsRegressor()  0.460575  1.045609e+07  2.729698e+06  0.832655   

                       FitTime(s)  
KNeighborsRegressor()    0.908688  


                             paramètre RandomForestRegressor()
0  randomforestregressor__n_estimators                     100
1  randomforestregressor__max_features                    log2
                               R²          RMSE           MAE      MAE%  \
RandomForestRegressor()  0.693469  7.882081e+06  2.008165e+06  0.570979   

                         FitTime(s)  
RandomForestRegressor()   71.125097  


                         paramètre AdaBoostRegressor()
0  adaboostregressor__n_estimators                   4
1          adaboostregressor__loss              square
                           R²          RMSE           MAE      MAE%  \
AdaBoostRegressor()  0.244788  1.237195e+07  3.697199e+06  1.515518   

                     FitTime(s)  
AdaBoostRegressor()     9.95621  


                                 paramètre GradientBoostingRegressor()
0  gradientboostingregressor__n_estimators                        1000
1          gradientboostingregressor__loss                       huber
                                   R²          RMSE           MAE      MAE%  \
GradientBoostingRegressor()  0.613789  8.847404e+06  2.080499e+06  0.634642   

                             FitTime(s)  
GradientBoostingRegressor()  463.124956  


### 2.1.2 Consommation énergétique au log

In [57]:
SiteEnergyUse_train_log = np.log(SiteEnergyUse_train)
SiteEnergyUse_test_log = np.log(SiteEnergyUse_test)


#### 2.1.2.1 Modèle LinearRegression

In [58]:
# modèle régression linéaire
pipeLR = make_pipeline(scaler, LinearRegression())

pipeLR.fit(BEBNumM_train, SiteEnergyUse_train_log)

SiteEnergyUse_pred_logLR = pipeLR.predict(BEBNumM_test)

LRr2_log = metrics.r2_score(SiteEnergyUse_test_log, SiteEnergyUse_pred_logLR)
print("r2 :", LRr2)
LRrmse_log = metrics.mean_squared_error(SiteEnergyUse_test_log,
                                        SiteEnergyUse_pred_logLR,
                                        squared=False)
print("rmse :", LRrmse)

fig = px.scatter(
    x=SiteEnergyUse_pred_logLR.squeeze(),
    y=SiteEnergyUse_test_log.squeeze(),
    labels={
        'x': f'{SiteEnergyUse_pred_logLR=}'.partition('=')[0],
        'y': f'{SiteEnergyUse_test_log=}'.partition('=')[0]
    },
    title=
    'Visualisation des données de consommation prédites par le modèle de régression linéaire<br>vs les données test'
)
fig.show()

r2 : 0.34562089403357044
rmse : 11516445.994541584


#### 2.1.2.2 Comparaison des modèles sur la consommation au log

In [59]:
paramlistConso_log = [{
    'ridge__alpha': np.logspace(3, 5, 100)
}, {
    'lasso__alpha': np.logspace(-1, 1, 100)
}, {
    'elasticnet__alpha': np.logspace(-1, 1, 100),
    'elasticnet__l1_ratio': np.linspace(0.1, 1, 6)
}, {
    'kneighborsregressor__n_neighbors':
    np.linspace(1, 100, dtype=int)
}, {
    'randomforestregressor__n_estimators':
    np.logspace(0, 3, 10, dtype=int),
    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
}, {
    'adaboostregressor__n_estimators':
    np.logspace(0, 2, 30, dtype=int),
    'adaboostregressor__loss': ['linear', 'square', 'exponential']
}, {
    'gradientboostingregressor__n_estimators':
    np.logspace(1, 4, 5, dtype=int),
    'gradientboostingregressor__loss':
    ['squared_error', 'absolute_error', 'huber', 'quantile']
}]

ResultConso_log = compareGridModels([
    Ridge(),
    Lasso(),
    ElasticNet(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor()
], scaler, BEBNumM_train, BEBNumM_test, SiteEnergyUse_train_log,
                                    SiteEnergyUse_test_log,
                                    'SiteEnergyUse_log', paramlistConso_log,
                                    score, write_data, 'Conso', '_log')


      paramètre      Ridge()
0  ridge__alpha  7054.802311
               R²          RMSE           MAE      MAE%  FitTime(s)
Ridge()  0.356094  4.497285e+07  5.100358e+06  0.940923    1.577816


      paramètre   Lasso()
0  lasso__alpha  0.132194
               R²          RMSE           MAE     MAE%  FitTime(s)
Lasso()  0.360685  4.108465e+07  4.971449e+06  0.93604    1.653476


              paramètre  ElasticNet()
0     elasticnet__alpha      0.811131
1  elasticnet__l1_ratio      0.100000
                    R²          RMSE           MAE      MAE%  FitTime(s)
ElasticNet()  0.349983  3.403045e+07  4.642830e+06  0.942874     9.52443


                          paramètre  KNeighborsRegressor()
0  kneighborsregressor__n_neighbors                      1
                             R²          RMSE           MAE      MAE%  \
KNeighborsRegressor()  0.740067  1.035418e+07  1.675475e+06  0.421371   

                       FitTime(s)  
KNeighborsRegressor()    0.863157  


                             paramètre RandomForestRegressor()
0  randomforestregressor__n_estimators                    1000
1  randomforestregressor__max_features                    log2
                               R²          RMSE           MAE      MAE%  \
RandomForestRegressor()  0.835988  8.796814e+06  1.742198e+06  0.340946   

                         FitTime(s)  
RandomForestRegressor()   63.285734  


                         paramètre AdaBoostRegressor()
0  adaboostregressor__n_estimators                   9
1          adaboostregressor__loss              linear
                           R²          RMSE           MAE      MAE%  \
AdaBoostRegressor()  0.643284  1.137269e+07  2.602085e+06  0.576159   

                     FitTime(s)  
AdaBoostRegressor()   11.024621  


                                 paramètre GradientBoostingRegressor()
0  gradientboostingregressor__n_estimators                       10000
1          gradientboostingregressor__loss                       huber
                                   R²        RMSE           MAE      MAE%  \
GradientBoostingRegressor()  0.843084  6648407.18  1.414319e+06  0.303802   

                             FitTime(s)  
GradientBoostingRegressor()  384.811315  


In [60]:
ConsoScores = pd.DataFrame().append(
    [val for key, val in ResultConso.items() if key.startswith('Score')])


In [61]:
ConsoScoresLog = pd.DataFrame().append([
    val for key, val in ResultConso_log.items() if key.startswith('Score')
]).rename('{}_log'.format)


In [62]:
ConsoCompareScores = ConsoScores.append(ConsoScoresLog)
if write_data is True:
    ConsoCompareScores.to_latex('./Tableaux/ConsoScoresModèles.tex')
ConsoCompareScores


Unnamed: 0,R²,RMSE,MAE,MAE%,FitTime(s)
Ridge(),0.397428,11051170.0,3065690.0,0.93686,1.510738
Lasso(),0.345757,11515250.0,3368768.0,1.032608,3.927762
ElasticNet(),0.401728,11011670.0,3031642.0,0.927355,46.135672
KNeighborsRegressor(),0.460575,10456090.0,2729698.0,0.832655,0.908688
RandomForestRegressor(),0.693469,7882081.0,2008165.0,0.570979,71.125097
AdaBoostRegressor(),0.244788,12371950.0,3697199.0,1.515518,9.95621
GradientBoostingRegressor(),0.613789,8847404.0,2080499.0,0.634642,463.124956
Ridge()_log,0.356094,44972850.0,5100358.0,0.940923,1.577816
Lasso()_log,0.360685,41084650.0,4971449.0,0.93604,1.653476
ElasticNet()_log,0.349983,34030450.0,4642830.0,0.942874,9.52443


In [63]:
fig = make_subplots(len(ConsoScores.columns),
                    2,
                    column_titles=("Consommation brute", "Consommation log"),
                    row_titles=(ConsoScores.columns.to_list()),
                    shared_xaxes=True)
for r, c in enumerate(ConsoScores):
    fig.add_trace(go.Bar(x=ConsoScores.index, y=ConsoScores[c]),
                  row=r + 1,
                  col=1)
    fig.add_trace(go.Bar(x=ConsoScoresLog.index, y=ConsoScoresLog[c]),
                  row=r + 1,
                  col=2)
fig.update_layout(
    title_text="Comparaison des scores des modèles de consommation",
    showlegend=False,
    height=700)
fig.show()
if write_data is True:
    fig.write_image('./Figures/ConsoCompareScores.pdf', height=700)


Afin de voir si l'energy star score permet d'améliorer le modèle nous allons
voir si le meilleurs modèle est amélioré avec cette variable.
Je choisi d'utiliser le modèle RandomForestRegressor avec la variable brute
car c'est le modèle ayant le rapport erreur / temps de calcul le plus intéressant

In [64]:
BEBESSNumM_train, BEBESSNumM_test, SiteEnergyUseESS_train, SiteEnergyUseESS_test = train_test_split(
    BEBESSNumM, SiteEnergyUseESS, test_size=.2)

SiteEnergyUseESS_train = np.log(SiteEnergyUseESS_train)
SiteEnergyUseESS_test = np.log(SiteEnergyUseESS_test)


In [69]:
BestParamConsoGB = ResultConso[
    'BestParamRandomForestRegressor'].set_index('paramètre')
paramlistConsoESS = [{
    'randomforestregressor__n_estimators': [
        int(BestParamConsoGB.loc['randomforestregressor__n_estimators'].
            values)
    ],
    'randomforestregressor__max_features':
    [*BestParamConsoGB.loc['randomforestregressor__max_features', :].values]
}]
ResultConsoESS = compareGridModels([RandomForestRegressor()],
                                   scaler,
                                   BEBESSNumM_train,
                                   BEBESSNumM_test,
                                   SiteEnergyUseESS_train,
                                   SiteEnergyUseESS_test,
                                   'SiteEnergyUseESS',
                                   paramlistConsoESS,
                                   score,
                                   write_data=write_data,
                                   prefix='ConsoESS',
                                   suffix='_ESS',
                                   plotfigRMSE=False)


                             paramètre RandomForestRegressor()
0  randomforestregressor__n_estimators                     100
1  randomforestregressor__max_features                    log2
                               R²      RMSE       MAE     MAE%  FitTime(s)
RandomForestRegressor()  0.873911  0.382414  0.268092  0.01872    0.658074


In [70]:
ConsoScoresESS = pd.DataFrame().append([
    val for key, val in ResultConsoESS.items() if key.startswith('Score')
]).rename('{}_ESS'.format)
CompareConsoScoresESS = ConsoScores.append(ConsoScoresESS).drop(
    columns=('FitTime(s)')).loc[[
        'RandomForestRegressor()', 'RandomForestRegressor()_ESS'
    ]]


In [71]:
fig = make_subplots(1,
                    len(CompareScoresESS.columns),
                    column_titles=(CompareScoresESS.columns.to_list()),
                    horizontal_spacing=.1)
for c, col in enumerate(CompareScoresESS.columns):
    fig.add_trace(go.Bar(x=CompareScoresESS.index, y=CompareScoresESS[col]),
                  row=1,
                  col=c + 1)
fig.update_layout(
    title_text="Comparaison avec et sans ajout de l'energy score stars",
    showlegend=False)
fig.show()
if write_data is True:
    fig.write_image('./Figures/ConsoCompareScoresESS.pdf')
