In [1]:
import os
import pandas as pd

pd.options.plotting.backend = 'plotly'
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn import metrics
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, \
                             GradientBoostingRegressor

from Pélec_04_fonctions import reg_modelGrid, visuRMSEGrid, compareModels


In [2]:
write_data = True

if write_data is True:
    try:
        os.mkdir("./Figures/")
    except OSError as error:
        print(error)
    try:
        os.mkdir("./Tableaux/")
    except OSError as error:
        print(error)
else:
    print("""Visualisation uniquement dans le notebook
    pas de création de figures ni de tableaux""")


[Errno 17] File exists: './Figures/'
[Errno 17] File exists: './Tableaux/'


In [3]:
BEBNum = pd.read_csv('BEBNum.csv')

BEBNumM = BEBNum.drop(columns=['SiteEnergyUse(kBtu)', 'TotalGHGEmissions'])
SiteEnergyUse = np.array(BEBNum['SiteEnergyUse(kBtu)']).reshape(-1, 1)
TotalGHGEmissions = np.array(BEBNum.TotalGHGEmissions).reshape(-1, 1)

BEBNumM_train, BEBNumM_test, TotalGHGEmissions_train, TotalGHGEmissions_test = train_test_split(
    BEBNumM, TotalGHGEmissions, test_size=.2)

score = 'neg_root_mean_squared_error'


In [4]:
# Scaler moins sensible aux outlier d'après la doc
scaler = RobustScaler(quantile_range=(10, 90))


# 1. Modèle de prédiction sur les émissions (TotalGHGEmissions)
## 1.1 Avec les données numériques uniquement
### 1.1.1 Émissions brutes

#### 1.1.1.1 Modèle LinearRegression

In [5]:
# modèle régression linéaire
pipeLR = make_pipeline(scaler, LinearRegression())

pipeLR.fit(BEBNumM_train, TotalGHGEmissions_train)

TotalGHGEmissions_predLR = pipeLR.predict(BEBNumM_test)

LRr2 = metrics.r2_score(TotalGHGEmissions_test, TotalGHGEmissions_predLR)
print("r2 :", LRr2)
LRrmse = metrics.mean_squared_error(TotalGHGEmissions_test,
                                    TotalGHGEmissions_predLR,
                                    squared=False)
print("rmse :", LRrmse)

fig = px.scatter(
    x=TotalGHGEmissions_predLR.squeeze(),
    y=TotalGHGEmissions_test.squeeze(),
    labels={
        'x': f'{TotalGHGEmissions_predLR=}'.partition('=')[0],
        'y': f'{TotalGHGEmissions_test=}'.partition('=')[0]
    },
    title=
    "Visualisation des données d'émissions prédites par le modèle de régression linéaire<br>vs les données test"
)
fig.show()


r2 : 0.34345539988401874
rmse : 319.37459095409235


#### 1.1.1.2 Comparaison de différents modèles sur les émissions brutes

In [6]:
# paramètre Ridge
alphasridge = np.logspace(1, 5, 100)
# paramètre Lasso
alphaslasso = np.logspace(0, 3, 100)
# paramètre ElasticNet
alphasEN = np.logspace(0, 3, 100)
l1ratioEN = np.linspace(0.1, 1, 6)
# paramètre kNN
n_neighbors = np.linspace(1, 100, dtype=int)
# paramètre RandomForest
n_estimatorsRF = np.logspace(0, 3, 10, dtype=int)
# paramètre AdaBoost
n_estimatorsAB = np.logspace(0, 2, 30, dtype=int)
# paramètre GradientBoost
n_estimatorsGB = np.logspace(0, 3, 5, dtype=int)
paramlist = [{
    'ridge__alpha': alphasridge
}, {
    'lasso__alpha': alphaslasso
}, {
    'elasticnet__alpha': alphasEN,
    'elasticnet__l1_ratio': l1ratioEN
}, {
    'kneighborsregressor__n_neighbors': n_neighbors
}, {
    'randomforestregressor__n_estimators': n_estimatorsRF,
    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
}, {
    'adaboostregressor__n_estimators': n_estimatorsAB,
    'adaboostregressor__loss': ['linear', 'square', 'exponential']
}, {
    'gradientboostingregressor__n_estimators':
    n_estimatorsGB,
    'gradientboostingregressor__loss':
    ['squared_error', 'absolute_error', 'huber', 'quantile']
}]
ResultEmissions = compareModels([
    Ridge(),
    Lasso(),
    ElasticNet(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor()
], RobustScaler(), BEBNumM_train, BEBNumM_test, TotalGHGEmissions_train,
                                TotalGHGEmissions_test, 'TotalGHGEmissions',
                                paramlist, score, write_data, 'Emissions')


      paramètre       Ridge()
0  ridge__alpha  22570.197196
               R²        RMSE        MAE      MAE%
Ridge()  0.360642  315.166768  94.636704  4.998707


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

      paramètre     Lasso()
0  lasso__alpha  162.975083
               R²        RMSE        MAE      MAE%
Lasso()  0.374857  311.643481  94.077156  5.087585


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

              paramètre  ElasticNet()
0     elasticnet__alpha    162.975083
1  elasticnet__l1_ratio      1.000000
                    R²        RMSE        MAE      MAE%
ElasticNet()  0.374857  311.643481  94.077156  5.087585


                          paramètre  KNeighborsRegressor()
0  kneighborsregressor__n_neighbors                      3
                             R²        RMSE        MAE      MAE%
KNeighborsRegressor()  0.291405  331.793034  91.090212  2.916486


                             paramètre RandomForestRegressor()
0  randomforestregressor__n_estimators                     464
1  randomforestregressor__max_features                    auto
                               R²        RMSE        MAE      MAE%
RandomForestRegressor()  0.620998  242.654949  63.626703  2.101126


                         paramètre AdaBoostRegressor()
0  adaboostregressor__n_estimators                   3
1          adaboostregressor__loss         exponential
                          R²        RMSE         MAE      MAE%
AdaBoostRegressor()  0.23274  345.254571  103.137983  5.287356


                                 paramètre GradientBoostingRegressor()
0  gradientboostingregressor__n_estimators                        1000
1          gradientboostingregressor__loss               squared_error
                                   R²        RMSE        MAE      MAE%
GradientBoostingRegressor()  0.750954  196.701713  64.661492  2.971707


### 1.1.2 Émissions au log

In [7]:
TotalGHGEmissions_train_log = np.log(TotalGHGEmissions_train)
TotalGHGEmissions_test_log = np.log(TotalGHGEmissions_test)


#### 1.1.2.1 Modèle LinearRegression

In [8]:
# modèle régression linéaire
pipeLR = make_pipeline(scaler, LinearRegression())

pipeLR.fit(BEBNumM_train, TotalGHGEmissions_train_log)

TotalGHGEmissions_pred_logLR = pipeLR.predict(BEBNumM_test)

LRr2_log = metrics.r2_score(TotalGHGEmissions_test_log,
                            TotalGHGEmissions_pred_logLR)
print("r2 :", LRr2)
LRrmse_log = metrics.mean_squared_error(TotalGHGEmissions_test_log,
                                        TotalGHGEmissions_pred_logLR,
                                        squared=False)
print("rmse :", LRrmse)

fig = px.scatter(
    x=TotalGHGEmissions_pred_logLR.squeeze(),
    y=TotalGHGEmissions_test_log.squeeze(),
    labels={
        'x': f'{TotalGHGEmissions_pred_logLR=}'.partition('=')[0],
        'y': f'{TotalGHGEmissions_test_log=}'.partition('=')[0]
    },
    title=
    "Visualisation des données d'émissions prédites par le modèle de régression linéaire<br>vs les données test"
)
fig.show()

r2 : 0.34345539988401874
rmse : 319.37459095409235


#### 1.1.2.2 Comparaison des modèles sur les émissions au log

In [9]:
alphasridge_log = np.logspace(3, 6, 100)

alphaslasso_log = np.logspace(-2, 1, 100)

alphasEN_log = np.logspace(0, 2, 10)
l1ratioEN_log = np.linspace(0.1, 1, 6)

n_neighbors_log = np.linspace(1, 100, dtype=int)

n_estimatorsRF_log = np.logspace(0, 3, 10, dtype=int)

n_estimatorsAB_log = np.logspace(0, 2, 30, dtype=int)

n_estimatorsGB_log = np.logspace(0, 4, 5, dtype=int)

paramlist_log = [{
    'ridge__alpha': alphasridge_log
}, {
    'lasso__alpha': alphaslasso_log
}, {
    'elasticnet__alpha': alphasEN_log,
    'elasticnet__l1_ratio': l1ratioEN_log
}, {
    'kneighborsregressor__n_neighbors': n_neighbors_log
}, {
    'randomforestregressor__n_estimators':
    n_estimatorsRF_log,
    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
}, {
    'adaboostregressor__n_estimators': n_estimatorsAB_log,
    'adaboostregressor__loss': ['linear', 'square', 'exponential']
}, {
    'gradientboostingregressor__n_estimators':
    n_estimatorsGB_log,
    'gradientboostingregressor__loss':
    ['squared_error', 'absolute_error', 'huber', 'quantile']
}]

ResultEmissions_log = compareModels([
    Ridge(),
    Lasso(),
    ElasticNet(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor()
], RobustScaler(), BEBNumM_train, BEBNumM_test, TotalGHGEmissions_train_log,
                                    TotalGHGEmissions_test_log,
                                    'TotalGHGEmissions_log', paramlist_log,
                                    score, write_data, 'Emissions', '_log')


      paramètre       Ridge()
0  ridge__alpha  70548.023107
               R²        RMSE        MAE      MAE%
Ridge()  0.191699  387.100693  92.997449  2.357548


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

      paramètre   Lasso()
0  lasso__alpha  0.756463
               R²        RMSE        MAE      MAE%
Lasso()  0.159186  394.431198  94.998577  2.415299


              paramètre  ElasticNet()
0     elasticnet__alpha      4.641589
1  elasticnet__l1_ratio      0.100000
                    R²        RMSE        MAE      MAE%
ElasticNet()  0.170956  391.121835  94.283851  2.393138


                          paramètre  KNeighborsRegressor()
0  kneighborsregressor__n_neighbors                     37
                             R²        RMSE        MAE     MAE%
KNeighborsRegressor()  0.428833  334.938437  79.646131  1.68825


                             paramètre RandomForestRegressor()
0  randomforestregressor__n_estimators                     464
1  randomforestregressor__max_features                    log2
                               R²        RMSE        MAE     MAE%
RandomForestRegressor()  0.723608  207.487262  53.787911  1.03762


                         paramètre AdaBoostRegressor()
0  adaboostregressor__n_estimators                   4
1          adaboostregressor__loss         exponential
                           R²       RMSE        MAE      MAE%
AdaBoostRegressor()  0.429842  336.64016  79.677976  1.752035


                                 paramètre GradientBoostingRegressor()
0  gradientboostingregressor__n_estimators                       10000
1          gradientboostingregressor__loss               squared_error
                                   R²        RMSE        MAE      MAE%
GradientBoostingRegressor()  0.752765  178.055289  42.647967  0.911515


In [10]:
Scores = pd.DataFrame().append(
    [val for key, val in ResultEmissions.items() if key.startswith('Score')])


In [11]:
ScoresLog = pd.DataFrame().append([
    val for key, val in ResultEmissions_log.items() if key.startswith('Score')
]).rename('{}_log'.format)


In [12]:
CompareScores = Scores.append(ScoresLog)
if write_data is True:
    CompareScores.to_latex('./Tableaux/EmissionsScoresModèles.tex')
CompareScores


Unnamed: 0,R²,RMSE,MAE,MAE%
Ridge(),0.360642,315.166768,94.636704,4.998707
Lasso(),0.374857,311.643481,94.077156,5.087585
ElasticNet(),0.374857,311.643481,94.077156,5.087585
KNeighborsRegressor(),0.291405,331.793034,91.090212,2.916486
RandomForestRegressor(),0.620998,242.654949,63.626703,2.101126
AdaBoostRegressor(),0.23274,345.254571,103.137983,5.287356
GradientBoostingRegressor(),0.750954,196.701713,64.661492,2.971707
Ridge()_log,0.191699,387.100693,92.997449,2.357548
Lasso()_log,0.159186,394.431198,94.998577,2.415299
ElasticNet()_log,0.170956,391.121835,94.283851,2.393138


In [29]:
fig = make_subplots(4,
                    2,
                    column_titles=("Émissions brutes", "Émissions log"),
                    row_titles=('R²', 'RMSE', 'MAE', 'MAE%'),
                    shared_xaxes=True)
fig.add_trace(go.Bar(x=Scores.index, y=Scores['R²']), row=1, col=1)
fig.add_trace(go.Bar(x=Scores.index, y=Scores['RMSE']), row=2, col=1)
fig.add_trace(go.Bar(x=Scores.index, y=Scores['MAE']), row=3, col=1)
fig.add_trace(go.Bar(x=Scores.index, y=Scores['MAE%']), row=4, col=1)
fig.add_trace(go.Bar(x=ScoresLog.index, y=ScoresLog['R²']), row=1, col=2)
fig.add_trace(go.Bar(x=ScoresLog.index, y=ScoresLog['RMSE']), row=2, col=2)
fig.add_trace(go.Bar(x=ScoresLog.index, y=ScoresLog['MAE']), row=3, col=2)
fig.add_trace(go.Bar(x=ScoresLog.index, y=ScoresLog['MAE%']), row=4, col=2)
fig.update_layout(title_text="Comparaison des scores des modèles d'émissions",
                  showlegend=False)
fig.show()
if write_data is True:
    fig.write_image('./Figures/EmissionsCompareScores.pdf', height=600)


## 1.2 Avec les données catégorielles

In [14]:
BEBCat = pd.read_csv('BEBCat.csv')

BEBCatM = BEBCat.drop(columns=['SiteEnergyUse(kBtu)', 'TotalGHGEmissions'])
SiteEnergyUse = np.array(BEBCat['SiteEnergyUse(kBtu)']).reshape(-1, 1)
TotalGHGEmissions = np.array(BEBCat.TotalGHGEmissions).reshape(-1, 1)

BEBCatM_train, BEBCatM_test, TotalGHGEmissionsCat_train, TotalGHGEmissionsCat_test = train_test_split(
    BEBCatM, TotalGHGEmissions, test_size=.2)

score = 'neg_root_mean_squared_error'


# 2. Modèle de prédiction sur la consommation énergétique (SiteEnergyUse)
## 2.1 Avec les données numériques uniquement
### 2.1.1 Consommation énergétique brute

In [15]:
BEBNumM_train, BEBNumM_test, SiteEnergyUse_train, SiteEnergyUse_test = train_test_split(
    BEBNumM, SiteEnergyUse, test_size=.2)


#### 2.1.1.1 Modèle LinearRegression

In [16]:
#modèle régression linéaire
pipeLR = make_pipeline(scaler, LinearRegression())

pipeLR.fit(BEBNumM_train, SiteEnergyUse_train)

SiteEnergyUse_predLR = pipeLR.predict(BEBNumM_test)

LRr2 = metrics.r2_score(SiteEnergyUse_test, SiteEnergyUse_predLR)
print("r2 :", LRr2)
LRrmse = metrics.mean_squared_error(SiteEnergyUse_test,
                                    SiteEnergyUse_predLR,
                                    squared=False)
print("rmse :", LRrmse)

fig = px.scatter(
    x=SiteEnergyUse_predLR.squeeze(),
    y=SiteEnergyUse_test.squeeze(),
    labels={
        'x': f'{SiteEnergyUse_predLR=}'.partition('=')[0],
        'y': f'{SiteEnergyUse_test=}'.partition('=')[0]
    },
    title=
    'Visualisation des données de consommation prédites par le modèle de régression linéaire<br>vs les données test'
)
fig.show()

r2 : 0.4786442763516887
rmse : 12184262.045657529


#### 2.1.1.2 Comparaison des modèles sur la consommation

In [17]:

alphasridge = np.logspace(-3, 5, 1000)

alphaslasso = np.linspace(0.1, 1, 5)

alphasEN = np.logspace(-3, 3, 200)
l1ratioEN = np.linspace(0, 1, 6)

n_neighbors = np.linspace(1, 100, dtype=int)

n_estimatorsRF = np.logspace(0, 3, 10, dtype=int)

n_estimatorsAB = np.logspace(0, 2, 30, dtype=int)

n_estimatorsGB = np.logspace(1, 3, 10, dtype=int)

ResultConso = compareModels([
    Ridge(),
    Lasso(),
    ElasticNet(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor()
], RobustScaler(), BEBNumM_train, BEBNumM_test, SiteEnergyUse_train,
                            SiteEnergyUse_test, 'SiteEnergyUse', paramlist,
                            score, write_data, 'Conso')


      paramètre     Ridge()
0  ridge__alpha  869.749003
               R²          RMSE           MAE      MAE%
Ridge()  0.475832  1.221708e+07  3.252530e+06  1.198511


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

      paramètre  Lasso()
0  lasso__alpha   1000.0
               R²          RMSE           MAE      MAE%
Lasso()  0.476821  1.220555e+07  3.341978e+06  1.254209


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

              paramètre  ElasticNet()
0     elasticnet__alpha      1.149757
1  elasticnet__l1_ratio      0.820000
                    R²          RMSE           MAE      MAE%
ElasticNet()  0.475329  1.222294e+07  3.242415e+06  1.189816


                          paramètre  KNeighborsRegressor()
0  kneighborsregressor__n_neighbors                      5
                             R²          RMSE           MAE      MAE%
KNeighborsRegressor()  0.414795  1.290880e+07  3.327107e+06  1.286675


                             paramètre RandomForestRegressor()
0  randomforestregressor__n_estimators                      10
1  randomforestregressor__max_features                    log2
                              R²          RMSE           MAE      MAE%
RandomForestRegressor()  0.72604  8.832334e+06  2.036158e+06  0.616399


                         paramètre AdaBoostRegressor()
0  adaboostregressor__n_estimators                   9
1          adaboostregressor__loss              square
                           R²          RMSE           MAE     MAE%
AdaBoostRegressor()  0.597675  1.070337e+07  3.988286e+06  2.71513


                                 paramètre GradientBoostingRegressor()
0  gradientboostingregressor__n_estimators                        1000
1          gradientboostingregressor__loss                       huber
                                   R²          RMSE           MAE      MAE%
GradientBoostingRegressor()  0.500177  1.193000e+07  2.318631e+06  0.718496


### 2.1.2 Consommation énergétique au log

In [18]:
SiteEnergyUse_train_log = np.log(SiteEnergyUse_train)
SiteEnergyUse_test_log = np.log(SiteEnergyUse_test)


#### 2.1.2.1 Modèle LinearRegression

In [19]:
# modèle régression linéaire
pipeLR = make_pipeline(scaler, LinearRegression())

pipeLR.fit(BEBNumM_train, SiteEnergyUse_train_log)

SiteEnergyUse_pred_logLR = pipeLR.predict(BEBNumM_test)

LRr2_log = metrics.r2_score(SiteEnergyUse_test_log, SiteEnergyUse_pred_logLR)
print("r2 :", LRr2)
LRrmse_log = metrics.mean_squared_error(SiteEnergyUse_test_log,
                                        SiteEnergyUse_pred_logLR,
                                        squared=False)
print("rmse :", LRrmse)

fig = px.scatter(
    x=SiteEnergyUse_pred_logLR.squeeze(),
    y=SiteEnergyUse_test_log.squeeze(),
    labels={
        'x': f'{SiteEnergyUse_pred_logLR=}'.partition('=')[0],
        'y': f'{SiteEnergyUse_test_log=}'.partition('=')[0]
    },
    title=
    'Visualisation des données de consommation prédites par le modèle de régression linéaire<br>vs les données test'
)
fig.show()

r2 : 0.4786442763516887
rmse : 12184262.045657529


#### 2.1.2.2 Comparaison des modèles sur la consommation au log

In [20]:

alphasridge_log = np.logspace(-3, 5, 1000)

alphaslasso_log = np.linspace(0.1, 1, 5)

alphasEN_log = np.logspace(-1, 3, 200)
l1ratioEN_log = np.linspace(0, 1, 6)

n_neighbors_log = np.linspace(1, 100, dtype=int)

n_estimatorsRF_log = np.logspace(0, 3, 10, dtype=int)

n_estimatorsAB_log = np.logspace(0, 2, 30, dtype=int)

n_estimatorsGB_log = np.logspace(1, 4, 10, dtype=int)

ResultConso_log = compareModels([
    Ridge(),
    Lasso(),
    ElasticNet(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor()
], RobustScaler(), BEBNumM_train, BEBNumM_test, SiteEnergyUse_train_log,
                                SiteEnergyUse_test_log, 'SiteEnergyUse_log',
                                paramlist_log, score, write_data, 'Conso',
                                '_log')


      paramètre       Ridge()
0  ridge__alpha  70548.023107
               R²          RMSE           MAE     MAE%
Ridge()  0.300531  8.472181e+07  7.420620e+06  1.13701


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

      paramètre   Lasso()
0  lasso__alpha  0.756463
               R²          RMSE           MAE      MAE%
Lasso()  0.256702  7.676353e+07  6.835905e+06  1.170138


              paramètre  ElasticNet()
0     elasticnet__alpha      4.641589
1  elasticnet__l1_ratio      0.100000
                    R²          RMSE           MAE      MAE%
ElasticNet()  0.272732  7.576942e+07  6.950508e+06  1.158143


                          paramètre  KNeighborsRegressor()
0  kneighborsregressor__n_neighbors                      1
                             R²          RMSE           MAE      MAE%
KNeighborsRegressor()  0.735726  1.144424e+07  2.005333e+06  0.471624


                             paramètre RandomForestRegressor()
0  randomforestregressor__n_estimators                     215
1  randomforestregressor__max_features                    sqrt
                               R²          RMSE           MAE      MAE%
RandomForestRegressor()  0.819452  1.121892e+07  2.010972e+06  0.445842


                         paramètre AdaBoostRegressor()
0  adaboostregressor__n_estimators                   7
1          adaboostregressor__loss         exponential
                           R²          RMSE           MAE      MAE%
AdaBoostRegressor()  0.614238  1.306505e+07  2.884004e+06  0.799099


                                 paramètre GradientBoostingRegressor()
0  gradientboostingregressor__n_estimators                       10000
1          gradientboostingregressor__loss                       huber
                                   R²          RMSE           MAE      MAE%
GradientBoostingRegressor()  0.813469  1.038292e+07  1.614394e+06  0.373911


In [21]:
Scores = pd.DataFrame().append(
    [val for key, val in ResultConso.items() if key.startswith('Score')])


In [22]:
ScoresLog = pd.DataFrame().append([
    val for key, val in ResultConso_log.items() if key.startswith('Score')
]).rename('{}_log'.format)


In [28]:
fig = make_subplots(4,
                    2,
                    column_titles=("Consommation brute", "Consommation log2"),
                    row_titles=('R²', 'RMSE', 'MAE', 'MAE%'),
                    shared_xaxes=True)
fig.add_trace(go.Bar(x=Scores.index, y=Scores['R²']), row=1, col=1)
fig.add_trace(go.Bar(x=Scores.index, y=Scores['RMSE']), row=2, col=1)
fig.add_trace(go.Bar(x=Scores.index, y=Scores['MAE']), row=3, col=1)
fig.add_trace(go.Bar(x=Scores.index, y=Scores['MAE%']), row=4, col=1)
fig.add_trace(go.Bar(x=ScoresLog.index, y=ScoresLog['R²']), row=1, col=2)
fig.add_trace(go.Bar(x=ScoresLog.index, y=ScoresLog['RMSE']), row=2, col=2)
fig.add_trace(go.Bar(x=ScoresLog.index, y=ScoresLog['MAE']), row=3, col=2)
fig.add_trace(go.Bar(x=ScoresLog.index, y=ScoresLog['MAE%']), row=4, col=2)
fig.update_layout(
    title_text="Comparaison des scores des modèles de consommation",
    showlegend=False)
fig.show()
if write_data is True:
    fig.write_image('./Figures/ConsoCompareScores.pdf', height=600)