# Modélisation Seattle 
## OpenClassrooms Projet 4 - Hugo REBEIX

<img src="presentation/img/logo_seattle.jpg" style="height:100px">

# Sommaire

### [Bibliothèques](#1_bibli)
### [Fonctions](#1_funcs)
### [Données](#1_donnees)
### [Cibles](#1_tragets)
### [Préparation GridSearch](#1_GS_params)

### [Entrainement](#1_training)
- [Cibles Brutes](#2_raw_targets)
- [Cibles Normalisées](#2_norm_targets)
- [Conclusions](#2_conclusions_train)

### [Refit de XGBoost](#1_refit)
- [Avec EnergyStarScore](#2_refit_ESS)
- [Sans EnergyStarScore](#2_refit_no_ESS)
- [Conclusions du Refit](#2_refit_conclusions)

### [Temps d'entrainement](#1_train_time)

### [Conclusion générale](#1_conclusion)

<a id='1_bibli'></a>

# Import de bibliothèques 📚

In [3]:
import pandas as pd
import numpy as np
import math

import os
import time
import json
import copy

from sklearn.linear_model import ElasticNet, Lasso, LinearRegression
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor, VotingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV, cross_validate
from sklearn.metrics import mean_squared_error, pairwise, r2_score
from sklearn.dummy import DummyRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
# from lightgbm import LGBMRegressor

import mlflow
# import lightgbm as lgb # BUG IMPORT  --> importer avec sklearn

<a id='1_funcs'></a>

# Fonctions ⚙️

In [4]:
def get_naive_baseline(target):
    dum_rgr = DummyRegressor(strategy="mean")
    
    dum_rgr.fit(data_train.drop(columns=[target]), data_train[target])
    y_pred = dum_rgr.predict(data_test.drop(columns=[target]))
    
    r2 = r2_score(data_test[target], y_pred)
    mse = mean_squared_error(data_test[target], y_pred)
    #Score R²
    print("Score R² du dummy Regressor:\n{}\n".format(round(r2, 4)))
    #Mean Squared Error
    print("Root Mean Squared Error du dummy Regressor:\n{}\n".format(math.sqrt(mse)))
    dummy_scores = {"r2_score": r2, "mean_squared_error": mse}
    return dummy_scores

In [5]:
def launch_models_CV(
                    data_train,
                    target,
                    models_params, 
                    default_mode=False, 
                    folds=5, 
                    scorings='neg_root_mean_squared_error',
                    favorite_scoring=None,
                    comment=None,
                    n_jobs=-1):

    # Iterate on model types
    for model_type in models_params.keys():
        # choose target
        if 'SiteEnergyUse(kBtu)' in target:
            mlflow.set_experiment(model_type + '_' + 'energy')
        if 'TotalGHGEmissions' in target:
            mlflow.set_experiment(model_type + '_' + 'emissions')
        
        with mlflow.start_run(run_name=comment):
            
            #Prepare training
            estimator = models_params[model_type]['estimator']
            
            if not default_mode:
                
                #Get parameters grid
                
                params = models_params[model_type]['params']
                regressor = GridSearchCV(
                    estimator=estimator, 
                    param_grid=params, 
                    scoring=scorings,
                    n_jobs=n_jobs,
                    cv=folds,
                    refit=favorite_scoring
                    )

                #Saving gridsearch params in MlFlow Artifact
                    #Deleting estimators object from the artifact (json incompatible)
                models_params_artifact = copy.deepcopy(models_params)
                for model_type_artifact in models_params_artifact.keys():
                    del models_params_artifact[model_type_artifact]['estimator']
                with open('models_params.json', 'w') as fp:
                    json.dump(models_params_artifact, fp)
                
                mlflow.log_artifact('models_params.json')

                #Training
                print('Start training of {} with GridSearch and CV'.format(model_type))
                start = time.time()
                
                regressor.fit(
                    data_train.drop(columns=[target]),
                    data_train[target])
                
                elapsed = time.time() - start
                
                best_estimator = regressor.best_estimator_
                results = pd.DataFrame(regressor.cv_results_)
                
                mlflow.sklearn.log_model(best_estimator, "best_model")
                
                # Get results
                results_best_estimator = results[results['rank_test_{}'.format(favorite_scoring)] == 1]
                for scoring in scorings:
                    mlflow.log_metric(scoring.strip('neg_'), abs(results_best_estimator['mean_test_{}'.format(scoring)].values[0]))
                    print(scoring.strip('neg_') + ' : ', abs(results_best_estimator['mean_test_{}'.format(scoring)].values[0]))
                mlflow.log_metric('training_time', elapsed)
                

                for param, value in regressor.best_params_.items():
                    mlflow.log_param(param, value)
                mlflow.log_param("folds", folds)
                
                print('Successful training and logging of {}, in {} seconds\n\n'.format(model_type, round(elapsed)))
                 
                
            # Default mode
            else:
                print('Start training of {} (default mode)'.format(model_type))
                start = time.time()
                results = cross_validate(
                    estimator,
                    data_train.drop(columns=[target]),
                    data_train[target],
                    scoring=scorings,
                    cv=folds,
                    n_jobs=n_jobs,
                    return_estimator=True
                    )
                elapsed = time.time() - start
                
                for scoring in scorings:
                    print(scoring.strip('neg_'), abs(results['test_{}'.format(scoring)].mean()))
                    mlflow.log_metric(scoring.strip('neg_'), abs(results['test_{}'.format(scoring)].mean()))
                mlflow.log_metric('training_time', elapsed)
                
                for param, value in results['estimator'][0].get_params().items():
                    if param in models_params[model_type]['params'].keys():
                        mlflow.log_param(param, value)
                mlflow.log_param("folds", folds)
                print('Successful training and logging of {}, in {} seconds (Default Mode)\n\n'.format(model_type, round(elapsed)))
                
            mlflow.end_run()

<a id='1_donnees'></a>

# Données 🎁

In [6]:
data = pd.read_csv('data/data_full_post_analisis.csv')
energyScore = data.ENERGYSTARScore # Nous traiterons cette données plus tard

data.drop(columns=['Unnamed: 0', 'CouncilDistrictCode', 'DataYear'], inplace=True)
print("Sans ESS : ", data.shape)
data.dropna(inplace=True)
print("Avec ESS + dropna", data.shape)
data.set_index('OSEBuildingID', inplace=True)


data = data[list(data.dtypes[data.dtypes != "object"].index)]

Sans ESS :  (6509, 98)
Avec ESS + dropna (4959, 98)


<a id='1_targets'></a>

## Cibles 🎯
- SiteEnergyUse(kBtu)
- TotalGHGEmissions

In [7]:
target_energy = 'SiteEnergyUse(kBtu)'
target_emissions = 'TotalGHGEmissions'

target_energy_norm = target_energy + 'Norm'
target_emissions_norm = target_emissions + 'Norm'

# Attention retirer colonne NORM, faire deux DF un avec norm et l'autre sans


#### Retirons les colonnes qui sont équivalentes aux targets:
"Votre prédiction se basera sur les données déclaratives du permis d'exploitation commerciale (taille et usage des bâtiments, mention de travaux récents, date de construction..)"

In [8]:
data.drop(columns=['SiteEUI(kBtu/sf)', 'SiteEUIWN(kBtu/sf)',
       'SourceEUI(kBtu/sf)', 'SourceEUIWN(kBtu/sf)', 
       'SiteEnergyUseWN(kBtu)', 'SteamUse(kBtu)', 'Electricity(kWh)',
       'Electricity(kBtu)', 'NaturalGas(therms)', 'NaturalGas(kBtu)',
       'GHGEmissionsIntensity'], inplace=True)

<a id='1_GS_params'></a>

# Grid Search Params 🔎

In [9]:
models_params = {  
    'random_forest': {
        'params': {
           'max_depth' : [5,10,20],
           'max_features' : ['auto', 'log2', 'sqrt'],
           'min_samples_leaf': [1, 2, 4],
           'min_samples_split': [2, 5, 10],
           'random_state': [41]
           },
        'estimator': RandomForestRegressor(),
    },
                        
    'gradient_boosting': {
         'params': {
             'max_depth' : [5,10,20],
             'learning_rate' : [0.05, 0.1, 0.2],
             'n_iter_no_change': [None, 10],
             'random_state': [41]
             },
         'estimator': GradientBoostingRegressor()
    },
                 
                 
    'xgboost': {
        'params':{
            'max_depth' : [5,10,20],
            'learning_rate' : [0.05, 0.1, 0.2],
            'reg_alpha': [0.1, 0.01, 1],
            'min_child_weight': [1,5,10],
            'random_state': [41]
            },
        'estimator': XGBRegressor()
    },

   'lasso': {
       'params': {
           'alpha': [0.001, 0.01, 0.1],
           'random_state': [41],
           'max_iter': [10000]
           },
       'estimator' : Lasso()
    },
    
    'elastic_net': {
        'params': {
            'alpha' : [0.0001, 0.001, 0.01, 0.1],
            'l1_ratio' : [0.5, 0.8, 0.2],
            'max_iter': [10000]
            },
        'estimator': ElasticNet()
    },
                 
    'kernel_ridge': {
        'params': {
            'alpha': [0.001, 0.01, 0.1],
            'kernel': ['chi2', 'linear', 'polynomial'],
            "degree": [2,3,4]
            },
        'estimator': KernelRidge()
    },
    
    'svr':{
        'params':{
            "kernel":['linear', 'poly', 'rbf', 'sigmoid'],
            "C":[0.1, 1, 10],
            'epsilon':[0.05,0.1,0.3],
            'max_iter': [10000]
        },
        'estimator': SVR()
    }
}

<a id='1_training'></a>

# Entrainement 🏋️‍♀️

Stratégie : 

<img src="plots_storage/strategie.png" style="height:250px">

<a id='2_raw_targets'></a>

## Entrainement avec les cibles brutes 🍣

In [10]:
data_raw_targets = data.drop(columns=[target_energy_norm, target_emissions_norm])

In [11]:
data_raw_targets_energy = data_raw_targets.drop(columns=[target_emissions])
data_train, data_test = train_test_split(data_raw_targets_energy, train_size=0.7, random_state=41)

### Energy ⚡

#### Baseline Naïve

In [12]:
dummy_scores_energy_raw = get_naive_baseline(target_energy)

Score R² du dummy Regressor:
-0.0011

Root Mean Squared Error du dummy Regressor:
8022984.60006515



#### GridSearch

In [11]:
launch_models_CV(
    data_train,
    target_energy,
    models_params, 
    default_mode=False,
    folds=3, 
    scorings=['neg_root_mean_squared_error', 'r2'],
    favorite_scoring='neg_root_mean_squared_error',
    comment='Grid Search CV norm Targets',
    n_jobs=-1)

Start training of random_forest with GridSearch and CV
root_mean_squared_error :  3787355.5709763537
r2 :  0.8284348437379956
Successful training and logging of random_forest, in 38 seconds


Start training of gradient_boosting with GridSearch and CV
root_mean_squared_error :  3496545.8699609437
r2 :  0.8540625209779599
Successful training and logging of gradient_boosting, in 29 seconds


Start training of xgboost with GridSearch and CV
root_mean_squared_error :  3505387.6844208813
r2 :  0.8537859196842699
Successful training and logging of xgboost, in 150 seconds


Start training of lasso with GridSearch and CV


  model = cd_fast.enet_coordinate_descent(


root_mean_squared_error :  4766104.170804801
r2 :  0.7025370207744152
Successful training and logging of lasso, in 11 seconds


Start training of elastic_net with GridSearch and CV


  model = cd_fast.enet_coordinate_descent(


root_mean_squared_error :  4673807.433483823
r2 :  0.7246551277955566
Successful training and logging of elastic_net, in 30 seconds


Start training of kernel_ridge with GridSearch and CV




root_mean_squared_error :  3806138.619182693
r2 :  0.8173761272855532
Successful training and logging of kernel_ridge, in 177 seconds


Start training of svr with GridSearch and CV
root_mean_squared_error :  7988136.133380897
r2 :  0.23209583628043437
Successful training and logging of svr, in 18 seconds






#### Défaut CV

In [12]:
launch_models_CV(
    data_train,
    target_energy,
    models_params, 
    default_mode=True,
    folds=3, 
    scorings=['neg_root_mean_squared_error', 'r2'],
    favorite_scoring='neg_root_mean_squared_error',
    comment='Default mode CV Raw Targets',
    n_jobs=-1)

Start training of random_forest (default mode)
root_mean_squared_error 3707687.8416564264
r2 0.8363947936983442
Successful training and logging of random_forest, in 3 seconds (Default Mode)


Start training of gradient_boosting (default mode)
root_mean_squared_error 3554097.954457966
r2 0.8481204641631255
Successful training and logging of gradient_boosting, in 1 seconds (Default Mode)


Start training of xgboost (default mode)
root_mean_squared_error 3764017.4085265566
r2 0.8307418306464806
Successful training and logging of xgboost, in 1 seconds (Default Mode)


Start training of lasso (default mode)
root_mean_squared_error 4766091.7328449795
r2 0.7025387915423122
Successful training and logging of lasso, in 0 seconds (Default Mode)


Start training of elastic_net (default mode)
root_mean_squared_error 5575384.764665131
r2 0.6209635045615136
Successful training and logging of elastic_net, in 0 seconds (Default Mode)


Start training of kernel_ridge (default mode)
root_mean_squared_er

### Emissions 🌋

In [14]:
data_raw_targets_emissions = data_raw_targets.drop(columns=[target_energy])
data_train, data_test = train_test_split(data_raw_targets_emissions, train_size=0.7, random_state=41)

#### Baseline Naïve

In [15]:
dummy_scores_emissions_raw = get_naive_baseline(target_emissions)

Score R² du dummy Regressor:
-0.0058

Root Mean Squared Error du dummy Regressor:
175.21735155185456



#### GridSearch

In [15]:
launch_models_CV(
    data_train,
    target_emissions,
    models_params, 
    default_mode=False,
    folds=3, 
    scorings=['neg_root_mean_squared_error', 'r2'],
    favorite_scoring='neg_root_mean_squared_error',
    comment='Grid Search CV Raw Targets ESS',
    n_jobs=-1)

Start training of random_forest with GridSearch and CV
root_mean_squared_error :  141.57169318735612
r2 :  0.699660239264849
Successful training and logging of random_forest, in 43 seconds


Start training of gradient_boosting with GridSearch and CV
root_mean_squared_error :  154.00215877057698
r2 :  0.5956488843542401
Successful training and logging of gradient_boosting, in 27 seconds


Start training of xgboost with GridSearch and CV
root_mean_squared_error :  135.92599176090337
r2 :  0.7065354486670213
Successful training and logging of xgboost, in 133 seconds


Start training of lasso with GridSearch and CV
root_mean_squared_error :  184.3265073319852
r2 :  0.42644813487867905
Successful training and logging of lasso, in 3 seconds


Start training of elastic_net with GridSearch and CV
root_mean_squared_error :  181.27851281490226
r2 :  0.4678954107372995
Successful training and logging of elastic_net, in 12 seconds


Start training of kernel_ridge with GridSearch and CV




root_mean_squared_error :  144.3332900582778
r2 :  0.6652289681933299
Successful training and logging of kernel_ridge, in 168 seconds


Start training of svr with GridSearch and CV
root_mean_squared_error :  236.92627738306294
r2 :  0.15962371622620738
Successful training and logging of svr, in 17 seconds




#### Défaut CV

In [16]:
launch_models_CV(
    data_train,
    target_emissions,
    models_params, 
    default_mode=True,
    folds=3, 
    scorings=['neg_root_mean_squared_error', 'r2'],
    favorite_scoring='neg_root_mean_squared_error',
    comment='Default mode CV Raw Targets ESS',
    n_jobs=-1)

Start training of random_forest (default mode)
root_mean_squared_error 149.28092144080696
r2 0.6359618074687896
Successful training and logging of random_forest, in 3 seconds (Default Mode)


Start training of gradient_boosting (default mode)
root_mean_squared_error 142.21110701400053
r2 0.6585727818279444
Successful training and logging of gradient_boosting, in 1 seconds (Default Mode)


Start training of xgboost (default mode)
root_mean_squared_error 149.77176438618542
r2 0.6355819661111831
Successful training and logging of xgboost, in 1 seconds (Default Mode)


Start training of lasso (default mode)
root_mean_squared_error 184.02126852526115
r2 0.44452773509552507
Successful training and logging of lasso, in 0 seconds (Default Mode)


Start training of elastic_net (default mode)
root_mean_squared_error 211.45963139182405
r2 0.31012056127411053
Successful training and logging of elastic_net, in 0 seconds (Default Mode)


Start training of kernel_ridge (default mode)
root_mean_square

<a id='2_norm_targets'></a>

## Targets Normalisés

In [17]:
data_norm_targets = data.drop(columns=[target_energy, target_emissions])

### Energy ⚡

In [18]:
data_norm_targets_energy = data_norm_targets.drop(columns=[target_emissions_norm])
data_train, data_test = train_test_split(data_norm_targets_energy, train_size=0.7, random_state=41)

#### Baseline Naïve

In [19]:
dummy_scores_energy_norm = get_naive_baseline(target_energy_norm)

Score R² du dummy Regressor:
-0.0002

Mean Squared Error du dummy Regressor:
1.1292721305422049



#### Grid Search

In [20]:
launch_models_CV(
    data_train,
    target_energy_norm,
    models_params, 
    default_mode=False,
    folds=3, 
    scorings=['neg_root_mean_squared_error', 'r2'],
    favorite_scoring='neg_root_mean_squared_error',
    comment='Grid Search CV norm Targets ESS',
    n_jobs=-1)

Start training of random_forest with GridSearch and CV
root_mean_squared_error :  0.35934110524235785
r2 :  0.8923373033826493
Successful training and logging of random_forest, in 43 seconds


Start training of gradient_boosting with GridSearch and CV
root_mean_squared_error :  0.33998034198865357
r2 :  0.903622718722673
Successful training and logging of gradient_boosting, in 26 seconds


Start training of xgboost with GridSearch and CV
root_mean_squared_error :  0.3136611138957736
r2 :  0.9179430443365902
Successful training and logging of xgboost, in 109 seconds


Start training of lasso with GridSearch and CV


  model = cd_fast.enet_coordinate_descent(


root_mean_squared_error :  0.5587792259223177
r2 :  0.7397074358554846
Successful training and logging of lasso, in 7 seconds


Start training of elastic_net with GridSearch and CV


  model = cd_fast.enet_coordinate_descent(


root_mean_squared_error :  0.5577405415672507
r2 :  0.740666850126404
Successful training and logging of elastic_net, in 25 seconds


Start training of kernel_ridge with GridSearch and CV


  dual_coef = linalg.solve(K, y, sym_pos=True,


root_mean_squared_error :  0.5597762184143503
r2 :  0.738695319367569
Successful training and logging of kernel_ridge, in 153 seconds


Start training of svr with GridSearch and CV
root_mean_squared_error :  0.6061213877595454
r2 :  0.693816008316137
Successful training and logging of svr, in 24 seconds




#### Défaut CV

In [21]:
launch_models_CV(
    data_train,
    target_energy_norm,
    models_params, 
    default_mode=True,
    folds=3, 
    scorings=['neg_root_mean_squared_error', 'r2'],
    favorite_scoring='neg_root_mean_squared_error',
    comment='Default mode CV norm Targets ESS',
    n_jobs=-1)

Start training of random_forest (default mode)
root_mean_squared_error 0.35904770607756703
r2 0.8924886841609355
Successful training and logging of random_forest, in 3 seconds (Default Mode)


Start training of gradient_boosting (default mode)
root_mean_squared_error 0.3595775129158543
r2 0.8921178458403326
Successful training and logging of gradient_boosting, in 1 seconds (Default Mode)


Start training of xgboost (default mode)
root_mean_squared_error 0.3402930498273669
r2 0.9034879155478249
Successful training and logging of xgboost, in 1 seconds (Default Mode)


Start training of lasso (default mode)
root_mean_squared_error 0.7494590976970071
r2 0.5319976627641075
Successful training and logging of lasso, in 0 seconds (Default Mode)


Start training of elastic_net (default mode)
root_mean_squared_error 0.7483434685736293
r2 0.5333904334840674
Successful training and logging of elastic_net, in 0 seconds (Default Mode)


Start training of kernel_ridge (default mode)
root_mean_squared

### Emissions 🌋

In [2]:
data_norm_targets_emissions = data_norm_targets.drop(columns=[target_energy_norm])
data_train, data_test = train_test_split(data_norm_targets_emissions, train_size=0.7, random_state=41)

NameError: name 'data_norm_targets' is not defined

#### Baseline Naïve

In [23]:
dummy_scores_emissions_norm = get_naive_baseline(target_emissions_norm)

Score R² du dummy Regressor:
-0.0007

Mean Squared Error du dummy Regressor:
1.7727032216472645



#### Grid Search

In [24]:
launch_models_CV(
    data_train,
    target_emissions_norm,
    models_params, 
    default_mode=False,
    folds=3, 
    scorings=['neg_root_mean_squared_error', 'r2'],
    favorite_scoring='neg_root_mean_squared_error',
    comment='Grid Search CV norm Targets ESS',
    n_jobs=-1)

Start training of random_forest with GridSearch and CV
root_mean_squared_error :  0.6916968364744059
r2 :  0.7489890110878195
Successful training and logging of random_forest, in 44 seconds


Start training of gradient_boosting with GridSearch and CV
root_mean_squared_error :  0.6842127753174613
r2 :  0.7542802604625884
Successful training and logging of gradient_boosting, in 26 seconds


Start training of xgboost with GridSearch and CV
root_mean_squared_error :  0.6618928273859336
r2 :  0.7699196083106162
Successful training and logging of xgboost, in 130 seconds


Start training of lasso with GridSearch and CV


  model = cd_fast.enet_coordinate_descent(


root_mean_squared_error :  0.9226199887656442
r2 :  0.5534622487274762
Successful training and logging of lasso, in 8 seconds


Start training of elastic_net with GridSearch and CV


  model = cd_fast.enet_coordinate_descent(


root_mean_squared_error :  0.9210933952047968
r2 :  0.5549648823996312
Successful training and logging of elastic_net, in 26 seconds


Start training of kernel_ridge with GridSearch and CV


  dual_coef = linalg.solve(K, y, sym_pos=True,


root_mean_squared_error :  0.9266076481375816
r2 :  0.5496902621889431
Successful training and logging of kernel_ridge, in 158 seconds


Start training of svr with GridSearch and CV
root_mean_squared_error :  1.0395504769180242
r2 :  0.4336897481467201
Successful training and logging of svr, in 17 seconds




#### Défaut CV

In [25]:
launch_models_CV(
    data_train,
    target_emissions_norm,
    models_params, 
    default_mode=True,
    folds=3, 
    scorings=['neg_root_mean_squared_error', 'r2'],
    favorite_scoring='neg_root_mean_squared_error',
    comment='Default mode CV norm Targets ESS',
    n_jobs=-1)

Start training of random_forest (default mode)
root_mean_squared_error 0.690326482517273
r2 0.7499174379643514
Successful training and logging of random_forest, in 3 seconds (Default Mode)


Start training of gradient_boosting (default mode)
root_mean_squared_error 0.7450024483704212
r2 0.708371804935345
Successful training and logging of gradient_boosting, in 1 seconds (Default Mode)


Start training of xgboost (default mode)
root_mean_squared_error 0.6996091551398621
r2 0.7431365968993195
Successful training and logging of xgboost, in 1 seconds (Default Mode)


Start training of lasso (default mode)
root_mean_squared_error 1.1198511616465303
r2 0.3427309902411541
Successful training and logging of lasso, in 0 seconds (Default Mode)


Start training of elastic_net (default mode)
root_mean_squared_error 1.1190909750015434
r2 0.3436292149076257
Successful training and logging of elastic_net, in 0 seconds (Default Mode)


Start training of kernel_ridge (default mode)
root_mean_squared_er

<a id='2_conclusions_train'></a>

## Conclusions post entrainement 😌

Dans tous les cas, le dummy regressor est battu (ouf!)
Les modèles les plus performants sont :
- Random Forest
- Gradient boosting
- XGBoost (Meilleur toujours)

Les données normalisées ont les meilleurs resultats (en r²)

On retient le XGBOOST dans les deux cas

EnergyStarScore apporte un peu de précision mais on peut s'en passer pour des raisons de cout et on a un set d'entrainement plus grand

### XGBoost : 

<img src="plots_storage/best_model.png" style="height:300px">

<a id='1_refit'></a>

# Refit XGBoost energy et emissions 
On utilisera dans cette partie les cibles normalisées

<a id='2_refit_ESS'></a>

## Avec EnergyStarScore

In [321]:
data = pd.read_csv('data/data_full_post_analisis.csv')
data.drop(columns=['Unnamed: 0', 'CouncilDistrictCode', 'DataYear'], inplace=True)
data.dropna(inplace=True)
data.set_index('OSEBuildingID', inplace=True)
data = data[list(data.dtypes[data.dtypes != "object"].index)]
data.drop(columns=['SiteEUI(kBtu/sf)', 'SiteEUIWN(kBtu/sf)',
       'SourceEUI(kBtu/sf)', 'SourceEUIWN(kBtu/sf)', 
       'SiteEnergyUseWN(kBtu)', 'SteamUse(kBtu)', 'Electricity(kWh)',
       'Electricity(kBtu)', 'NaturalGas(therms)', 'NaturalGas(kBtu)',
       'GHGEmissionsIntensity'], inplace=True)

In [322]:
data_norm_targets = data.drop(columns=[target_energy, target_emissions])

## Energy

In [323]:
data_norm_targets_energy = data_norm_targets.drop(columns=[target_emissions_norm])
data_train, data_test = train_test_split(data_norm_targets_energy, train_size=0.7, random_state=41)

In [324]:
xgboost_energy_ess = XGBRegressor(learning_rate=0.1,
                                 max_depth=20,
                                 reg_alpha=0.01,
                                 min_child_weight=5)

In [325]:
start = time.time()
xgboost_energy_ess.fit(data_train.drop(columns=[target_energy_norm]), data_train[target_energy_norm])
print("temps d'entrainement : {} s".format(time.time() - start))

temps d'entrainement : 0.9468948841094971 s


In [326]:
%%time
prediction = xgboost_energy_ess.predict(data_test.drop(columns=[target_energy_norm]))

Wall time: 13.9 ms


In [327]:
R2_energy_ESS = r2_score(prediction, data_test[target_energy_norm])
print(R2_energy_ESS)

0.9143730142386062


In [328]:
RMSE_energy_ESS = math.sqrt(mean_squared_error(np.expm1(data_test[target_energy_norm]), np.expm1(prediction)))
print(RMSE_energy_ESS)

2258253.7090832107


## Emissions

In [329]:
data_norm_targets_emissions = data_norm_targets.drop(columns=[target_energy_norm])
data_train, data_test = train_test_split(data_norm_targets_emissions, train_size=0.7, random_state=41)

In [330]:
xgboost_emissions_ess = XGBRegressor(learning_rate=0.1,
                                     max_depth=20,
                                     reg_alpha=0.01,
                                     min_child_weight=10)

In [331]:
start = time.time()
xgboost_emissions_ess.fit(data_train.drop(columns=[target_emissions_norm]), data_train[target_emissions_norm])
print("temps d'entrainement : {} s".format(time.time() - start))

temps d'entrainement : 0.996997594833374 s


In [332]:
%%time
prediction = xgboost_emissions_ess.predict(data_test.drop(columns=[target_emissions_norm]))

Wall time: 12.9 ms


In [333]:
R2_emissions_ESS = r2_score(prediction, data_test[target_emissions_norm])
print(R2_emissions_ESS)

0.76406219597108


In [334]:
RMSE_emissions_ESS = math.sqrt(mean_squared_error(np.expm1(data_test[target_emissions_norm]), np.expm1(prediction)))
print(RMSE_emissions_ESS)

61.899947998758776


<a id='2_refit_no_ESS'></a>


## Sans EnergyStarScore

In [335]:
data = pd.read_csv('data/data_full_post_analisis.csv')
data.drop(columns=['Unnamed: 0', 'CouncilDistrictCode', 'DataYear'], inplace=True)
data.dropna(inplace=True)
data.set_index('OSEBuildingID', inplace=True)
data = data[list(data.dtypes[data.dtypes != "object"].index)]
data.drop(columns=['SiteEUI(kBtu/sf)', 'SiteEUIWN(kBtu/sf)',
       'SourceEUI(kBtu/sf)', 'SourceEUIWN(kBtu/sf)', 
       'SiteEnergyUseWN(kBtu)', 'SteamUse(kBtu)', 'Electricity(kWh)',
       'Electricity(kBtu)', 'NaturalGas(therms)', 'NaturalGas(kBtu)',
       'GHGEmissionsIntensity'], inplace=True)
# On supprime en plus la colonne à tester
data.drop(columns=['ENERGYSTARScore'], inplace=True)

In [336]:
data_norm_targets = data.drop(columns=[target_energy, target_emissions])

## Energy

In [337]:
data_norm_targets_energy = data_norm_targets.drop(columns=[target_emissions_norm])
data_train, data_test = train_test_split(data_norm_targets_energy, train_size=0.7, random_state=41)

In [338]:
# xgboost_energy = XGBRegressor(learning_rate=0.1,
#                                  max_depth=20,
#                                  reg_alpha=0.01,
#                                  min_child_weight=5,
#                                  random_state=41)

In [339]:
start = time.time()
xgboost_energy = XGBRegressor(learning_rate=0.1,
                                 max_depth=20,
                                 reg_alpha=0.01,
                                 min_child_weight=5,
                                 random_state=41).fit(\
                                                            data_train.drop(columns=[target_energy_norm]), 
                                                            data_train[target_energy_norm])
print("temps d'entrainement : {} s".format(time.time() - start))

temps d'entrainement : 0.9771218299865723 s


In [340]:
%%time
prediction = xgboost_energy.predict(data_test.drop(columns=[target_energy_norm]))

Wall time: 12.4 ms


In [341]:
R2_energy = r2_score(prediction, data_test[target_energy_norm])
print(R2_energy)

0.8665857483148065


In [342]:
RMSE_energy = math.sqrt(mean_squared_error(np.expm1(data_test[target_energy_norm]), np.expm1(prediction)))
print(RMSE_energy)

2463035.0562310405


## Emissions

In [343]:
data_norm_targets_emissions = data_norm_targets.drop(columns=[target_energy_norm])
data_train, data_test = train_test_split(data_norm_targets_emissions, train_size=0.7, random_state=41)

In [344]:
start = time.time()
xgboost_emissions = XGBRegressor(learning_rate=0.1,
                                     max_depth=20,
                                     reg_alpha=0.01,
                                     min_child_weight=10).fit(\
                                                            data_train.drop(columns=[target_emissions_norm]), 
                                                            data_train[target_emissions_norm])
print("temps d'entrainement : {} s".format(time.time() - start))

temps d'entrainement : 1.0187911987304688 s


In [345]:
%%time
prediction = xgboost_emissions.predict(data_test.drop(columns=[target_emissions_norm]))

Wall time: 12.9 ms


In [346]:
R2_emissions = r2_score(prediction, data_test[target_emissions_norm])
print(R2_emissions)

0.7418237107311857


In [347]:
RMSE_emissions = math.sqrt(mean_squared_error(np.expm1(data_test[target_emissions_norm]), np.expm1(prediction)))
print(RMSE_emissions)

70.64435433759046


<a id='2_refit_conclusions'></a>

## Conclusions du refit

Améliorations relatives

RMSE Energy

In [348]:
(RMSE_energy - RMSE_energy_ESS) / RMSE_energy * 100 # BUG ici, pas de solution...

8.314187271909486

R² Energy

In [349]:
(R2_energy_ESS - R2_energy) / R2_energy * 100

5.514430166515949

RMSE Emissions

In [352]:
(RMSE_emissions - RMSE_emissions_ESS) / RMSE_emissions * 100

12.378068170946122

R² Emissions

In [351]:
(R2_emissions_ESS - R2_emissions) / R2_emissions * 100

2.9978126768116886

<a id='1_train_time'></a>

# Temps d'entrainement

Comparaison en prenant en compte l'estimation de l'energy avec ESS et targets normalisés

In [132]:
best_params  = {  
    'random_forest': {
        'params': {
           'max_depth' : 20,
           'max_features' : 'auto',
           'min_samples_leaf': 1,
           'min_samples_split': 2,
           'random_state': 41
           },
        'estimator': RandomForestRegressor,
    },
                        
    'gradient_boosting': {
         'params': {
             'max_depth' : 5,
             'learning_rate' : 0.2,
             'n_iter_no_change': None,
             'random_state': 41
             },
         'estimator': GradientBoostingRegressor
    },
                 
                 
    'xgboost': {
        'params':{
            'max_depth' : 20,
            'learning_rate' : 0.1,
            'reg_alpha': 0.01,
            'min_child_weight': 5,
            'random_state': 41
            },
        'estimator': XGBRegressor
    },

   'lasso': {
       'params': {
           'alpha': 0.001,
           'random_state': 41,
           'max_iter': 10000
           },
       'estimator' : Lasso
    },
    
    'elastic_net': {
        'params': {
            'alpha' : 0.0001,
            'l1_ratio' : 0.5,
            'max_iter': 10000
            },
        'estimator': ElasticNet
    },
                 
    'kernel_ridge': {
        'params': {
            'alpha': 0.01,
            'kernel': 'linear',
            },
        'estimator': KernelRidge
    },
    
    'svr':{
        'params':{
            "kernel":'rbf',
            "C":1,
            'epsilon':0.3,
            'max_iter': 10000
        },
        'estimator': SVR
    }
}

In [27]:
mlflow.create_experiment("Predict_Times")

'15'

In [1]:
data_train, data_test = train_test_split(data_norm_targets_energy, train_size=0.7, random_state=41)
mlflow.set_experiment("Predict_Times")
for model in best_params.keys():
    mlflow.start_run(run_name=model)
    start_fit = time.time()
    estimator = best_params[model]['estimator'](**best_params[model]['params']).fit(\
                                                            data_train.drop(columns=[target_energy_norm]), 
                                                            data_train[target_energy_norm])
    fit_time = time.time() - start_fit
    
    start_predict = time.time()
    estimator.predict(data_test.drop(columns=[target_energy_norm]))
    predict_time = time.time() - start_predict
    
    mlflow.log_metric('Fit Time', fit_time)
    mlflow.log_metric('Predict Time', predict_time)
    
    mlflow.end_run()
    

NameError: name 'train_test_split' is not defined

<a id='1_conclusion'></a>

# Conclusion

- Les méthodes ensemblistes sont plus efficaces que les méthodes linéaires pour ce cas d’usage
- Le XGBoost apporte les meilleures performances
- L’  « Energy Star Score » apporte de la précision non négligeable notamment pour la consommation électrique.

#### Perspectives
- Utiliser un Voting Regressor pour rassembler les modèles
- Créer des variables pour mieux qualifier les consommations par quartiers
- Appliquer le log à plus de variables
- Faire un GridSearch plus poussé avec plus de paramètres. Notamment pour le XGBoost.