---

# Utilities Pipeline

### 00 Loading Data

In [106]:
%load_ext autoreload
%autoreload 2

from sklearn.pipeline import make_pipeline, make_union
from sklearn.compose import make_column_transformer, make_column_selector

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

import pandas as pd
pd.options.display.float_format = '{:.2f}'.format


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [107]:
import pandas as pd
from sklearn.model_selection import train_test_split
from etl.utilities import Utilities

## Would load cleaned data using '''load.py''' once data cleaning is done
data = Utilities().get_training_data()

# create X and y
X = data.drop(columns='electricity_demmand')
y = data['electricity_demmand']

# create train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
data.head(3)

Unnamed: 0,building_typology,building_gfa,year_built,occupancy,num_buildings,electricity_demmand
0,Office,169416,1909,95,1,1920103.6
1,K-12 School,94380,1963,100,1,180640.0
3,Hotel,50000,1994,100,1,579335.2


### 01 Preprocessor 🧮

In [109]:
cat_features = [feature for feature in data.columns if data[feature].dtype == 'object']
num_features = [feature for feature in data.columns if data[feature].dtype in ['float64', 'int'] and feature != 'electricity_demmand']


In [131]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Add it at the end of our numerical transformer
num_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
])

# Encode categorical values
cat_transformer = OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer([
    ('cat_transformer', cat_transformer, cat_features),
    ('num_transformer', num_transformer, num_features)],
    remainder='passthrough')

In [132]:
X_train_transformed = preprocessor.fit_transform(X_train)
X_train_transformed

print("Original training set")
display(X_train.head(3))

print("Preprocessed training set")
display(pd.DataFrame(X_train_transformed).head(5))

Original training set


Unnamed: 0,building_typology,building_gfa,year_built,occupancy,num_buildings
16534,Multifamily Housing,66780,1953,100,1
2861,Multifamily Housing,50000,1930,100,1
24354,Multifamily Housing,142500,1974,95,1


Preprocessed training set


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.3,0.22,0.17,-0.11
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.4,-0.51,0.17,-0.11
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.89,-0.62,-0.11
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.33,-0.47,0.17,-0.11
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.61,-0.44,0.17,-0.11


### 02 Fitting Model 🧠 

In [153]:
from sklearn.linear_model import SGDRegressor

pipeline = make_pipeline(preprocessor, SGDRegressor(random_state=3))
pipeline


In [154]:
# train pipeline
pipeline.fit(X_train, y_train)

# Score model
print(f'score: {pipeline.score(X_test,y_test)}')
pipeline

score: 0.7711721434986941


### 03 Optimising pipeline 🔍

In [147]:
# which params are searchable
pipeline.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('cat_transformer',
                                    OneHotEncoder(drop='if_binary',
                                                  handle_unknown='ignore',
                                                  sparse_output=False),
                                    ['building_typology']),
                                   ('num_transformer',
                                    Pipeline(steps=[('imputer', SimpleImputer()),
                                                    ('scaler', StandardScaler())]),
                                    ['building_gfa', 'year_built', 'occupancy',
                                     'num_buildings'])])),
  ('sgdregressor', SGDRegressor(random_state=3))],
 'verbose': False,
 'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('cat_transformer',
                          

In [155]:
from sklearn.model_selection import RandomizedSearchCV

# which params are searchable
pipeline.get_params()

# define params to search
param_distributions = {
    'columntransformer__num_transformer__imputer__strategy': ['mean','median','most_frequent'],
    'sgdregressor__alpha': [0.0001,0.001, 0.01, 0.1],
    'sgdregressor__loss': ['huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    'sgdregressor__penalty': ['l2', 'l1', 'elasticnet', 'None']
}

randomized_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    scoring='r2'
)

randomized_search.fit(X_train,y_train)
randomized_search.best_params_

15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/hramzan/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/hramzan/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/hramzan/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/sklearn/pipeline.py", line 427, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/User

{'sgdregressor__penalty': 'l1',
 'sgdregressor__loss': 'squared_epsilon_insensitive',
 'sgdregressor__alpha': 0.1,
 'columntransformer__num_transformer__imputer__strategy': 'median'}

In [149]:
pipeline_tuned = randomized_search.best_estimator_

# train tuned pipeline
pipeline_tuned.fit(X_train, y_train)

# Score tuned model
print(f'score: {pipeline_tuned.score(X_test,y_test)}')

score: 0.7716033192196694


### 04 Debugging the pipe 🐞

In [150]:
# Access the components of a Pipeline with `named_steps`
pipeline_tuned.named_steps.keys()
# pipeline_tuned

dict_keys(['columntransformer', 'sgdregressor'])

In [151]:
# Check intermediate steps
print("Before preprocessing, X_train.shape = ")
print(X_train.shape)
print("After preprocessing, X_train_preprocessed.shape = ")
pipeline_tuned.named_steps["columntransformer"].fit_transform(X_train).shape # notice the extra columnns as a result of the ordinal encoder

Before preprocessing, X_train.shape = 
(13500, 5)
After preprocessing, X_train_preprocessed.shape = 


(13500, 17)

### 05 Exporting the pipeline as a pickle 🥒

In [152]:
import pickle

# export pipe as pickle file
with open('etl/_pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline_tuned, file)
