---

# Utilities Pipeline

### 00 Loading Data

In [188]:
%load_ext autoreload
%autoreload 2

from sklearn.pipeline import make_pipeline, make_union, FunctionTransformer
from sklearn.compose import make_column_transformer, make_column_selector

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

import pandas as pd
pd.options.display.float_format = '{:.2f}'.format


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [189]:
import pandas as pd
from sklearn.model_selection import train_test_split
from etl.utilities import Utilities

## Would load cleaned data using '''load.py''' once data cleaning is done
data = Utilities().get_training_data()

# create X and y
X = data.drop(columns='electricity_demmand')
y = data['electricity_demmand']

# create train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
data.dtypes

building_typology       object
building_gfa             int64
year_built               int64
occupancy                int64
num_buildings            int64
electricity_demmand    float64
dtype: object

### 01 Preprocessor 🧮

In [190]:
cat_features = [feature for feature in data.columns if data[feature].dtype == 'object']
num_features = [feature for feature in data.columns if data[feature].dtype in ['float64', 'int64'] and feature != 'electricity_demmand']


In [200]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion


# scale and impute numerical features
num_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
])

# Encode categorical values
cat_transformer = OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, num_features),
    ('cat_transformer', cat_transformer, cat_features)],
    
    remainder='passthrough')


preprocessor

In [201]:
X_train_transformed = preprocessor.fit_transform(X_train)
X_train_transformed

print("Original training set")
display(X_train.head(3))

print("Preprocessed training set")
display(pd.DataFrame(X_train_transformed).head(5))

Original training set


Unnamed: 0,building_typology,building_gfa,year_built,occupancy,num_buildings
20922,Multifamily Housing,31902,1909,100,1
27940,Office,68000,1911,100,1
16225,Multifamily Housing,101297,1991,100,1


Preprocessed training set


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,-0.51,-1.18,0.17,-0.1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.3,-1.11,0.17,-0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,-0.1,1.43,0.17,-0.1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.84,1.08,0.17,-0.1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.47,-0.64,0.17,-0.1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


### 02 Fitting Model 🧠 

In [219]:
from sklearn.linear_model import SGDRegressor

pipeline = make_pipeline(preprocessor, SGDRegressor())
pipeline


In [223]:
# train pipeline
pipeline.fit(X_train, y_train)

# Score model
print(f'score: {pipeline.score(X_test,y_test)}')
pipeline

score: 0.7747543772442849


### 03 Optimising pipeline 🔍

In [None]:
# which params are searchable
pipeline.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('num_transformer',
                                    Pipeline(steps=[('occupied_area',
                                                     FunctionTransformer(func=<function <lambda> at 0x000002161C2DAE80>)),
                                                    ('imputer', SimpleImputer()),
                                                    ('scaler', StandardScaler())]),
                                    ['building_gfa', 'year_built', 'occupancy',
                                     'num_buildings']),
                                   ('cat_transformer',
                                    OneHotEncoder(drop='if_binary',
                                                  handle_unknown='ignore',
                                                  sparse_output=False),
                                    ['building_typology'])])),
  ('sgdregressor', SG

In [222]:
from sklearn.model_selection import RandomizedSearchCV

# which params are searchable
pipeline.get_params()

# define params to search
param_distributions = {
    # imputer
    'columntransformer__num_transformer__imputer__strategy': ['mean','median','most_frequent'],
    
    #scaler
    'columntransformer__num_transformer__scaler__with_mean': [True, False],
    'columntransformer__num_transformer__scaler__with_std': [True, False],

    # model
    'sgdregressor__alpha': [0.0001, 0.001, 0.01, 0.1],
    'sgdregressor__loss': ['huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    'sgdregressor__penalty': ['l2', 'l1', 'elasticnet', 'None'],
}

randomized_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=10,
    cv=10,
    scoring='r2'
)

randomized_search.fit(X_train,y_train)
randomized_search.best_params_

20 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\hramzan\AppData\Local\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\hramzan\AppData\Local\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\hramzan\AppData\Local\anaconda3\Lib\site-packages\sklearn\linear_model\_stochastic_gradient.py", line 1582, in fit
    self._validate_params()
  File "c:\Users\hramzan\AppData\Local\anaconda3\Lib\site-packages\sklea

{'sgdregressor__penalty': 'elasticnet',
 'sgdregressor__loss': 'squared_epsilon_insensitive',
 'sgdregressor__alpha': 0.001,
 'columntransformer__num_transformer__scaler__with_std': True,
 'columntransformer__num_transformer__scaler__with_mean': True,
 'columntransformer__num_transformer__imputer__strategy': 'most_frequent'}

In [224]:
pipeline_tuned = randomized_search.best_estimator_

# train tuned pipeline
pipeline_tuned.fit(X_train, y_train)

# Score tuned model
print(f'score: {pipeline_tuned.score(X_test,y_test)}')

score: 0.7785543318444353


### 04 Debugging the pipe 🐞

In [None]:
# Access the components of a Pipeline with `named_steps`
pipeline_tuned.named_steps.keys()
# pipeline_tuned

dict_keys(['columntransformer', 'sgdregressor'])

In [None]:
# Check intermediate steps
print("Before preprocessing, X_train.shape = ")
print(X_train.shape)
print("After preprocessing, X_train_preprocessed.shape = ")
pipeline_tuned.named_steps["columntransformer"].fit_transform(X_train).shape # notice the extra columnns as a result of the ordinal encoder

Before preprocessing, X_train.shape = 
(13500, 5)
After preprocessing, X_train_preprocessed.shape = 


(13500, 17)

### 05 Exporting the pipeline as a pickle 🥒

In [None]:
import pickle

# export pipe as pickle file
with open('etl/_pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline_tuned, file)
