---

# Utilities Pipeline

In [1]:
%load_ext autoreload
%autoreload 2

from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

import pandas as pd
pd.options.display.float_format = '{:.2f}'.format


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from etl.utilities import Utilities

## Would load cleaned data using '''load.py''' once data cleaning is done
data = Utilities().get_training_data()

# create X and y
X = data.drop(columns='electricity_demmand')
y = data['electricity_demmand']

# create train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
data.head(3)

Unnamed: 0,building_typology,building_gfa,year_built,occupancy,num_buildings,electricity_demmand
0,Office,169416,1909,95,1,1920103.6
1,K-12 School,94380,1963,100,1,180640.0
3,Hotel,50000,1994,100,1,579335.2


In [20]:
cat_features = [feature for feature in data.columns if data[feature].dtype == 'object']
num_features = [feature for feature in data.columns if data[feature].dtype in ['float64', 'int'] and feature != 'electricity_demmand']


In [22]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Add it at the end of our numerical transformer
num_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
])

# Encode categorical values
cat_transformer = OneHotEncoder(drop='if_binary',
                                handle_unknown='ignore')

preprocessor = ColumnTransformer([
    ('cat_transformer', cat_transformer, cat_features),
    ('num_transformer', num_transformer, num_features)],
    remainder='passthrough')

X_train_transformed = preprocessor.fit_transform(X_train)
X_train_transformed

<13500x17 sparse matrix of type '<class 'numpy.float64'>'
	with 67500 stored elements in Compressed Sparse Row format>

### 01 Preprocessor 🧮

In [104]:
# Preprocessor
cat_transformer = OneHotEncoder()
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())

preproc = make_column_transformer(
    (num_transformer, make_column_selector(dtype_include=['float64'])),
    (cat_transformer, make_column_selector(dtype_include=['object','bool'])),
    remainder='passthrough'
)

preproc

In [113]:
# Assuming X_train is your training data
transformed_data = preproc.fit_transform(X_train)

# # Convert the transformed data back to a DataFrame for easier viewing
# columns = (make_column_selector(dtype_include=['float64'])(X_train) +
#            make_column_selector(dtype_include=['object', 'bool'])(X_train) +
#            list(X_train.columns[len(make_column_selector(dtype_include=['float64'])(X_train)) +
#                                 len(make_column_selector(dtype_include=['object', 'bool'])(X_train)):]))

# df_transformed = pd.DataFrame(transformed_data, columns=columns)

# # Display the transformed DataFrame
# print(df_transformed)

print(preproc.named_transformers_.keys())


dict_keys(['pipeline', 'onehotencoder', 'remainder'])


UFuncTypeError: ufunc 'maximum' did not contain a loop with signature matching types (dtype('<U8'), dtype('<U8')) -> None

### 02 Fitting Model 🧠 

In [None]:
from sklearn.linear_model import SGDRegressor

pipeline = make_pipeline(preproc, SGDRegressor())
pipeline

# train pipeline
pipeline.fit(X_train, y_train)

# Score model
print(f'score: {round(pipeline.score(X_test,y_test))}')
pipeline

score: -353748414645391920726016


### 03 Optimising pipeline 🔍

In [None]:
# which params are searchable
pipeline.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('pipeline',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer()),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x00000215ED21E450>),
                                   ('onehotencoder', OneHotEncoder(),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x00000215ED21E3D0>)])),
  ('sgdregressor', SGDRegressor())],
 'verbose': False,
 'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('pipeline',
                                  Pipeline(steps=[('simpleimputer',
          

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# which params are searchable
pipeline.get_params()

# define params to search
param_distributions = {
    'columntransformer__pipeline__simpleimputer__strategy': ['mean','median','most_frequent'],
    'sgdregressor__alpha': [0.0001,0.001, 0.01, 0.1],
    'sgdregressor__loss': ['huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    'sgdregressor__penalty': ['l2', 'l1', 'elasticnet', 'None']
}

randomized_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    scoring='r2'
)

randomized_search.fit(X_train,y_train)
randomized_search.best_params_

15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\hramzan\AppData\Local\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\hramzan\AppData\Local\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\hramzan\AppData\Local\anaconda3\Lib\site-packages\sklearn\linear_model\_stochastic_gradient.py", line 1582, in fit
    self._validate_params()
  File "c:\Users\hramzan\AppData\Local\anaconda3\Lib\site-packages\sklear

{'sgdregressor__penalty': 'l2',
 'sgdregressor__loss': 'huber',
 'sgdregressor__alpha': 0.001,
 'columntransformer__pipeline__simpleimputer__strategy': 'median'}

In [None]:
pipeline_tuned = randomized_search.best_estimator_
pipeline_tuned

### 04 Debugging the pipe 🐞

In [None]:
# Access the components of a Pipeline with `named_steps`
pipeline_tuned.named_steps.keys()
# pipeline_tuned

dict_keys(['columntransformer', 'sgdregressor'])

In [None]:
# Check intermediate steps
print("Before preprocessing, X_train.shape = ")
print(X_train.shape)
print("After preprocessing, X_train_preprocessed.shape = ")
pipeline_tuned.named_steps["columntransformer"].fit_transform(X_train).shape # notice the extra columnns as a result of the ordinal encoder

Before preprocessing, X_train.shape = 
(13500, 5)
After preprocessing, X_train_preprocessed.shape = 


(13500, 17)

### 05 Exporting the pipeline as a pickle 🥒

In [None]:
import pickle

# export pipe as pickle file
with open('etl/_pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline_tuned, file)
