---

# Utilities Pipeline

### 00 Loading Data

In [67]:
%load_ext autoreload
%autoreload 2

from sklearn.pipeline import make_pipeline, make_union, FunctionTransformer
from sklearn.compose import make_column_transformer, make_column_selector

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

import pandas as pd
pd.options.display.float_format = '{:.2f}'.format


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [68]:
import pandas as pd
from sklearn.model_selection import train_test_split
from etl.utilities import Utilities

## Would load cleaned data using '''load.py''' once data cleaning is done
data = Utilities().get_training_data()

# create X and y
X = data.drop(columns='electricity_demmand')
y = data['electricity_demmand']

# create train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
X_train_features = list(X_train.columns)
data.dtypes
data.head(3)

Unnamed: 0,building_typology,building_gfa,year_built,occupancy,num_buildings,electricity_demmand
0,Office,169416,1909,95,1,1920103.6
1,K-12 School,94380,1963,100,1,180640.0
3,Hotel,50000,1994,100,1,579335.2


### 01 Preprocessor 🧮

In [69]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion

# Create a custom transformer that multiplies/divides two columns
occupied_area = FunctionTransformer(lambda df: pd.DataFrame({'occupied_area': df["building_gfa"] * (df["occupancy"] * 0.01)}), validate=False)

feature_engineering = ColumnTransformer(
    [("occupied_area", occupied_area, X_train_features)],
    remainder = "passthrough"
)

In [70]:
cat_features = [feature for feature in data.columns if data[feature].dtype == 'object']
num_features = [feature for feature in data.columns if data[feature].dtype in ['float64', 'int64'] and feature != 'electricity_demmand']

# scale and impute numerical features
num_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
])

# Encode categorical values
cat_transformer = OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, num_features),
    ('cat_transformer', cat_transformer, cat_features)],
        
    remainder='passthrough')


preprocessor

In [71]:
preprocessor = FeatureUnion([
    ('preprocessor', preprocessor),
    ('feature_engineering', Pipeline([
        ('occupied_area', occupied_area),
        ('occupied_area_scaler', StandardScaler())
    ]))
])

preprocessor

In [72]:
X_train_transformed = preprocessor.fit_transform(X_train)
X_train_transformed

print("Original training set")
display(X_train.head(3))

print("Preprocessed training set")
# display(pd.DataFrame(X_train_transformed, columns=['occupied_area'] + preprocessor.get_feature_names_out()).head(5))
display(pd.DataFrame(X_train_transformed))

Original training set


Unnamed: 0,building_typology,building_gfa,year_built,occupancy,num_buildings
3625,Multifamily Housing,54494,2002,100,1
16037,Multifamily Housing,34920,1931,100,1
12933,Multifamily Housing,44568,1994,100,1


Preprocessed training set


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,-0.38,1.75,0.17,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,-0.38
1,-0.50,-0.48,0.17,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,-0.50
2,-0.44,1.50,0.17,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,-0.44
3,-0.49,1.60,0.17,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,-0.49
4,-0.54,-0.04,0.17,-1.39,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,-0.55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13495,-0.55,-1.15,0.17,-0.11,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,-0.55
13496,2.40,-1.40,-0.60,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,2.32
13497,0.03,-0.23,0.17,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.04
13498,-0.41,2.04,0.17,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,-0.41


### 02 Fitting Model 🧠 

In [73]:
from sklearn.linear_model import SGDRegressor

pipeline = make_pipeline(preprocessor, SGDRegressor(random_state=13))
pipeline


In [74]:
# train pipeline
pipeline.fit(X_train, y_train)

# Score model
score = pipeline.score(X_test, y_test)
print(f'score: {round(score, 2)}')

score: 0.81


### 03 Optimising pipeline 🔍

In [75]:
# which params are searchable
pipeline.get_params()

{'memory': None,
 'steps': [('featureunion',
   FeatureUnion(transformer_list=[('preprocessor',
                                   ColumnTransformer(remainder='passthrough',
                                                     transformers=[('num_transformer',
                                                                    Pipeline(steps=[('imputer',
                                                                                     SimpleImputer()),
                                                                                    ('scaler',
                                                                                     StandardScaler())]),
                                                                    ['building_gfa',
                                                                     'year_built',
                                                                     'occupancy',
                                                                     'num_buildings']),
 

In [76]:
from sklearn.model_selection import RandomizedSearchCV

# which params are searchable
pipeline.get_params()

# define params to search
param_distributions = {
    # imputer
     'featureunion__preprocessor__num_transformer__imputer__strategy': ['mean','median','most_frequent'],
    
    #scaler
    'featureunion__preprocessor__num_transformer__scaler__with_mean': [True, False],
    'featureunion__preprocessor__num_transformer__scaler__with_std': [True, False],

    # model
    # 'sgdregressor__alpha': [ 0.001, 0.0005, 0.0001],
    # 'sgdregressor__loss': ['huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    # 'sgdregressor__penalty': ['l2', 'l1', 'elasticnet', 'None'],
}

randomized_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    scoring='r2'
)

randomized_search.fit(X_train,y_train)
randomized_search.best_params_

{'featureunion__preprocessor__num_transformer__scaler__with_std': True,
 'featureunion__preprocessor__num_transformer__scaler__with_mean': True,
 'featureunion__preprocessor__num_transformer__imputer__strategy': 'most_frequent'}

In [77]:
pipeline_tuned = randomized_search.best_estimator_

# train tuned pipeline
pipeline_tuned.fit(X_train, y_train)

# Score tuned model
tuned_score = pipeline_tuned.score(X_test, y_test)

print(f'score: {round(tuned_score, 2)}')

score: 0.81


### 04 Debugging the pipe 🐞

In [78]:
# Access the components of a Pipeline with `named_steps`
pipeline_tuned.named_steps.keys()
# pipeline_tuned

dict_keys(['featureunion', 'sgdregressor'])

In [79]:
# Check intermediate steps
print("Before preprocessing, X_train.shape = ")
print(X_train.shape)
print("After preprocessing, X_train_preprocessed.shape = ")
pipeline_tuned.named_steps["featureunion"].fit_transform(X_train).shape # notice the extra columnns as a result of the ordinal encoder

Before preprocessing, X_train.shape = 
(13500, 5)
After preprocessing, X_train_preprocessed.shape = 


(13500, 18)

### 05 Exporting the pipeline as a pickle 🥒

In [80]:
import pickle

# export pipe as pickle file
with open('etl/_pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline_tuned, file)

PicklingError: Can't pickle <function <lambda> at 0x0000025E022C5BC0>: attribute lookup <lambda> on __main__ failed

In [81]:
pipe = pickle.load(open('etl/_pipeline.pkl', 'rb'))

EOFError: Ran out of input

In [None]:
X_test[4:5]

Unnamed: 0,building_typology,building_gfa,year_built,occupancy,num_buildings
24958,Multifamily Housing,195000,1951,100,1


In [None]:
print(f'Power demmand : {int(pipe.predict(X_test[4:5])[0])}')

Power demmand : 9988496
