---

# Utilities Pipeline

### 00 Loading Data

In [94]:
%load_ext autoreload
%autoreload 2

from sklearn.pipeline import make_pipeline, make_union, FunctionTransformer
from sklearn.compose import make_column_transformer, make_column_selector

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

import pandas as pd
# Render floats with two decimal places whenever pandas displays them.
pd.options.display.float_format = '{:.2f}'.format


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [95]:
import pandas as pd
from sklearn.model_selection import train_test_split
from etl.utilities import Utilities

# TODO: load the cleaned data via `load.py` once data cleaning is done.
data = Utilities().get_training_data()

# Separate the features (X) from the target column (y).
X = data.drop(columns='electricity_demmand')
y = data['electricity_demmand']

# Hold out 30% of the rows for testing. The seed is fixed so that the split,
# and therefore every score computed from it, is reproducible after a kernel
# restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_train_features = list(X_train.columns)
data.head(3)

Unnamed: 0,building_typology,building_gfa,year_built,occupancy,num_buildings,electricity_demmand
0,Office,169416,1909,95,1,1920103.6
1,K-12 School,94380,1963,100,1,180640.0
3,Hotel,50000,1994,100,1,579335.2


In [96]:
# use this cell to analyse train data

### 01 Preprocessor 🧮 (1. feature engineering + 2. encoding/scaling)

In [97]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion

# Define feature engineering functions
def calculate_occupied_area(data):
    """Add an ``occupied_area`` column: floor area scaled by occupancy.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``building_gfa`` and ``occupancy``.
        ``occupancy`` is multiplied by 0.01 before scaling the area.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every input column plus ``occupied_area``.
        The frame passed in is left unmodified.
    """
    area = data['building_gfa'] * (data['occupancy'] * 0.01)
    # assign() works on a copy, so the caller's frame is never mutated.
    return data.assign(occupied_area=area)

def calculate_new_build(data):
    """Add a ``new_build`` flag: 1 when ``year_built`` >= 1980, otherwise 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the column ``year_built``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every input column plus ``new_build``.
        The frame passed in is left unmodified.
    """
    # Cast explicitly to int64: the downstream column selector only picks up
    # float64/int64 columns, and a platform-dependent integer width would be
    # silently dropped by it.
    is_new = (data['year_built'] >= 1980).astype('int64')
    # assign() works on a copy, so the caller's frame is never mutated.
    return data.assign(new_build=is_new)

# Wrap the plain functions so they can be used as scikit-learn transformers.
occupied_area = FunctionTransformer(func=calculate_occupied_area, validate=False)
new_build = FunctionTransformer(func=calculate_new_build, validate=False)

# Feature-engineering step: each function only receives the columns it needs;
# all remaining columns are passed through unchanged.
feature_engineering = ColumnTransformer(
    transformers=[
        ("occupied_area", occupied_area, ['building_gfa', 'occupancy']),
        ("new_build", new_build, ['year_built']),
    ],
    remainder="passthrough",
)
# Ask the transformer to emit pandas DataFrames instead of numpy arrays.
feature_engineering.set_output(transform="pandas")

# Preview the engineered features on the training split.
pd.DataFrame(feature_engineering.fit_transform(X_train))



Unnamed: 0,occupied_area__building_gfa,occupied_area__occupancy,occupied_area__occupied_area,new_build__year_built,new_build__new_build,remainder__building_typology,remainder__num_buildings
8287,145644,100,145644.00,1911,0,Office,1
4579,78300,100,78300.00,2011,1,Multifamily Housing,1
13995,66291,100,66291.00,1923,0,Multifamily Housing,1
16097,108377,100,108377.00,1913,0,Multifamily Housing,1
844,1189698,100,1189698.00,1983,1,Office,1
...,...,...,...,...,...,...,...
2335,815186,85,692908.10,1929,0,Office,1
26560,73632,100,73632.00,1995,1,Multifamily Housing,1
16084,36216,100,36216.00,1940,0,Multifamily Housing,1
11509,1071600,100,1071600.00,1974,0,Hospital (General Medical & Surgical),1


In [98]:
# OPTION 01
# Numeric columns: replace missing values with 0, then standardise.
num_preproc = Pipeline(steps=[
    ("num_imputer", SimpleImputer(strategy="constant", fill_value=0.)),
    ("scaler", StandardScaler()),
])

# Categorical columns: label missing values "Missing", then one-hot encode.
cat_preproc = Pipeline(steps=[
    ("cat_imputer", SimpleImputer(strategy="constant", fill_value="Missing")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Route each column to the matching sub-pipeline based on its dtype.
preprocessor = ColumnTransformer(transformers=[
    ("num_tr", num_preproc, make_column_selector(dtype_include=["float64", "int64"])),
    ("cat_tr", cat_preproc, make_column_selector(dtype_include=["object"])),
])

# Preview the preprocessed training split.
pd.DataFrame(preprocessor.fit_transform(X_train))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0.15,-1.12,0.17,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00
1,-0.24,2.04,0.17,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00
2,-0.31,-0.74,0.17,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00
3,-0.07,-1.05,0.17,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00
4,6.10,1.16,0.17,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13495,3.97,-0.55,-2.19,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00
13496,-0.26,1.54,0.17,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00
13497,-0.48,-0.20,0.17,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00
13498,5.43,0.87,0.17,-0.11,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


In [99]:
# # OPTION 02
# cat_features = [feature for feature in data.columns if data[feature].dtype == 'object']
# num_features = [feature for feature in data.columns if data[feature].dtype in ['float64', 'int64'] and feature != 'electricity_demmand']

# # scale and impute numerical features
# num_transformer = Pipeline([
#     ('imputer', SimpleImputer()),
#     ('scaler', StandardScaler())
# ])

# # Encode categorical values
# cat_transformer = OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)

# preprocessor = ColumnTransformer([
#     ('num_transformer', num_transformer, num_features),
#     ('cat_transformer', cat_transformer, cat_features)],
        
#     remainder='passthrough')


# preprocessor

In [100]:
# Chain feature engineering and dtype-based preprocessing into one pipeline.
#
# The name `preprocessor` is rebound here from the ColumnTransformer to the
# combined Pipeline. If this cell is re-run on its own, `preprocessor` already
# is that Pipeline, so reuse its inner "preprocessing" step rather than nesting
# the pipeline inside itself.
if isinstance(preprocessor, ColumnTransformer):
    column_preprocessor = preprocessor
else:
    column_preprocessor = preprocessor.named_steps["preprocessing"]

preprocessor = Pipeline([
    ("feature_engineering", feature_engineering),
    ("preprocessing", column_preprocessor),
]).set_output(transform="pandas")

preprocessor



In [101]:
# Fit the combined preprocessing pipeline on the training split and keep the
# transformed result.
X_train_transformed = preprocessor.fit_transform(X_train)

# Show the raw rows next to the preprocessed output for comparison.
print("Original training set")
display(X_train.head(3))

print("Preprocessed training set")
display(pd.DataFrame(X_train_transformed))

Original training set


Unnamed: 0,building_typology,building_gfa,year_built,occupancy,num_buildings
8287,Office,145644,1911,100,1
4579,Multifamily Housing,78300,2011,100,1
13995,Multifamily Housing,66291,1923,100,1


Preprocessed training set


Unnamed: 0,num_tr__occupied_area__building_gfa,num_tr__occupied_area__occupancy,num_tr__occupied_area__occupied_area,num_tr__new_build__year_built,num_tr__new_build__new_build,num_tr__remainder__num_buildings,cat_tr__remainder__building_typology_College/University,cat_tr__remainder__building_typology_Courthouse,cat_tr__remainder__building_typology_Hospital (General Medical & Surgical),cat_tr__remainder__building_typology_Hotel,cat_tr__remainder__building_typology_K-12 School,cat_tr__remainder__building_typology_Laboratory,cat_tr__remainder__building_typology_Multifamily Housing,cat_tr__remainder__building_typology_Museum,cat_tr__remainder__building_typology_Office,cat_tr__remainder__building_typology_Other - Mall,cat_tr__remainder__building_typology_Performing Arts,cat_tr__remainder__building_typology_Prison/Incarceration,cat_tr__remainder__building_typology_Retail Store
8287,0.15,0.17,0.17,-1.12,-0.47,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00
4579,-0.24,0.17,-0.23,2.04,2.12,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00
13995,-0.31,0.17,-0.30,-0.74,-0.47,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00
16097,-0.07,0.17,-0.06,-1.05,-0.47,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00
844,6.10,0.17,6.34,1.16,2.12,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2335,3.97,-2.19,3.40,-0.55,-0.47,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00
26560,-0.26,0.17,-0.26,1.54,2.12,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00
16084,-0.48,0.17,-0.48,-0.20,-0.47,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00
11509,5.43,0.17,5.64,0.87,-0.47,-0.11,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


### 02 Fitting Model 🧠 

In [127]:
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Full model: preprocessing followed by a gradient-boosted regressor.
# The seed is fixed so that fitting is reproducible; boosting draws random
# numbers, e.g. whenever `subsample` is below 1.
pipeline = make_pipeline(preprocessor, GradientBoostingRegressor(random_state=42))
pipeline


In [129]:
# Fit on the training split, then score the fitted pipeline on the held-out
# test split.
score = pipeline.fit(X_train, y_train).score(X_test, y_test)
print(f'score: {round(score, 2)}')

score: 0.81


### 03 Optimising pipeline 🔍

In [130]:
# List every parameter of the pipeline, including those of nested steps; the
# keys are the names a hyper-parameter search can target.
pipeline.get_params(deep=True)

{'memory': None,
 'steps': [('pipeline', Pipeline(steps=[('feature_engineering',
                    ColumnTransformer(remainder='passthrough',
                                      transformers=[('occupied_area',
                                                     FunctionTransformer(func=<function calculate_occupied_area at 0x000001EDA53E49A0>),
                                                     ['building_gfa',
                                                      'occupancy']),
                                                    ('new_build',
                                                     FunctionTransformer(func=<function calculate_new_build at 0x000001EDA53E7880>),
                                                     ['year_built'])])),
                   ('preprocessing',
                    Col...
                                                                      StandardScaler())]),
                                                     <sklearn.compose._column_trans

In [132]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Search space. Keys follow the `<step>__<param>` naming reported by
# `pipeline.get_params()`.
param_distributions = {
    # imputer
    'pipeline__preprocessing__num_tr__num_imputer__strategy': ['mean', 'median', 'most_frequent'],

    # scaler
    'pipeline__preprocessing__num_tr__scaler__with_mean': [True, False],
    'pipeline__preprocessing__num_tr__scaler__with_std': [True, False],

    # model
    'gradientboostingregressor__n_estimators': np.arange(50, 100, 10),
    'gradientboostingregressor__learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    'gradientboostingregressor__max_depth': [3, 5, 7, 9],
    'gradientboostingregressor__min_samples_split': [2, 5, 10],
    'gradientboostingregressor__min_samples_leaf': [1, 2, 4],
    'gradientboostingregressor__subsample': [0.8, 0.9, 1.0]
}

# Sample 10 candidates and evaluate each with 5-fold cross-validation on R^2.
# The seed is fixed so the same candidates are drawn on every run; without it
# the reported best parameters change from one execution to the next.
randomized_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    scoring='r2',
    random_state=42
)

randomized_search.fit(X_train, y_train)
randomized_search.best_params_

{'pipeline__preprocessing__num_tr__scaler__with_std': False,
 'pipeline__preprocessing__num_tr__scaler__with_mean': True,
 'pipeline__preprocessing__num_tr__num_imputer__strategy': 'most_frequent',
 'gradientboostingregressor__subsample': 1.0,
 'gradientboostingregressor__n_estimators': 90,
 'gradientboostingregressor__min_samples_split': 5,
 'gradientboostingregressor__min_samples_leaf': 4,
 'gradientboostingregressor__max_depth': 3,
 'gradientboostingregressor__learning_rate': 0.1}

In [133]:
# Keep the best pipeline found by the randomized search.
pipeline_tuned = randomized_search.best_estimator_

# Evaluate the tuned pipeline on the held-out test split and report it.
tuned_score = pipeline_tuned.score(X_test, y_test)
print(f'score: {round(tuned_score, 2)}')

score: 0.82


### 04 Debugging the pipe 🐞

In [134]:
# A Pipeline exposes its steps by name through `named_steps`; list the step
# names available on the tuned pipeline.
pipeline_tuned.named_steps.keys()

dict_keys(['pipeline', 'gradientboostingregressor'])

In [135]:
# Check intermediate steps
print("Before preprocessing, X_train.shape = ")
print(X_train.shape)
print("After preprocessing, X_train_preprocessed.shape = ")
# Use transform(), not fit_transform(): the tuned pipeline is already fitted,
# and re-fitting its preprocessing step just to inspect it would modify the
# trained model. The extra columns come from the engineered features and from
# one-hot encoding the categorical column.
pipeline_tuned.named_steps["pipeline"].transform(X_train).shape

Before preprocessing, X_train.shape = 
(13500, 5)
After preprocessing, X_train_preprocessed.shape = 


(13500, 19)

### 05 Exporting the pipeline as a pickle 🥒

In [136]:
import pickle

# Serialise the tuned pipeline to disk as a pickle file.
with open('etl/_pipeline.pkl', mode='wb') as pickle_file:
    pickle.dump(pipeline_tuned, pickle_file)

In [137]:
# Reload the pickled pipeline. The `with` block guarantees the file handle is
# closed once loading finishes.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('etl/_pipeline.pkl', 'rb') as file:
    pipe = pickle.load(file)

In [138]:
# Show the single test row at position 4 (kept as a one-row DataFrame).
X_test.iloc[4:5]

Unnamed: 0,building_typology,building_gfa,year_built,occupancy,num_buildings
12998,Multifamily Housing,77677,1900,100,1


In [139]:
# Predict for the one-row slice of the test set and print the value truncated
# to an integer.
sample_prediction = pipe.predict(X_test[4:5])
print(f'Power demmand : {int(sample_prediction[0])}')

Power demmand : 515098
