---

# Utilities Pipeline

### 00 Loading Data

In [179]:
%load_ext autoreload
%autoreload 2

from sklearn.pipeline import make_pipeline, make_union, FunctionTransformer
from sklearn.compose import make_column_transformer, make_column_selector

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

import pandas as pd
pd.options.display.float_format = '{:.2f}'.format


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [180]:
import pandas as pd
from sklearn.model_selection import train_test_split
from etl.utilities import Utilities

## Would load cleaned data using '''load.py''' once data cleaning is done
data = Utilities().get_training_data()

# create X and y
X = data.drop(columns='electricity_demmand')
y = data['electricity_demmand']

# create train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
X_train_features = list(X_train.columns)
data.dtypes
data.head(3)

Unnamed: 0,building_typology,building_gfa,year_built,occupancy,num_buildings,electricity_demmand
0,Office,169416,1909,95,1,1920103.6
1,K-12 School,94380,1963,100,1,180640.0
3,Hotel,50000,1994,100,1,579335.2


In [181]:
# use this cell to analyse train data

### 01 Preprocessor 🧮 (1. feature engineering + 2. enocding/scaling)

In [182]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion

# Define feature engineering functions
def calculate_occupied_area(data):
    data["occupied_area"] = data['building_gfa'] * (data['occupancy'] * 0.01)
    return data

def calculate_new_build(data):
    data["new_build"] = 0  # Initialize to 0
    
    # Update to 1 for rows where the condition is met
    data.loc[data['year_built'] >= 2000, 'new_build'] = 1
    
    return data

# convert functions to transformers
occupied_area = FunctionTransformer(calculate_occupied_area, validate=False)
new_build = FunctionTransformer(calculate_new_build, validate=False)

# create feature engineering tranformer
feature_engineering = ColumnTransformer(
    [
        ("occupied_area", occupied_area, ['building_gfa','occupancy' ]),
        ("new_build", new_build,['year_built'] )
    ],
    remainder="passthrough"
).set_output(transform = "pandas")

pd.DataFrame(feature_engineering.fit_transform(X_train))
# transformer_names = list(feature_engineering.named_transformers_.keys())
# feature_engineering.



Unnamed: 0,occupied_area__building_gfa,occupied_area__occupancy,occupied_area__occupied_area,new_build__year_built,new_build__new_build,remainder__building_typology,remainder__num_buildings
19213,116940,75,87705.00,1935,0,Multifamily Housing,1
3192,325062,100,325062.00,2005,1,Multifamily Housing,1
8186,169124,100,169124.00,1958,0,Multifamily Housing,1
25871,75568,90,68011.20,1900,0,Hotel,1
27496,40189,100,40189.00,1924,0,Multifamily Housing,1
...,...,...,...,...,...,...,...
15763,187996,100,187996.00,2016,1,Multifamily Housing,1
16698,26400,100,26400.00,1910,0,Multifamily Housing,1
28836,33302,90,29971.80,1910,0,Multifamily Housing,1
24098,60540,100,60540.00,1938,0,Multifamily Housing,1


In [183]:
# OPTION 01 Encoding and Scaling
num_preproc = Pipeline([
    ("num_imputer", SimpleImputer(strategy = "constant", fill_value=0.)),
    ("scaler", StandardScaler())
])

cat_preproc = Pipeline([
    ("cat_imputer", SimpleImputer(strategy = "constant", fill_value="Missing")),
    ("ohe", OneHotEncoder(handle_unknown = "ignore", sparse_output=False))
])

preprocessor = ColumnTransformer([
    ("num_tr", num_preproc, make_column_selector(dtype_include = ["float64", "int64"])),
    ("cat_tr", cat_preproc, make_column_selector(dtype_include = ["object"]))
])
preprocessor
pd.DataFrame(preprocessor.fit_transform(X_train))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,-0.02,-0.35,-3.88,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00
1,1.16,1.87,0.17,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00
2,0.28,0.38,0.17,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00
3,-0.25,-1.45,-1.45,-0.11,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,-0.45,-0.70,0.17,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13495,0.38,2.21,0.17,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00
13496,-0.53,-1.14,0.17,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00
13497,-0.49,-1.14,-1.45,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00
13498,-0.34,-0.25,0.17,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00


In [184]:
# # OPTION 02
# cat_features = [feature for feature in data.columns if data[feature].dtype == 'object']
# num_features = [feature for feature in data.columns if data[feature].dtype in ['float64', 'int64'] and feature != 'electricity_demmand']

# # scale and impute numerical features
# num_transformer = Pipeline([
#     ('imputer', SimpleImputer()),
#     ('scaler', StandardScaler())
# ])

# # Encode categorical values
# cat_transformer = OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)

# preprocessor = ColumnTransformer([
#     ('num_transformer', num_transformer, num_features),
#     ('cat_transformer', cat_transformer, cat_features)],
        
#     remainder='passthrough')


# preprocessor

In [185]:
preprocessor = Pipeline([
    ("feature_engineering", feature_engineering),
    ("preprocessing", preprocessor)]).set_output(transform = "pandas")

preprocessor



In [186]:
X_train_transformed = preprocessor.fit_transform(X_train)
X_train_transformed

print("Original training set")
display(X_train.head(3))

print("Preprocessed training set")
# display(pd.DataFrame(X_train_transformed, columns=['occupied_area'] + preprocessor.get_feature_names_out()).head(5))
display(pd.DataFrame(X_train_transformed))

Original training set


Unnamed: 0,building_typology,building_gfa,year_built,occupancy,num_buildings
19213,Multifamily Housing,116940,1935,75,1
3192,Multifamily Housing,325062,2005,100,1
8186,Multifamily Housing,169124,1958,100,1


Preprocessed training set


Unnamed: 0,num_tr__occupied_area__building_gfa,num_tr__occupied_area__occupancy,num_tr__occupied_area__occupied_area,num_tr__new_build__year_built,num_tr__new_build__new_build,num_tr__remainder__num_buildings,cat_tr__remainder__building_typology_College/University,cat_tr__remainder__building_typology_Courthouse,cat_tr__remainder__building_typology_Hospital (General Medical & Surgical),cat_tr__remainder__building_typology_Hotel,cat_tr__remainder__building_typology_K-12 School,cat_tr__remainder__building_typology_Laboratory,cat_tr__remainder__building_typology_Multifamily Housing,cat_tr__remainder__building_typology_Museum,cat_tr__remainder__building_typology_Office,cat_tr__remainder__building_typology_Other - Mall,cat_tr__remainder__building_typology_Performing Arts,cat_tr__remainder__building_typology_Prison/Incarceration,cat_tr__remainder__building_typology_Retail Store
19213,-0.02,-3.88,-0.18,-0.35,-0.36,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00
3192,1.16,0.17,1.22,1.87,2.79,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00
8186,0.28,0.17,0.30,0.38,-0.36,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00
25871,-0.25,-1.45,-0.30,-1.45,-0.36,-0.11,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
27496,-0.45,0.17,-0.46,-0.70,-0.36,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15763,0.38,0.17,0.41,2.21,2.79,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00
16698,-0.53,0.17,-0.54,-1.14,-0.36,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00
28836,-0.49,-1.45,-0.52,-1.14,-0.36,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00
24098,-0.34,0.17,-0.34,-0.25,-0.36,-0.11,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00


### 02 Fitting Model 🧠 

In [187]:
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor

pipeline = make_pipeline(preprocessor, GradientBoostingRegressor())
pipeline


In [188]:
# train pipeline
pipeline.fit(X_train, y_train)

# Score model
score = pipeline.score(X_test, y_test)
print(f'score: {round(score, 2)}')

score: 0.84


### 03 Optimising pipeline 🔍

In [189]:
# which params are searchable
pipeline.get_params()

{'memory': None,
 'steps': [('pipeline', Pipeline(steps=[('feature_engineering',
                    ColumnTransformer(remainder='passthrough',
                                      transformers=[('occupied_area',
                                                     FunctionTransformer(func=<function calculate_occupied_area at 0x000001EDA53BFBA0>),
                                                     ['building_gfa',
                                                      'occupancy']),
                                                    ('new_build',
                                                     FunctionTransformer(func=<function calculate_new_build at 0x000001EDA49B3740>),
                                                     ['year_built'])])),
                   ('preprocessing',
                    Col...
                                                                      StandardScaler())]),
                                                     <sklearn.compose._column_trans

In [190]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# which params are searchable
pipeline.get_params()

# define params to search
param_distributions = {
    # imputer
     'pipeline__preprocessing__num_tr__num_imputer__strategy': ['mean','median','most_frequent'],
    
    #scaler
    'pipeline__preprocessing__num_tr__scaler__with_mean': [True, False],
    'pipeline__preprocessing__num_tr__scaler__with_std': [True, False],

    # model
    # 'sgdregressor__alpha': [ 0.001, 0.0005, 0.0001],
    # 'sgdregressor__loss': ['huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    # 'sgdregressor__penalty': ['l2', 'l1', 'elasticnet', 'None'],
    'gradientboostingregressor__n_estimators': np.arange(50, 100, 10),
    'gradientboostingregressor__learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    'gradientboostingregressor__max_depth': [3, 5, 7, 9],
    'gradientboostingregressor__min_samples_split': [2, 5, 10],
    'gradientboostingregressor__min_samples_leaf': [1, 2, 4],
    'gradientboostingregressor__subsample': [0.8, 0.9, 1.0]
}

randomized_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    scoring='r2'
)

randomized_search.fit(X_train,y_train)
randomized_search.best_params_

{'pipeline__preprocessing__num_tr__scaler__with_std': False,
 'pipeline__preprocessing__num_tr__scaler__with_mean': False,
 'pipeline__preprocessing__num_tr__num_imputer__strategy': 'mean',
 'gradientboostingregressor__subsample': 0.9,
 'gradientboostingregressor__n_estimators': 70,
 'gradientboostingregressor__min_samples_split': 5,
 'gradientboostingregressor__min_samples_leaf': 2,
 'gradientboostingregressor__max_depth': 5,
 'gradientboostingregressor__learning_rate': 0.1}

In [191]:
pipeline_tuned = randomized_search.best_estimator_

# Score tuned model
tuned_score = pipeline_tuned.score(X_test, y_test)

print(f'score: {round(tuned_score, 2)}')

score: 0.81


### 04 Debugging the pipe 🐞

In [192]:
# Access the components of a Pipeline with `named_steps`
pipeline_tuned.named_steps.keys()
# pipeline_tuned

dict_keys(['pipeline', 'gradientboostingregressor'])

In [193]:
# Check intermediate steps
print("Before preprocessing, X_train.shape = ")
print(X_train.shape)
print("After preprocessing, X_train_preprocessed.shape = ")
pipeline_tuned.named_steps["pipeline"].fit_transform(X_train).shape # notice the extra columnns as a result of the ordinal encoder

Before preprocessing, X_train.shape = 
(13500, 5)
After preprocessing, X_train_preprocessed.shape = 


(13500, 19)

### 05 Exporting the pipeline as a pickle 🥒

In [194]:
import pickle

# export pipe as pickle file
with open('etl/_pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline_tuned, file)

In [195]:
pipe = pickle.load(open('etl/_pipeline.pkl', 'rb'))

In [196]:
X_test[4:5]

Unnamed: 0,building_typology,building_gfa,year_built,occupancy,num_buildings
16284,Multifamily Housing,147000,1954,100,1


In [197]:
print(f'Power demmand : {int(pipe.predict(X_test[4:5])[0])}')

Power demmand : 779285
