---

# Utilities Pipeline

### 00 Loading Data

In [45]:
%load_ext autoreload
%autoreload 2

from sklearn.pipeline import make_pipeline, make_union, FunctionTransformer
from sklearn.compose import make_column_transformer, make_column_selector

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

import pandas as pd
pd.options.display.float_format = '{:.2f}'.format


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split
from etl.utilities import Utilities

## Would load cleaned data using '''load.py''' once data cleaning is done
data = Utilities().get_training_data()

# create X and y
X = data.drop(columns='electricity_demmand')
y = data['electricity_demmand']

# create train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
X_train_features = list(X_train.columns)
data.dtypes

building_typology       object
building_gfa             int64
year_built               int64
occupancy                int64
num_buildings            int64
electricity_demmand    float64
dtype: object

### 01 Preprocessor 🧮

In [47]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion

# Create a custom transformer that multiplies/divides two columns
occupied_area = FunctionTransformer(lambda df: pd.DataFrame({'occupied_area': df["building_gfa"] * df["occupancy"]}), validate=False)

feature_engineering = ColumnTransformer(
    [("occupied_area", occupied_area, X_train_features)],
    remainder = "passthrough"
)

# feature_engineering.fit_transform(X_train)
# # Apply the custom transformer to X_train
# pd.DataFrame(feature_engineering.fit_transform(X_train), columns=['occupied_area'] + X_train_features)

In [48]:
cat_features = [feature for feature in data.columns if data[feature].dtype == 'object']
num_features = [feature for feature in data.columns if data[feature].dtype in ['float64', 'int64'] and feature != 'electricity_demmand']

# scale and impute numerical features
num_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
])

# Encode categorical values
cat_transformer = OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, num_features),
    ('cat_transformer', cat_transformer, cat_features)],
        
    remainder='passthrough')


preprocessor

In [49]:
preprocessor = FeatureUnion([
    ('preprocessor', preprocessor),
    ('feature_engineering', Pipeline([
        ('occupied_area', occupied_area),
        ('occupied_area_scaler', StandardScaler())
    ]))
])

preprocessor

In [50]:
X_train_transformed = preprocessor.fit_transform(X_train)
X_train_transformed

print("Original training set")
display(X_train.head(3))

print("Preprocessed training set")
# display(pd.DataFrame(X_train_transformed, columns=['occupied_area'] + preprocessor.get_feature_names_out()).head(5))
display(pd.DataFrame(X_train_transformed))

TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' '<function occupied_area at 0x0000025E02466520>' (type <class 'function'>) doesn't

### 02 Fitting Model 🧠 

In [None]:
from sklearn.linear_model import SGDRegressor

pipeline = make_pipeline(preprocessor, SGDRegressor(random_state=13))
pipeline


In [None]:
# train pipeline
pipeline.fit(X_train, y_train)

# Score model
score = pipeline.score(X_test, y_test)
print(f'score: {round(score, 2)}')

score: 0.74


### 03 Optimising pipeline 🔍

In [None]:
# which params are searchable
pipeline.get_params()

{'memory': None,
 'steps': [('featureunion',
   FeatureUnion(transformer_list=[('preprocessor',
                                   ColumnTransformer(remainder='passthrough',
                                                     transformers=[('num_transformer',
                                                                    Pipeline(steps=[('imputer',
                                                                                     SimpleImputer()),
                                                                                    ('scaler',
                                                                                     StandardScaler())]),
                                                                    ['building_gfa',
                                                                     'year_built',
                                                                     'occupancy',
                                                                     'num_buildings']),
 

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# which params are searchable
pipeline.get_params()

# define params to search
param_distributions = {
    # imputer
     'featureunion__preprocessor__num_transformer__imputer__strategy': ['mean','median','most_frequent'],
    
    #scaler
    'featureunion__preprocessor__num_transformer__scaler__with_mean': [True, False],
    'featureunion__preprocessor__num_transformer__scaler__with_std': [True, False],

    # model
    # 'sgdregressor__alpha': [ 0.001, 0.0005, 0.0001],
    # 'sgdregressor__loss': ['huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    # 'sgdregressor__penalty': ['l2', 'l1', 'elasticnet', 'None'],
}

randomized_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    scoring='r2'
)

randomized_search.fit(X_train,y_train)
randomized_search.best_params_

{'featureunion__preprocessor__num_transformer__scaler__with_std': True,
 'featureunion__preprocessor__num_transformer__scaler__with_mean': True,
 'featureunion__preprocessor__num_transformer__imputer__strategy': 'most_frequent'}

In [None]:
pipeline_tuned = randomized_search.best_estimator_

# train tuned pipeline
pipeline_tuned.fit(X_train, y_train)

# Score tuned model
tuned_score = pipeline_tuned.score(X_test, y_test)

print(f'score: {round(tuned_score, 2)}')

score: 0.74


### 04 Debugging the pipe 🐞

In [None]:
# Access the components of a Pipeline with `named_steps`
pipeline_tuned.named_steps.keys()
# pipeline_tuned

dict_keys(['featureunion', 'sgdregressor'])

In [None]:
# Check intermediate steps
print("Before preprocessing, X_train.shape = ")
print(X_train.shape)
print("After preprocessing, X_train_preprocessed.shape = ")
pipeline_tuned.named_steps["featureunion"].fit_transform(X_train).shape # notice the extra columnns as a result of the ordinal encoder

Before preprocessing, X_train.shape = 
(13500, 5)
After preprocessing, X_train_preprocessed.shape = 


(13500, 18)

### 05 Exporting the pipeline as a pickle 🥒

In [None]:
import pickle

# export pipe as pickle file
with open('etl/_pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline_tuned, file)


PicklingError: Can't pickle <function <lambda> at 0x0000025E5EE66FC0>: attribute lookup <lambda> on __main__ failed

In [None]:
pipe = pickle.load(open('etl/_pipeline.pkl', 'rb'))

EOFError: Ran out of input

In [None]:
X_test[4:5]

Unnamed: 0,building_typology,building_gfa,year_built,occupancy,num_buildings
24958,Multifamily Housing,195000,1951,100,1


In [None]:
print(f'Power demmand : {int(pipe.predict(X_test[4:5])[0])}')

Power demmand : 9988496
