---

# Utilities Pipeline

### 01 Preprocessor

In [40]:
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer


In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split

## Would load cleaned data using '''load.py''' once data cleaning is done
data = pd.read_csv('data/raw/csv/data_ext_nyc.csv', low_memory=False)

# create X and y
X = data.drop(columns='Electricity Use - Grid Purchase (kWh)')
y = data['Electricity Use - Grid Purchase (kWh)'].fillna(0)

# create train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [45]:
num_col = make_column_selector(dtype_include=['float64'])
cat_col = make_column_selector(dtype_include=['object','bool'])

num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
cat_transformer = OneHotEncoder(handle_unknown='ignore',)

preproc_basic = make_column_transformer(
    (num_transformer, num_col),
    (cat_transformer, cat_col),
    remainder='passthrough'
)

preproc_full = make_union(preproc_basic) # can add new column feature here too
preproc_full

### 02 Fitting Model 

In [50]:
from sklearn.linear_model import SGDRegressor

pipeline = make_pipeline(preproc_full, SGDRegressor()))
pipeline

# train pipeline
pipeline.fit(X_train, y_train)

# Score model
print(f'score: {pipeline.score(X_test,y_test)}')
pipeline

# Make predictions
print(f'prediction: {pipeline.predict(X_test.iloc[0:1])}')


score: -9.311187681956296e+29
prediction: [-3.47267854e+21]


### 03 Optimising pipeline

In [54]:
# which params are searchable
pipeline.get_params()

{'memory': None,
 'steps': [('featureunion',
   FeatureUnion(transformer_list=[('columntransformer',
                                   ColumnTransformer(remainder='passthrough',
                                                     transformers=[('pipeline',
                                                                    Pipeline(steps=[('simpleimputer',
                                                                                     SimpleImputer()),
                                                                                    ('standardscaler',
                                                                                     StandardScaler())]),
                                                                    <sklearn.compose._column_transformer.make_column_selector object at 0x000001EDCA770950>),
                                                                   ('onehotencoder',
                                                                    OneHotEncoder(ha

In [61]:
from sklearn.model_selection import RandomizedSearchCV

# which params are searchable
pipeline.get_params()

# define params to search 
param_distributions = {
    'featureunion__columntransformer__pipeline__simpleimputer__strategy': ['mean','median','most_frequent'],
    'sgdregressor__alpha': [0.0001,0.001, 0.01, 0.1],
    'sgdregressor__loss': ['huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    'sgdregressor__penalty': ['l2', 'l1', 'elasticnet', 'None']
}

randomized_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    scoring='r2'
)

randomized_search.fit(X_train,y_train)
randomized_search.best_params_

5 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\hramzan\AppData\Local\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\hramzan\AppData\Local\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\hramzan\AppData\Local\anaconda3\Lib\site-packages\sklearn\linear_model\_stochastic_gradient.py", line 1582, in fit
    self._validate_params()
  File "c:\Users\hramzan\AppData\Local\anaconda3\Lib\site-packages\sklearn\

{'sgdregressor__penalty': 'elasticnet',
 'sgdregressor__loss': 'huber',
 'sgdregressor__alpha': 0.001,
 'featureunion__columntransformer__pipeline__simpleimputer__strategy': 'most_frequent'}