---

# Utilities Pipeline

### 01 Preprocessor

In [15]:
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer


In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw NYC extract.
# TODO: load the cleaned data via `load.py` once data cleaning is done.
data = pd.read_csv('data/raw/csv/data_ext_nyc.csv', low_memory=False)

# Column the model is trained to predict.
TARGET_COL = 'Electricity Use - Grid Purchase (kWh)'
# Fixed seed so the train/test split is identical after every kernel restart.
SPLIT_SEED = 42

# Features are every column except the target.
X = data.drop(columns=TARGET_COL)
# NOTE(review): missing targets are replaced with 0, which the model will
# treat as real zero-consumption observations -- confirm this is intended
# rather than dropping the rows with no recorded target.
y = data[TARGET_COL].fillna(0)

# Hold out 30% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SPLIT_SEED
)

In [37]:
# Column selectors are callables resolved against the DataFrame at fit time.
# 'number' matches integer as well as float columns, so integer features are
# imputed and scaled too instead of reaching the model unscaled through the
# passthrough remainder.
num_col = make_column_selector(dtype_include='number')
cat_col = make_column_selector(dtype_include=['object', 'bool'])

# Numeric features: fill gaps (SimpleImputer's default strategy), then standardise.
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
# Categorical features: one-hot encode. handle_unknown='ignore' encodes a
# category that was not seen during fit as all zeros instead of raising, so
# transforming the held-out split cannot fail on a rare category.
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Columns matched by neither selector are forwarded unchanged.
preproc_basic = make_column_transformer(
    (num_transformer, num_col),
    (cat_transformer, cat_col),
    remainder='passthrough'
)

# Single-member union: kept as the extension point for adding further
# feature branches alongside the basic preprocessor.
preproc_full = make_union(preproc_basic)
preproc_full

### 02 Fitting Model 

In [38]:
from sklearn.linear_model import SGDRegressor

# Full model: preprocessing followed by a linear regressor trained with SGD.
# random_state pins the regressor's data shuffling so repeated runs of the
# notebook produce the same fitted coefficients.
pipeline = make_pipeline(preproc_full, SGDRegressor(random_state=42))

# Train the pipeline; fit() returns the fitted pipeline, so as the cell's
# last expression it is also what the notebook displays.
pipeline.fit(X_train, y_train)

