# Feature Engineering

In [None]:
import pickle

import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.preprocessing import PolynomialFeatures, KBinsDiscretizer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Load the housing training data; the first CSV column becomes the row index.
df = pd.read_csv(filepath_or_buffer='../data/housing/housing_train.csv', index_col=0)
# Preview the first three rows.
df.head(n=3)

In [None]:
# inspect all columns
# df.info()

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
train, val = train_test_split(df, random_state=42, test_size=0.2)

# Positional split: the last column is taken as the target, all columns before it as features.
Xtrain, ytrain = train.iloc[:, :-1], train.iloc[:, -1]
Xval, yval = val.iloc[:, :-1], val.iloc[:, -1]

## Why Feature Engineering?

**We want to make our data easy for the model to understand.**

* some features are strings
* some features have weird distributions
* some features are redundant
* a dataset might consist of multiple subsets worth dividing
* linear models can use many features, but each feature gets only one coefficient
* more features -> more information -> better predictions

Goal: non-redundant, clearly distributed, numerical features

### Feature Engineering with ColumnTransformers and Pipelines

In [None]:
# Per-column preprocessing. Each entry is ('name', transformer, column names).
column_trans = ColumnTransformer(
    [
        # One-hot encode the categorical columns. handle_unknown='ignore' encodes a
        # category that was not seen during fit as all zeros instead of raising, so
        # transforming the validation split cannot fail on a rare category.
        ('cat-to-binary', OneHotEncoder(handle_unknown='ignore'), ['Street', 'Utilities', 'LotShape']),
        # Discretize 'LotArea' into 5 bins and one-hot encode the bin membership.
        ('binning', KBinsDiscretizer(n_bins=5, encode='onehot'), ['LotArea']),
        # Forward these columns unchanged.
        ('do nothing', 'passthrough', ['OverallQual', 'YrSold']),
    ],
    # Always return a dense array. This replaces OneHotEncoder(sparse=False): the
    # `sparse` argument was deprecated in scikit-learn 1.2 and removed in later
    # releases, whereas sparse_threshold=0 works on old and new versions alike.
    sparse_threshold=0,
)

In [None]:
# Learn the encodings from the training features and apply them in one call.
# The result has one row per data point and one column per engineered feature;
# it is cast to integers before inspection.
Xt = column_trans.fit_transform(Xtrain).astype(int)
Xt[0]

In [None]:
# Full model: column-wise feature engineering -> pairwise interaction terms
# (no bias column, no squared terms) -> min-max scaling -> linear regression.
# LinearRegression is a starting point; for better statistics (p-values) use statsmodels.
pipe = make_pipeline(
    column_trans,
    PolynomialFeatures(include_bias=False, interaction_only=True),
    MinMaxScaler(),
    LinearRegression(),
)

In [None]:
# Fit the whole pipeline on the training split, then predict on the same rows
# to obtain in-sample predictions (fit returns the fitted pipeline itself).
ypred = pipe.fit(Xtrain, ytrain).predict(Xtrain)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Mean squared error on the training split (in-sample error).
mean_squared_error(y_true=ytrain, y_pred=ypred)

In [None]:
# Mean squared error on the held-out validation split, using the already fitted pipeline.
ypred_val = pipe.predict(Xval)
mean_squared_error(y_true=yval, y_pred=ypred_val)