<a href="https://colab.research.google.com/github/jeevan97achar/housing_ML_project/blob/main/Housing_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature Engineering with in-house software

We will set up all the feature engineering steps within a Scikit-learn pipeline, using open-source transformers plus those we develop in-house.

In [1]:
! pip install feature_engine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting feature_engine
  Downloading feature_engine-1.5.2-py2.py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.0/290.0 KB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature_engine
Successfully installed feature_engine-1.5.2


In [3]:
# data manipulation and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# for saving the pipeline
import joblib

# from Scikit-learn
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Binarizer

# from feature-engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer,
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder,
)

from feature_engine.transformation import (
    LogTransformer,
    YeoJohnsonTransformer,
)

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

import preprocessors as pp

# show every column when a dataframe is displayed (no truncation);
# uses the public pd.set_option entry point rather than the incidental
# pd.pandas attribute
pd.set_option('display.max_columns', None)

In [None]:
# load dataset
# reads 'train.csv' relative to the current working directory
data = pd.read_csv('train.csv')

# rows and columns of the data
print(data.shape)

# visualise the dataset
# (last expression of the cell, so the first rows are rendered as a table)
data.head()

In [5]:
# MSSubClass is listed among the categorical variables to encode,
# so store it with object dtype
data['MSSubClass'] = data['MSSubClass'].astype(object)

## Separate dataset into train and test
It is important to separate our data into training and testing sets.

When we engineer features, some techniques learn parameters from data. It is important to learn these parameters only from the train set. This is to avoid over-fitting.

Our feature engineering techniques will learn:

* mean
* mode
* exponents for the yeo-johnson
* category frequency
* and category to number mappings

from the train set.

Separating the data into train and test sets involves randomness; therefore, we need to set the seed so that the split is reproducible.

In [6]:
# Split into train and test sets.
# random_state fixes the seed of the shuffle so the split is reproducible.

X_train, X_test, y_train, y_test = train_test_split(
    # predictors: every column except the row identifier and the target
    data.drop(columns=['Id', 'SalePrice']),
    # target
    data['SalePrice'],
    # fraction of rows held out for the test set
    test_size=0.1,
    # seed
    random_state=0,
)

X_train.shape, X_test.shape

((1314, 79), (146, 79))

## Target
We apply the natural logarithm to the target (`SalePrice`).

In [7]:
# replace the target of both splits with its natural logarithm
# NOTE: the names are rebound, so re-running this cell alone would
# apply the logarithm a second time
y_train, y_test = np.log(y_train), np.log(y_test)

## Config

In [8]:
# --- variables with missing values in the train set ---

# categorical: passed to the imputer configured with
# imputation_method='frequent'
CATEGORICAL_VARS_WITH_NA_FREQUENT = [
    'MasVnrType',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'Electrical',
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
]

# categorical: passed to the imputer configured with
# imputation_method='missing'
CATEGORICAL_VARS_WITH_NA_MISSING = [
    'Alley',
    'FireplaceQu',
    'PoolQC',
    'Fence',
    'MiscFeature',
]

# numerical: get a missing indicator and mean imputation
NUMERICAL_VARS_WITH_NA = [
    'LotFrontage',
    'MasVnrArea',
    'GarageYrBlt',
]

# --- temporal variables ---

# year variables handled together with the reference variable below
TEMPORAL_VARS = [
    'YearBuilt',
    'YearRemodAdd',
    'GarageYrBlt',
]
REF_VAR = 'YrSold'

# --- numerical transformations ---

# variables to log transform
NUMERICALS_LOG_VARS = [
    'LotFrontage',
    '1stFlrSF',
    'GrLivArea',
]

# variables to transform with Yeo-Johnson
NUMERICALS_YEO_VARS = [
    'LotArea',
]

# variables to binarize (threshold is set where the pipeline is built)
BINARIZE_VARS = [
    'BsmtFinSF2',
    'LowQualFinSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
    'MiscVal',
]

# --- variables recoded through an explicit mapping ---
# each group is paired with its own mapping dictionary when the
# pipeline is built

QUAL_VARS = [
    'ExterQual',
    'ExterCond',
    'BsmtQual',
    'BsmtCond',
    'HeatingQC',
    'KitchenQual',
    'FireplaceQu',
    'GarageQual',
    'GarageCond',
]

EXPOSURE_VARS = ['BsmtExposure']

FINISH_VARS = ['BsmtFinType1', 'BsmtFinType2']

GARAGE_VARS = ['GarageFinish']

FENCE_VARS = ['Fence']

# --- remaining categorical variables ---
# these go through rare-label grouping and then ordinal encoding
CATEGORICAL_VARS = [
    'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
    'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
    'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating',
    'CentralAir', 'Electrical', 'Functional', 'GarageType', 'PavedDrive',
    'PoolQC', 'MiscFeature', 'SaleType', 'SaleCondition', 'MSSubClass',
]


# category -> number dictionaries used by the mapper steps

# quality / condition ratings
QUAL_MAPPINGS = {
    'Po': 1,
    'Fa': 2,
    'TA': 3,
    'Gd': 4,
    'Ex': 5,
    'Missing': 0,
    'NA': 0,
}

# basement exposure (no 'Missing'/'NA' entry is defined for this mapping)
EXPOSURE_MAPPINGS = {
    'No': 1,
    'Mn': 2,
    'Av': 3,
    'Gd': 4,
}

# basement finish type
FINISH_MAPPINGS = {
    'Missing': 0,
    'NA': 0,
    'Unf': 1,
    'LwQ': 2,
    'Rec': 3,
    'BLQ': 4,
    'ALQ': 5,
    'GLQ': 6,
}

# garage finish
GARAGE_MAPPINGS = {
    'Missing': 0,
    'NA': 0,
    'Unf': 1,
    'RFn': 2,
    'Fin': 3,
}

# fence
FENCE_MAPPINGS = {
    'Missing': 0,
    'NA': 0,
    'MnWw': 1,
    'GdWo': 2,
    'MnPrv': 3,
    'GdPrv': 4,
}

## Pipeline - Feature engineering

In [9]:
# Feature engineering pipeline. Steps run in the order listed; the step
# names are used later to look up the fitted transformers.
price_pipe = Pipeline(steps=[

    # ---------- imputation ----------
    # categorical variables imputed with the 'missing' method
    (
        'missing_imputation',
        CategoricalImputer(
            imputation_method='missing',
            variables=CATEGORICAL_VARS_WITH_NA_MISSING,
        ),
    ),

    # categorical variables imputed with the most frequent category
    (
        'frequent_imputation',
        CategoricalImputer(
            imputation_method='frequent',
            variables=CATEGORICAL_VARS_WITH_NA_FREQUENT,
        ),
    ),

    # flag missing numerical values before they are filled in
    (
        'missing_indicator',
        AddMissingIndicator(variables=NUMERICAL_VARS_WITH_NA),
    ),

    # fill missing numerical values with the mean
    (
        'mean_imputation',
        MeanMedianImputer(
            imputation_method='mean',
            variables=NUMERICAL_VARS_WITH_NA,
        ),
    ),

    # ---------- temporal variables ----------
    # in-house transformer; presumably expresses each temporal variable
    # relative to REF_VAR - confirm in the preprocessors module
    (
        'elapsed_time',
        pp.TemporalVariableTransformer(
            variables=TEMPORAL_VARS,
            reference_variable=REF_VAR,
        ),
    ),

    # the reference variable is dropped once the step above has used it
    (
        'drop_features',
        DropFeatures(features_to_drop=[REF_VAR]),
    ),

    # ---------- variable transformation ----------
    (
        'log',
        LogTransformer(variables=NUMERICALS_LOG_VARS),
    ),

    (
        'yeojohnson',
        YeoJohnsonTransformer(variables=NUMERICALS_YEO_VARS),
    ),

    # scikit-learn Binarizer restricted to the selected columns
    (
        'binarizer',
        SklearnTransformerWrapper(
            transformer=Binarizer(threshold=0),
            variables=BINARIZE_VARS,
        ),
    ),

    # ---------- mappers (in-house) ----------
    (
        'mapper_qual',
        pp.Mapper(variables=QUAL_VARS, mappings=QUAL_MAPPINGS),
    ),

    (
        'mapper_exposure',
        pp.Mapper(variables=EXPOSURE_VARS, mappings=EXPOSURE_MAPPINGS),
    ),

    (
        'mapper_finish',
        pp.Mapper(variables=FINISH_VARS, mappings=FINISH_MAPPINGS),
    ),

    (
        'mapper_garage',
        pp.Mapper(variables=GARAGE_VARS, mappings=GARAGE_MAPPINGS),
    ),

    (
        'mapper_fence',
        pp.Mapper(variables=FENCE_VARS, mappings=FENCE_MAPPINGS),
    ),

    # ---------- categorical encoding ----------
    # group infrequent categories (tolerance 0.01) before encoding
    (
        'rare_label_encoder',
        RareLabelEncoder(
            tol=0.01,
            n_categories=1,
            variables=CATEGORICAL_VARS,
        ),
    ),

    # replace categories with integers; per the feature-engine docs the
    # 'ordered' method numbers them by ascending mean of the target
    (
        'categorical_encoder',
        OrdinalEncoder(
            encoding_method='ordered',
            variables=CATEGORICAL_VARS,
        ),
    ),
])

In [10]:
# fit every step of the pipeline on the training split only
price_pipe.fit(X=X_train, y=y_train)

  loglike = -n_samples / 2 * np.log(trans.var(axis=0))
  w = xb - ((xb - xc) * tmp2 - (xb - xa) * tmp1) / denom
  tmp1 = (x - w) * (fx - fv)
  tmp2 = (x - v) * (fx - fw)


Pipeline(steps=[('missing_imputation',
                 CategoricalImputer(variables=['Alley', 'FireplaceQu', 'PoolQC',
                                               'Fence', 'MiscFeature'])),
                ('frequent_imputation',
                 CategoricalImputer(imputation_method='frequent',
                                    variables=['MasVnrType', 'BsmtQual',
                                               'BsmtCond', 'BsmtExposure',
                                               'BsmtFinType1', 'BsmtFinType2',
                                               'Electrical', 'GarageType',
                                               'GarageFinish', 'GarageQual',
                                               'GarageCon...
                 OrdinalEncoder(variables=['MSZoning', 'Street', 'Alley',
                                           'LotShape', 'LandContour',
                                           'Utilities', 'LotConfig',
                                           'Lan

In [11]:
# apply the fitted pipeline to the train split, then to the test split
# NOTE: the raw splits are overwritten here, so re-running this cell
# without re-creating them would feed already-transformed data back in
X_train, X_test = (
    price_pipe.transform(split) for split in (X_train, X_test)
)

In [12]:
# columns of the train set that still contain NA (expected: none)
X_train.columns[X_train.isnull().any()].tolist()

[]

In [13]:
# columns of the test set that still contain NA (expected: none)
X_test.columns[X_test.isnull().any()].tolist()


[]

In [14]:
# each fitted step keeps the parameters it learnt; here, the value
# that the frequent-category imputer stored for every variable

price_pipe['frequent_imputation'].imputer_dict_

{'MasVnrType': 'None',
 'BsmtQual': 'TA',
 'BsmtCond': 'TA',
 'BsmtExposure': 'No',
 'BsmtFinType1': 'Unf',
 'BsmtFinType2': 'Unf',
 'Electrical': 'SBrkr',
 'GarageType': 'Attchd',
 'GarageFinish': 'Unf',
 'GarageQual': 'TA',
 'GarageCond': 'TA'}

In [15]:
# preview the first rows of the transformed train set
X_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,SaleType,SaleCondition,LotFrontage_na,MasVnrArea_na,GarageYrBlt_na
930,9,3,4.290459,0.079663,1,2,1,3,1,0,0,19,2,1,3,3,8,5,2,2,0,0,10,10,1,0.0,4,3,4,4,3,3,6,16,1,0,1450,1466,2,5,1,3,7.290293,0,0,7.290293,0,0,2,0,3,1,4,7,4,0,0,3,2.0,3,3,610,3,3,2,100,18,0,0,0,0,0,0,2,0,7,2,3,0,0,0
656,9,3,4.276666,0.079663,1,2,1,1,1,0,0,8,2,1,3,3,5,7,49,2,0,0,6,6,2,54.0,4,3,2,3,3,1,5,806,1,0,247,1053,2,5,1,3,6.959399,0,0,6.959399,1,0,1,1,3,1,4,5,4,0,0,3,49.0,2,1,312,3,3,2,0,0,0,0,0,0,0,3,2,0,8,2,3,0,0,0
45,11,3,4.110874,0.079663,1,2,0,1,1,0,0,21,2,1,4,3,9,5,5,5,2,0,3,2,2,412.0,5,3,4,5,3,1,6,456,1,0,1296,1752,2,5,1,3,7.468513,0,0,7.468513,1,0,2,0,2,1,5,6,4,1,4,3,5.0,2,2,576,3,3,2,196,82,0,0,0,0,0,0,2,0,2,2,3,0,0,0
1348,9,3,4.246776,0.079663,1,2,2,2,1,0,0,10,2,1,3,3,7,5,9,9,0,0,10,10,1,0.0,4,3,4,4,3,4,6,1443,1,0,39,1482,2,5,1,3,7.309212,0,0,7.309212,1,0,2,0,3,1,4,5,4,1,2,3,9.0,2,2,514,3,3,2,402,25,0,0,0,0,0,0,2,0,8,2,3,1,0,0
55,9,3,4.60517,0.079663,1,2,1,1,1,0,0,8,2,1,3,3,6,5,44,44,0,0,6,7,2,272.0,3,3,2,3,3,1,4,490,1,0,935,1425,2,4,1,3,7.261927,0,0,7.261927,0,0,2,0,3,1,3,7,4,1,4,3,44.0,2,2,576,3,3,2,0,0,0,1,0,0,0,0,2,0,7,2,3,0,0,0
