## Global Imports

In [324]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from pprint import pprint
from sklearn.pipeline import TransformerMixin,Pipeline,FeatureUnion,make_pipeline,make_union
from sklearn.base import BaseEstimator
from sklearn.preprocessing import LabelEncoder,FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

sns.set()


def display_all(df):
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000): 
        display(df)



In [325]:
raw_train_data = pd.read_csv('data/train.csv')
print(f'Train Dataset size = {raw_train_data.shape[0]}')
raw_test_data = pd.read_csv('data/test.csv')
print(f'Test Dataset size = {raw_test_data.shape[0]}')
IDS = raw_test_data.Id

# Convert SalePrice to log value
raw_train_data.SalePrice = np.log(raw_train_data.SalePrice)

Train Dataset size = 1460
Test Dataset size = 1459


## Creating Pipeline

+ Pipelines solves some major issues relating pre-processing data for consumption of model. Often we end up with notebooks which have messy structure for preprocessing, in worst case we might end up leaking our data, as its a common pratice to merge train and test data to simplify preprocessing - This strategy isn't scalable, and falls apart fairly quick.

+ Our pipeline will manage continuous as well as categorical data, We indend to, 
    - encoding categorical data 
    - optionally provide One Hot Encoding (not implemented yet)
    - Treat missing values
        - For categorical, use most repeated values or simply replace them with a comman label
        - For Continuous, either use median value
    - And more, but this notebook will restrict to most common preprocessing steps.

#### Overview

+ Our **Final Pipeline**, will consist of a **Feature Union** which would compose **Continuous and Categorical Pipelines**.


### Creating Custom Transformers

---

#### We create following transformers:

+ **CategoricalDataFilter**
    - retrieving Categorical features

+ **ContinuousDataFilter**
    - retrieving Continuous feature

+ **CategoricalMapper**
    - encoding categorical feature values to integer space

+ **MedianImputer**
    - filling missing features with median values of each feature.
    
---

#### Why we need custom transformer?

Custom Transformer allow us to store feature specific values during fit, these can be used in transform and eventually this will ensure that data leak does not occur. 

Eg. We use median values extracted from data which is used for fitting, and use the same in transforming


#### Note: We have used **FunctionTransformer**, basically its a convinence function in which we don't have to provide logic for fit, we can tranform the data and return the transformed data, it would act just like a custom transformer.

In [326]:
class CategoricalDataFilter(TransformerMixin,BaseEstimator):
    """Transformer for extracting categorical features, based on number of unique values present in a feature"""
    
    def __init__(self,max_unique=26):
        """
            Parameters:
            ----------
            max_unique: maximum unique values permissable for feature to be considered as Categorical feature 
        """        
        self.max_unique = max_unique        
        # capture categorical feature names, return these features in transform.
        self.categorical_features = []
        
    def fit(self,X,y=None):                
        assert type(X) == pd.DataFrame

        for feature in X.columns:
            n_unique = X[feature].nunique()
            # Categorical Feature only if n_unqiue is less then max_unique
            if n_unique <= self.max_unique: 
                self.categorical_features.append(feature)        
        
        return self
    
    def transform(self,X,y=None):        
        return X[self.categorical_features]
        

class ContinuousDataFilter(BaseEstimator,TransformerMixin):
    """
        Transformer for extracting continuous features, based on number of unique values present in a feature
    """
        
    def __init__(self,min_unique=26):  
        """
            parameters:
            -----------
            min_unique: minimum number of unique values a feature should have to be considered continuous type
        """
        self.min_unique = min_unique        
        self.continuous_features = []
        
    def fit(self,X,y=None):                
        assert type(X) == pd.DataFrame

        for feature in X.columns:
            n_unique = X[feature].nunique()
            if n_unique > self.min_unique:
                self.continuous_features.append(feature)        
        
        return self
    
    def transform(self,X,y=None):        
        return X[self.continuous_features].astype('category')        
    
        

    
    
    
cat_df = CategoricalDataFilter().fit(raw_train_data).transform(raw_train_data)

cont_df = ContinuousDataFilter().fit(raw_train_data).transform(raw_train_data)

display(len(cat_df.columns))

display(len(cont_df.columns))

display(cat_df.columns)

61

20

Index(['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
       'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
       'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', 'LowQualFinSF', 'BsmtFullBath',
       'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
       'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces',
       'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageCars', 'GarageQual',
       'GarageCond', 'PavedDrive', '3SsnPorch', 'PoolArea', 'PoolQC', 'Fence',
       'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
       'SaleCondition'],
      dtype='object')

In [327]:
class CategoricalMapper(BaseEstimator,TransformerMixin):
    """
        Transformer for handling categorical value mapping, It ensures that the categories found in 
        fitting are only used for transforming, hence avoiding data leak. 
        
        Any new feature value found in transform would be mapped to 'unkown' i.e. -1 nominal value
    """
    def __init__(self):        
        
        # Preserve the mapping from feature values to encoded values.
        self.label_encoders = {}        
        
        
    def fit(self,X,y=None):        
        """ Encode the feature values"""
        assert type(X) == pd.DataFrame
        

        for feature in X:        
            feature_values = X[feature].value_counts().index.values
            label_dict = dict()
            for idx,label in enumerate(feature_values): 
                label_dict[label] = idx
            label_dict['unkown'] = -1 
            self.label_encoders[feature] = label_dict

        return self
    
    def transform(self,X,y=None):
        assert type(X) == pd.DataFrame
        X_copy = X.copy()
        for feature_name in X.columns:
            encoder = self.label_encoders[feature_name]
            for v in X[feature_name].value_counts().index.values:
                if(v not in encoder.keys()):
                    X_copy.replace(v,-1,inplace=True)
                else:
                    X_copy.replace(v,encoder[v],inplace=True)
        return X_copy
            
def FillNaWithMissing(X):    
    """
        Applicable only on categorical data.
        
        Replace 'na' values with 'missing' string, application only for categorical variables
    """
    res = X.copy()
    for fn in X.columns:    
        res[fn] = X[fn].astype('object').fillna('Missing')
    return res


CategoricalMapper().fit_transform(FillNaWithMissing(cat_df))


Unnamed: 0,MSSubClass,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,3SsnPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,7,5,5,5,5,5,5,5,5,7,...,5,5,5,5,5,5,5,3,0,0
1,5,5,5,5,5,5,5,10,5,22,...,5,5,5,5,5,5,5,1,0,0
2,7,5,5,5,7,5,5,5,5,7,...,5,5,5,5,5,5,5,3,0,0
3,7,5,5,5,7,5,5,7,5,10,...,5,5,5,5,5,5,5,2,0,2
4,7,5,5,5,7,5,5,10,5,13,...,5,5,5,5,5,5,8,3,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,7,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,10,1,0,0
1456,5,5,5,5,5,5,5,5,5,10,...,5,5,5,10,5,5,5,4,0,0
1457,7,5,5,5,5,5,5,5,5,10,...,5,5,5,11,5,15,5,4,0,0
1458,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,10,4,0,0


In [328]:

class MedianImputer(BaseEstimator,TransformerMixin):
    """
        Transformer to impute missing or na fields with median value of the feature.
        
        Uses in SimplyImputer of sklearn internally
        
        Note: Only Numeric data is allowed
    """
    
    def __init__(self):
        # Store median values of feature in fit, used while transforming
        self.features = []
        self.imputer = None
        
    def fit(self,X,y=None):
        assert type(X) == pd.DataFrame
        
        self.imputers = SimpleImputer(strategy='median').fit(X)
        self.features = X.columns
            
        return self
    
    def transform(self,X,y=None):        
        assert type(X) == pd.DataFrame
        
        for x_feature in X.columns:
            if x_feature not in self.features:
                raise Exception("Feature not found")
            
        return self.imputers.transform(X)

## Final Assembling of Pipelines

#### A short description about functions we about to see.

##### ```make_pipeline```

* It Constructs a Pipeline from the given transformers/estimators.
* This is a shorthand for the Pipeline constructor. 
* make_pipeline internally maps names of tansformers as their lowercase form
* Usual form of Pipeline function is:

```python
Pipeline(steps=[('standardscaler',StandardScaler()),
                ('gaussiannb', GaussianNB())])

```

* In a Pipeline, a estimator is the last item in transformer list. If your using a estimator, make sure that it's the occupies the last place. 

* In simplified version, a pipeline simply serialize all transformers in the given order and fit that to the estimator in the end. However is a estimator is not present then it would simply return the last transformers output.

* We made some multiple Pipeline object, two with only tranformers which are Continuous and Categorical transformer, and last one is the final pipeline which feeds the final neat data to our estimator.

#####  ```make_union```

* Similar to make_pipeline, This is a convinience function for Constructing a FeatureUnion from the given transformers.
* This is a shorthand for the FeatureUnion constructor

* A FeatureUnion composes output of independent transformers, i.e. every transformer runs in parallel.

* We use FeatureUnion to compose output of Continoues and Categorical pipelines



In [329]:
train_data = raw_train_data.drop('SalePrice',axis=1)
output_y = raw_train_data.SalePrice

X_train,X_test,y_train,y_test = train_test_split(train_data,output_y,test_size=0.2)

categorical_pipeline = make_pipeline(CategoricalDataFilter(),
                             FunctionTransformer(FillNaWithMissing,validate=False),
                             CategoricalMapper()) 

continuous_pipeline = make_pipeline(ContinuousDataFilter(),
                             MedianImputer())


feature_union = make_union(continuous_pipeline,categorical_pipeline)

model = RandomForestRegressor()
final_pipeline = make_pipeline(feature_union,model)

final_pipeline.fit(X_train,y_train)
final_pipeline.score(X_test,y_test)



0.7904458193861086