<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Custome-transformers" data-toc-modified-id="Custome-transformers-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Custome transformers</a></span></li><li><span><a href="#Pipeline-creation" data-toc-modified-id="Pipeline-creation-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Pipeline creation</a></span></li><li><span><a href="#Execution-of-pipeline" data-toc-modified-id="Execution-of-pipeline-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Execution of pipeline</a></span></li></ul></div>

## Imports

In [45]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

%matplotlib inline

In [2]:
cwd = os.getcwd()

# Directory holding the project CSV files. The default is the original author's
# local path; set the HCDR_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
data_location = os.environ.get(
    "HCDR_DATA_DIR",
    r"C:/Study/IUMSDS/Spring2020/I526_Applied_ML/Project/dataset/",
)
datafile = "previous_application.csv"

# Raw previous-application records used by the cells below.
df_prev_app = pd.read_csv(os.path.join(data_location, datafile))

## Custome transformers

In [94]:
class DfFeatureAdder(BaseEstimator, TransformerMixin):
    """
    FeatureAdder class for 'previous_applications' dataset of HCDR project. Added features:
    1) TIME: bucket of HOUR_APPR_PROCESS_START -
       EARLY_MORN [0-8], MORN [9-12], AFTERNOON [13-17], EVENING [18-23]
    2) CREDIT_TO_APP_RATIO: AMT_CREDIT / AMT_APPLICATION
       (AMT_CREDIT is first set to 0 on rows where AMT_APPLICATION == 0)
    3) APPLIED_EXTRA: string flag, '1' when AMT_APPLICATION > AMT_GOODS_PRICE, else '0'
    4) WEEKDAY: bucket of WEEKDAY_APPR_PROCESS_START -
       START (Mon-Tue), MID (Wed-Fri), END (Sat-Sun)

    The source columns AMT_GOODS_PRICE, WEEKDAY_APPR_PROCESS_START and
    HOUR_APPR_PROCESS_START are dropped from the output, so `selected_features`
    must contain them together with AMT_APPLICATION and AMT_CREDIT.

    Input: pd.DataFrame
    Output: pd.DataFrame with Selected and newly created features
    """

    # Lookup tables used with Series.map; values missing from a table become NaN.
    _WEEKDAY_BUCKETS = {
        'MONDAY': 'START', 'TUESDAY': 'START',
        'WEDNESDAY': 'MID', 'THURSDAY': 'MID', 'FRIDAY': 'MID',
        'SATURDAY': 'END', 'SUNDAY': 'END',
    }
    _HOUR_BUCKETS = {
        **dict.fromkeys(range(0, 9), 'EARLY_MORN'),
        **dict.fromkeys(range(9, 13), 'MORN'),
        **dict.fromkeys(range(13, 18), 'AFTERNOON'),
        **dict.fromkeys(range(18, 24), 'EVENING'),
    }
    _SOURCE_COLUMNS_TO_DROP = ['AMT_GOODS_PRICE', 'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START']

    def __init__(self, selected_features):
        # List of column names to carry over from the input frame.
        self.selected_features = selected_features

    def fit(self, X, y=None):
        """
        Input X: input dataframe
        Input y: predicted feature, default=None
        Output : class instance

        No action needed by `fit`. Returns the class instance.
        """
        return self

    def transform(self, X):
        """
        Input X: input dataframe
        Output : dataframe with selected and engineered features,
                 or None when any step raises (the error is printed, not re-raised)
        """

        try:
            # Work on an explicit copy so the assignments below never touch (or
            # warn about touching) the caller's frame.
            out_df = X[self.selected_features].copy()

            # Rows with a zero application amount get credit 0 before the division.
            out_df.loc[out_df['AMT_APPLICATION'] == 0, 'AMT_CREDIT'] = 0
            out_df['CREDIT_TO_APP_RATIO'] = out_df['AMT_CREDIT'] / out_df['AMT_APPLICATION']

            # Kept as strings so the flag is treated as a categorical (object) column downstream.
            out_df['APPLIED_EXTRA'] = '0'
            out_df.loc[out_df['AMT_APPLICATION'] > out_df['AMT_GOODS_PRICE'], 'APPLIED_EXTRA'] = '1'

            out_df['WEEKDAY'] = out_df['WEEKDAY_APPR_PROCESS_START'].map(self._WEEKDAY_BUCKETS)
            out_df['TIME'] = out_df['HOUR_APPR_PROCESS_START'].map(self._HOUR_BUCKETS)

            out_df = out_df.drop(columns=self._SOURCE_COLUMNS_TO_DROP)

            msg = f"\n{'='*100}\nDfFeatureAdder.transform: Success! Transformed DF of {X.shape} to {out_df.shape} shape.\nFeature list: {list(out_df.columns)}\n{'='*100}\n"
            print(f"\033[92m {msg}\033[00m")

            return out_df
        except Exception as e:
            msg = f"\n{'='*100}\nDfFeatureAdder.transform: Excpetion occurred. Please check - {e}\n{'='*100}\n"
            print(f"\033[91m {msg}\033[00m")
            return None

        
class DfImputer(BaseEstimator, TransformerMixin):
    """
    Based on: sklearn.impute.SimpleImputer
    Input: pd.DataFrame of numerical or categorical features
    Output: pd.DataFrame

    This class uses sklearn SimpleImputer to replace nulls with values indicated by 'strategy'.

    Parameters
    feature_type   : "cat" (object-dtype columns) or "num" (all other columns)
    feature_list   : columns to impute; when empty/None they are picked from X by dtype
    impute_strategy: SimpleImputer strategy; defaults to "constant" for "cat" and "mean" for "num"
    impute_value   : SimpleImputer fill_value; defaults to "missing" for "cat" and None for "num"

    The key columns SK_ID_CURR and SK_ID_PREV are always excluded from the output.
    NOTE: the imputer is fitted inside `transform`, on the frame being transformed.
    """

    def __init__(self, feature_type, feature_list=None, impute_strategy=None, impute_value=None):
        self.feature_type = feature_type
        self.feature_list = feature_list
        self.impute_strategy = impute_strategy
        self.impute_value = impute_value

    def fit(self, X, y=None):
        """
        Input X: input dataframe
        Input y: predicted feature, default=None
        Output : class instance

        No action needed by `fit`. Returns the class instance.
        """
        return self

    def transform(self, X):
        """
        Input X: input dataframe
        Output : Imputes null values using sklearn.impute.SimpleImputer and returns a pandas
                 dataframe indexed like X, or None when the imputation raises
                 (the error is printed, not re-raised)
        Raises : ValueError when no feature_list is given and feature_type is not "cat"/"num"
        """
        key_attribs = ['SK_ID_CURR', 'SK_ID_PREV']

        # Resolve the effective settings into locals: the constructor parameters are
        # left untouched so the instance can be reused on a frame with other columns.
        feature_list = self.feature_list
        if not feature_list:
            if self.feature_type == "cat":
                feature_list = list(X.select_dtypes(include='object').columns)
            elif self.feature_type == "num":
                feature_list = list(X.select_dtypes(exclude='object').columns)
            else:
                raise ValueError(
                    f"DfImputer: feature_type must be 'cat' or 'num' when no feature_list is given, got {self.feature_type!r}"
                )
        feature_list = [i for i in feature_list if i not in key_attribs]

        np_array = X[feature_list].values

        impute_strategy = self.impute_strategy
        if not impute_strategy:
            if self.feature_type == "cat":
                impute_strategy = "constant"
            elif self.feature_type == "num":
                impute_strategy = "mean"
            else:
                print("transform(): invalid feature_type - ", self.feature_type)
                impute_strategy = "mean"

        print(f"Using {impute_strategy} on {feature_list}")

        impute_value = self.impute_value
        if not impute_value:
            if self.feature_type == "cat":
                impute_value = "missing"
            elif self.feature_type == "num":
                impute_value = None
            else:
                print("transform(): invalid feature_type - ", self.feature_type)
                impute_value = None

        try:
            imputed = SimpleImputer(strategy=impute_strategy, fill_value=impute_value).fit_transform(np_array)
            # Keep the caller's row index so later column-wise concatenation lines rows up.
            out_df = pd.DataFrame(imputed, columns=feature_list, index=X.index)

            msg = f"\n{'='*100}\nDfImputer.transform: Success! Imputed a DF of {X.shape} to {out_df.shape} shape.\nFeature list: {out_df.columns}\n{'='*100}\n"
            print(f"\033[92m {msg}\033[00m")

            return out_df
        except Exception as e:
            msg = f"\n{'='*100}\nDfImputer.transform: Excpetion occurred. Please check - {e}\n{'='*100}\n"
            print(f"\033[91m {msg}\033[00m")
            return None
        
class DfScaler(BaseEstimator, TransformerMixin):
    """
    Based on: sklearn.preprocessing.StandardScaler
    Input: pd.DataFrame of numerical features
    Output: scaled pd.DataFrame

    This class uses sklearn StandardScaler to standardise the features of input data frame.
    NOTE: the scaler is fitted inside `transform`, on the frame being transformed.
    """

    def __init__(self):
        # No parameters; StandardScaler comes from the notebook's import cell.
        pass

    def fit(self, X, y=None):
        """
        Input X: input dataframe
        Input y: predicted feature, default=None
        Output : class instance

        No action needed by `fit`. Returns the class instance.
        """
        return self

    def transform(self, X):
        """
        Input X: input dataframe
        Output : Pandas dataframe scaled using sklearn StandardScaler, with the same
                 columns and row index as X, or None when scaling raises
                 (the error is printed, not re-raised)
        """

        try:
            scaled = StandardScaler().fit_transform(X.values)
            # Keep the caller's row index so later column-wise concatenation lines rows up.
            out_df = pd.DataFrame(scaled, columns=X.columns, index=X.index)

            msg = f"\n{'='*100}\nDfScaler.transform: Success! Scaled a DF of {X.shape} to {out_df.shape} shape.\nFeature list: {out_df.columns}\n{'='*100}\n"
            print(f"\033[92m {msg}\033[00m")

            return out_df
        except Exception as e:
            msg = f"\n{'='*100}\nDfScaler.transform: Excpetion occurred. Please check - {e}\n{'='*100}\n"
            print(f"\033[91m {msg}\033[00m")
            return None
        
class DfOneHotEncoder(BaseEstimator, TransformerMixin):
    """
    Based on: sklearn.preprocessing.OneHotEncoder
    Input: pd.DataFrame of categorical features
    Output: One hot encoded pd.DataFrame

    This class uses sklearn OneHotEncoder to one hot encode features of input data frame and create new features with 1/0 binary values.
    It uses a dense-output OneHotEncoder with handle_unknown="ignore" and integer dtype.
    NOTE: the encoder is fitted inside `transform`, on the frame being transformed.
    """

    def __init__(self):
        # No parameters; OneHotEncoder comes from the notebook's import cell.
        pass

    def fit(self, X, y=None):
        """
        Input X: input dataframe
        Input y: predicted feature, default=None
        Output : class instance

        No action needed by `fit`. Returns the class instance.
        """
        return self

    @staticmethod
    def _make_encoder():
        """Build a dense-output encoder under either spelling of the dense/sparse switch."""
        # Builtin `int` is what the removed `np.int` alias pointed to.
        options = {"handle_unknown": "ignore", "dtype": int}
        try:
            return OneHotEncoder(sparse_output=False, **options)
        except TypeError:
            # Older scikit-learn releases only know the keyword as `sparse`.
            return OneHotEncoder(sparse=False, **options)

    def transform(self, X):
        """
        Input X: input dataframe
        Output : One hot encoded Pandas dataframe created using sklearn OneHotEncoder,
                 indexed like X, or None when encoding raises
                 (the error is printed, not re-raised)
        """

        try:
            columns = X.columns

            ohe = self._make_encoder()
            encoded = ohe.fit_transform(X.values)

            # Prefer the current accessor; fall back to the legacy name on old releases.
            names_fn = getattr(ohe, "get_feature_names_out", None) or ohe.get_feature_names

            # Keep the caller's row index so later column-wise concatenation lines rows up.
            out_df = pd.DataFrame(encoded,
                                  columns=names_fn(input_features=columns),
                                  index=X.index)

            msg = f"\n{'='*100}\nDfOneHotEncoder.transform: Success! Returned a DF of {out_df.shape} shape.\n{out_df.columns}\n{'='*100}\n"
            print(f"\033[92m {msg}\033[00m")

            return out_df
        except Exception as e:
            msg = f"\n{'='*100}\nDfOneHotEncoder.transform: Excpetion occurred. Please check - {e}\n{'='*100}\n"
            print(f"\033[91m {msg}\033[00m")
            return None
        
class DfAggregator(BaseEstimator, TransformerMixin):
    """
    Aggregator class for 'previous_applications' dataset of HCDR project

    Input: pd.DataFrame
    Output: aggregated pd.DataFrame

    Parameters
    key_attrib  : column to group by (default 'SK_ID_CURR')
    count_attrib: column whose group size is reported as 'COUNT_PREV_APP' (default 'SK_ID_PREV')
    key_df      : frame that supplies the key_attrib / count_attrib columns; it is
                  concatenated column-wise with X, so its index must match X's.
                  When None, the notebook-level `df_prev_app` frame is used.

    Aggregation: non-object columns -> mean, object columns -> sum, count_attrib -> size.
    """

    def __init__(self, key_attrib='SK_ID_CURR', count_attrib='SK_ID_PREV', key_df=None):
        self.count_attrib= count_attrib
        self.key_attrib  = key_attrib
        self.key_df      = key_df

    def fit(self, X, y=None):
        """
        Input X: input dataframe
        Input y: predicted feature, default=None
        Output : class instance

        No action needed by `fit`. Returns the class instance.
        """
        return self

    def transform(self, X):
        """
        Input X: input dataframe (feature columns, without the key columns)
        Output : aggregated dataframe with one row per key_attrib value, or None when
                 any step raises (the error is printed, not re-raised)
        """

        try:
            # Fall back to the notebook global only when no key frame was supplied.
            keys_source = self.key_df if self.key_df is not None else df_prev_app

            X = pd.concat([keys_source[[self.key_attrib, self.count_attrib]], X], axis=1, sort=False)

            # The identifier columns are grouped on / counted, never averaged or summed.
            key_attribs = [self.key_attrib, self.count_attrib]
            self.cat_attribs = list(X.select_dtypes(include='object').columns)
            self.cat_attribs = [i for i in self.cat_attribs if i not in key_attribs]

            self.num_attribs = list(X.select_dtypes(exclude='object').columns)
            self.num_attribs = [i for i in self.num_attribs if i not in key_attribs]

            self.dict_variable = {key:['mean'] for key in self.num_attribs}
            self.dict_variable.update({key:['sum'] for key in self.cat_attribs})
            self.dict_variable[self.count_attrib] = ['size']

            out_df = X.groupby(by=self.key_attrib).agg(self.dict_variable)

            # One aggregate per column, so the function-name level of the header is redundant.
            out_df.columns = out_df.columns.droplevel(1)

            out_df.reset_index(inplace=True)

            out_df.columns = ['COUNT_PREV_APP' if x==self.count_attrib else x for x in out_df.columns]

            msg = f"\n{'='*100}\nDfAggregator.transform: Success! Aggregated a DF of {X.shape} shape to {out_df.shape}.\nFeature list: {out_df.columns}\n{'='*100}\n"
            print(f"\033[92m {msg}\033[00m")

            return out_df
        except Exception as e:
            msg = f"\n{'='*100}\nDfAggregator.transform: Excpetion occurred. Please check - {e}\n{'='*100}\n"
            print(f"\033[91m {msg}\033[00m")
            return None
        
class DfFeatureUnion(BaseEstimator, TransformerMixin):
    """
    ### This class isn't working as expected. Please use PandasFeatureUnion instead. ###

    Applies each transformer in turn to the same input frame and glues the
    results together column-wise into a single pandas dataframe.

    Input-
    transformer_list: list of (name, transformer) entries; only the transformer
                      at position 1 of each entry is used
    X: input dataframe
    Output-
    Pandas dataframe
    """
    def __init__(self, transformer_list):
        self.transformer_list = transformer_list

    def fit(self, X, y=None, key_cols=None):
        """
        Nothing is learned here; the instance is returned unchanged.

        Input X: input dataframe
        Input y: predicted feature, default=None
        Output : class instance
        """
        return self

    def transform(self, X, key_cols=None):
        """
        Run fit_transform of every listed transformer on X and concatenate the
        outputs along the column axis. When `key_cols` is given, those columns
        of X form the left-most part of the result. The result is also kept
        on the instance as `df_out`.
        """
        # Seed frame: the requested key columns, or an empty frame.
        self.df_out = X[key_cols] if key_cols else pd.DataFrame()

        for entry in self.transformer_list:
            piece = entry[1].fit_transform(X)
            self.df_out = pd.concat([self.df_out, piece], axis='columns')

        msg = f"\n{'='*100}\nDfFeatureUnion.transform: Success! Returned a DF of {self.df_out.shape} shape.\n{self.df_out.columns}\n{'='*100}\n"
        print(f"\033[92m {msg}\033[00m")

        return self.df_out

    def fit_transform(self, X, key_cols=None):
        """
        Shortcut that skips `fit` and delegates straight to `transform`.
        """
        return self.transform(X, key_cols)
    
    
# Source: https://zablo.net/blog/post/pandas-dataframe-in-scikit-learn-feature-union/
import numpy as np
import pandas as pd
from sklearn.externals.joblib import Parallel, delayed
from sklearn.pipeline import FeatureUnion, _fit_transform_one, _transform_one
from scipy import sparse

class PandasFeatureUnion(FeatureUnion):
    """
    FeatureUnion subclass whose non-sparse outputs are joined with pd.concat
    along the column axis; sparse outputs are stacked with scipy.sparse.hstack.

    It calls the private sklearn.pipeline helpers `_fit_transform_one` and
    `_transform_one` with keyword arguments, so it is tied to the scikit-learn
    release whose helper signatures match these calls.
    """

    def fit_transform(self, X, y=None, **fit_params):
        """Fit every transformer on (X, y), transform X, and join the outputs."""
        self._validate_transformers()

        jobs = (
            delayed(_fit_transform_one)(
                transformer=trans,
                X=X,
                y=y,
                weight=weight,
                **fit_params)
            for name, trans, weight in self._iter()
        )
        fitted = Parallel(n_jobs=self.n_jobs)(jobs)

        if not fitted:
            # Every transformer is None: nothing to join, return zero columns.
            return np.zeros((X.shape[0], 0))

        outputs, fitted_transformers = zip(*fitted)
        self._update_transformer_list(fitted_transformers)

        if any(sparse.issparse(block) for block in outputs):
            return sparse.hstack(outputs).tocsr()
        # Dense branch: expected to be dataframes.
        return self.merge_dataframes_by_column(outputs)

    def merge_dataframes_by_column(self, Xs):
        """Concatenate the given frames side by side."""
        return pd.concat(Xs, axis="columns", copy=False)

    def transform(self, X):
        """Transform X with every transformer and join the outputs."""
        jobs = (
            delayed(_transform_one)(
                transformer=trans,
                X=X,
                y=None,
                weight=weight)
            for name, trans, weight in self._iter()
        )
        outputs = Parallel(n_jobs=self.n_jobs)(jobs)

        if not outputs:
            # Every transformer is None: nothing to join, return zero columns.
            return np.zeros((X.shape[0], 0))

        if any(sparse.issparse(block) for block in outputs):
            return sparse.hstack(outputs).tocsr()
        # Dense branch: expected to be dataframes.
        return self.merge_dataframes_by_column(outputs)

## Pipeline creation

In [93]:
def create_pipelines(df, key_attribs=['SK_ID_CURR', 'SK_ID_PREV'], verbose=False):
    """
    Build the data-preparation union for the previous-application features.

    Input df         : dataframe whose columns are split by dtype into numeric and
                       categorical lists (the lists are only printed when verbose)
    Input key_attribs: identifier columns left out of both lists
    Input verbose    : when True, print the column lists and the assembled pipeline
    Output           : PandasFeatureUnion of
                       - num_pipeline: DfImputer('num') -> DfScaler()
                       - cat_pipeline: DfImputer('cat') -> DfOneHotEncoder()
    """
    num_attribs = [
        col for col in df.select_dtypes(exclude='object').columns
        if col not in key_attribs
    ]
    cat_attribs = [
        col for col in df.select_dtypes(include='object').columns
        if col not in key_attribs
    ]

    if verbose == True:
        print("num: ", num_attribs)
        print("cat: ", cat_attribs)

    numeric_steps = [
        ('df_imputer', DfImputer('num')),
        ('df_std_scaler', DfScaler()),
    ]
    categorical_steps = [
        ('df_imputer', DfImputer('cat')),
        ('df_ohe', DfOneHotEncoder()),
    ]

    num_pipeline = Pipeline(numeric_steps)
    cat_pipeline = Pipeline(categorical_steps)

    # A ColumnTransformer was tried here first and did not work (author's note),
    # hence the dataframe-aware feature union.
    branches = [
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ]
    df_data_prep_pipeline = PandasFeatureUnion(transformer_list=branches)

    if verbose == True:
        print("\033[92mSuccess! Pipeline created:\033[00m")
        print(df_data_prep_pipeline)

    return df_data_prep_pipeline

## Execution of pipeline

In [95]:
%%time

# Features selected by looking at the data
selected_features = ['SK_ID_PREV','SK_ID_CURR','NAME_CONTRACT_TYPE','AMT_ANNUITY','AMT_APPLICATION','AMT_CREDIT',
                     'AMT_GOODS_PRICE','WEEKDAY_APPR_PROCESS_START','HOUR_APPR_PROCESS_START',
                     'NAME_CONTRACT_STATUS','DAYS_DECISION','NAME_PORTFOLIO']

# New DF for selected and engineered features
df_selected = DfFeatureAdder(selected_features).fit_transform(df_prev_app)

# Pipeline creation
df_data_prep_pipeline = create_pipelines(df_selected, verbose=False)

# Execute pipe lines on above DF
df_transformed = df_data_prep_pipeline.fit_transform(df_selected)

# Aggregate data on SK_ID_CURR
df_agg = DfAggregator().fit_transform(df_transformed)

display(df_agg.head())

[92m 
DfFeatureAdder.transform: Success! Transformed DF of (1670214, 37) to (1670214, 13) shape.
Feature list: ['SK_ID_PREV', 'SK_ID_CURR', 'NAME_CONTRACT_TYPE', 'AMT_ANNUITY', 'AMT_APPLICATION', 'AMT_CREDIT', 'NAME_CONTRACT_STATUS', 'DAYS_DECISION', 'NAME_PORTFOLIO', 'CREDIT_TO_APP_RATIO', 'APPLIED_EXTRA', 'WEEKDAY', 'TIME']
[00m
Using mean on ['AMT_ANNUITY', 'AMT_APPLICATION', 'AMT_CREDIT', 'DAYS_DECISION', 'CREDIT_TO_APP_RATIO']
[92m 
DfImputer.transform: Success! Imputed a DF of (1670214, 13) to (1670214, 5) shape.
Feature list: Index(['AMT_ANNUITY', 'AMT_APPLICATION', 'AMT_CREDIT', 'DAYS_DECISION',
       'CREDIT_TO_APP_RATIO'],
      dtype='object')
[00m
[92m 
DfScaler.transform: Success! Scaled a DF of (1670214, 5) to (1670214, 5) shape.
Feature list: Index(['AMT_ANNUITY', 'AMT_APPLICATION', 'AMT_CREDIT', 'DAYS_DECISION',
       'CREDIT_TO_APP_RATIO'],
      dtype='object')
[00m
Using constant on ['NAME_CONTRACT_TYPE', 'NAME_CONTRACT_STATUS', 'NAME_PORTFOLIO', 'APPLIED_EXT

Unnamed: 0,SK_ID_CURR,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,DAYS_DECISION,CREDIT_TO_APP_RATIO,NAME_CONTRACT_TYPE_Cash loans,NAME_CONTRACT_TYPE_Consumer loans,NAME_CONTRACT_TYPE_Revolving loans,NAME_CONTRACT_TYPE_XNA,...,APPLIED_EXTRA_0,APPLIED_EXTRA_1,WEEKDAY_END,WEEKDAY_MID,WEEKDAY_START,TIME_AFTERNOON,TIME_EARLY_MORN,TIME_EVENING,TIME_MORN,COUNT_PREV_APP
0,100001,-0.921182,-0.513691,-0.514129,-1.102966,-0.547329,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1
1,100002,-0.514407,0.013051,-0.025888,0.35256,-0.228399,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1
2,100003,3.11551,0.888732,0.933614,-0.544629,0.20722,0.333333,0.666667,0.0,0.0,...,1.0,0.0,0.666667,0.333333,0.0,0.666667,0.0,0.0,0.333333,3
3,100004,-0.813268,-0.515582,-0.525704,0.084302,-1.527598,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1
4,100005,-0.427509,-0.522321,-0.525796,0.442408,-0.49211,0.5,0.5,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2


Wall time: 23.3 s
