# Create pipeline

In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

X = pd.read_csv('../data/orig_train.csv')
target = X['DEFAULT_FLAG']
X.drop(columns=['DEFAULT_FLAG'], inplace=True)

In [59]:
from sklearn.base import BaseEstimator, TransformerMixin

# Function to create dictionary containing distribution of a categorical column
def get_col_distribution(X, col_name):
    value_counts = X[col_name].value_counts()
    number_of_missing_values = value_counts["Missing"]
    value_counts_dict = value_counts[1:].to_dict()
    
    # change to probabilities
    for key in value_counts_dict:
        value_counts_dict[key] = value_counts_dict[key] / (len(X) - number_of_missing_values)
    
    return value_counts_dict


# Simple column remover to remove low variance columns
class FeatureRemover(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.drop(self.columns_to_drop,axis=1)

# Update name of the column
class FeatureNameUpdater(BaseEstimator, TransformerMixin):
    def __init__(self, old_name, new_name):
        self.old_name = old_name
        self.new_name = new_name
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X.rename(columns = {self.old_name: self.new_name}, inplace = True)
        
        return X
    
# Imputes missing values with mode and according to distribution of columns
class MyImputer(BaseEstimator, TransformerMixin):
    def __init__(self, cols_mode_imputation, cols_dist_imputation):
        self.modes = {}
        self.distributions_of_columns = {}
        self.cols_mode_imputation = cols_mode_imputation
        self.cols_dist_imputation = cols_dist_imputation
    
    def fit(self, X, y=None):
        for col_name in self.cols_mode_imputation:
            self.modes[col_name] = X[col_name].mode()[0]
        
        for col_name in self.cols_dist_imputation:
            self.distributions_of_columns[col_name] = get_col_distribution(X, col_name)
        
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        
        for col_name in self.cols_mode_imputation:
            X_copy[col_name] = X[col_name].replace('Missing', self.modes[col_name])
        
        for col_name in self.cols_dist_imputation:
            column_distribution = self.distributions_of_columns[col_name]
            
            X_copy[col_name] = X[col_name].replace('Missing',
                                              np.random.choice(list(column_distribution.keys()), 
                                                        p = list(column_distribution.values())))
        
        return X_copy

# Encoder for categorical variables handling both ordered and unordered ones
class MyEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cat_features_wo_order, cat_feature_with_order, categories_order):
        self.cat_features_wo_order = cat_features_wo_order
        self.cat_feature_with_order = cat_feature_with_order
        self.categories_order = categories_order
        self.ordinal_encoder = OrdinalEncoder(categories=[categories_order])

    def fit(self, X, y=None):
        self.ordinal_encoder.fit(X[[self.cat_feature_with_order]])

        return self

    def transform(self, X, y=None):
        X_copy = pd.get_dummies(X, columns=self.cat_features_wo_order)

        X_copy[self.cat_feature_with_order] = self.ordinal_encoder.transform(X[[self.cat_feature_with_order]])

        return X_copy
        
# to be done
class OutlierRemover(BaseEstimator, TransformerMixin):
    # Remover will remove outliers from columns based on provided dictionary
    # If to_remove_dict[col_name] is true outliers will be removed (and exchanged with a proper quantile)
    def __init__(self, to_remove_dict):
        self.to_remove_dict = to_remove_dict
    
    def fit(self, X, y=None):
        pass
    

In [60]:
from sklearn.pipeline import Pipeline

pipeline_processing = Pipeline([
    # first we remove low variance columns (chosen during EDA) and FINALIZED_LOAN as variable from future
    ('feature_remover', FeatureRemover(['SAVING_ACCOUNT', 'FOREIGN_ACCOUNT', 'DEPOSIT', 'PENSION_FUNDS', 'FINALIZED_LOAN'])), # first 4 columns are removed because of low variance, FINALIZED_LOAN removed as a feature from future
   
    # then we update the name of column BUSINESS AGE to keep it consisten with the rest of columns
    ('name_updater', FeatureNameUpdater('BUSINESS AGE', 'BUSINESS_AGE')),
    
    # we impute missing values with mode or according to distribution (in case when missing value is mode)
    ('imputer', MyImputer(['AREA', 'EDUCATION'], ['ECONOMIC_SECTOR', 'EMPLOYEE_NO'])),
    
    # encoding columns (those in a list) with one-hot encoding as they have no order
    # EMPLOYEE_NO is encoded using OrdinalEncoder with specified order of values
    ('encoder', MyEncoder(["PRODUCT", "AREA", "RESIDENTIAL_PLACE", "EDUCATION", "MARITAL_STATUS",
                           "ECONOMIC_SECTOR"], 'EMPLOYEE_NO', ["between 0-10", "between 11-20",
                                                               "between 21-50", "between 51-100",
                                                               "between 101-250", "between 251-500",
                                                               "between 501-1.000", "> 1.000"]))
])

X_transformed = pipeline_processing.fit_transform(X)

From our EDA we remember that some categorical columns had values that appeared very rarely, hence some of the columns after encoding might have near zero variance and should be removed

In [61]:
encoded_columns = X_transformed.select_dtypes(include=['bool'])

# dictionary for saving number of true values in particular columns
dict_true_values = {}

for col_name in encoded_columns.columns:
    # calculate number of true values and add to dict
    number_of_true_values = encoded_columns[col_name].sum()
    dict_true_values[col_name] = number_of_true_values

# sort keys by values
keys_sorted = sorted(dict_true_values, key=dict_true_values.get)

for key in keys_sorted:
    print(key, ':', dict_true_values[key], 'out of', len(X_transformed))

PRODUCT_D : 13 out of 15097
RESIDENTIAL_PLACE_Rental : 28 out of 15097
EDUCATION_Primary school : 28 out of 15097
PRODUCT_A : 42 out of 15097
ECONOMIC_SECTOR_Real estate activities : 79 out of 15097
ECONOMIC_SECTOR_Electricity and gas : 152 out of 15097
ECONOMIC_SECTOR_Accommodation and food service activities : 159 out of 15097
EDUCATION_Middle school : 195 out of 15097
ECONOMIC_SECTOR_Agriculture, hunting and forestry : 196 out of 15097
ECONOMIC_SECTOR_Mining and quarrying : 205 out of 15097
RESIDENTIAL_PLACE_Other : 246 out of 15097
ECONOMIC_SECTOR_Financial and insurance activities : 274 out of 15097
ECONOMIC_SECTOR_Water supply : 333 out of 15097
ECONOMIC_SECTOR_Construction : 358 out of 15097
ECONOMIC_SECTOR_Professional, scientific and technical activities : 393 out of 15097
ECONOMIC_SECTOR_Education : 460 out of 15097
ECONOMIC_SECTOR_Information and communication : 517 out of 15097
EDUCATION_College : 562 out of 15097
ECONOMIC_SECTOR_Human health and social work activities : 66

We can safely delete PRODUCT_D, RESIDENTIAL_PLACE_Rental, EDUCATION_Primary School and PRODUCT_A
(not sure were to set the threshold for removal)

In [64]:
columns_to_drop = ['PRODUCT_D', 'RESIDENTIAL_PLACE_Rental', 'EDUCATION_Primary school', 'PRODUCT_A']
X_transformed = X_transformed.drop(columns_to_drop, axis=1)

# although we already removed columns we will add this step to pipeline
# so that whole data processing is saved in it
pipeline_processing.steps.append(['encoded_feature_remover', FeatureRemover(columns_to_drop=columns_to_drop)])

In [65]:
X_transformed.head().T

Unnamed: 0,0,1,2,3,4
AGE,65,64,30,39,38
HOUSEHOLD_MEMBERS,2,2,2,1,1
NO_OF_DEPENDENTS,0,0,0,0,0
INCOME,1245.0,1380.0,1131.0,1730.0,1189.0
WORK_SENIORITY,5,5,2,9,5
BUSINESS_AGE,16,16,6,13,16
EMPLOYEE_NO,0.0,0.0,7.0,1.0,0.0
LENGTH_RELATIONSHIP_WITH_CLIENT,1,8,1,2,1
DEBIT_CARD,0,0,1,0,1
CURRENT_ACCOUNT,0,0,1,0,1
