# Create pipeline

In [30]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

# Location of the original training set, relative to the notebook directory.
TRAIN_CSV_PATH = '../data/orig_train.csv'

# Raw training data; every pipeline step below is fitted on this frame.
X = pd.read_csv(TRAIN_CSV_PATH)

In [40]:
from sklearn.base import BaseEstimator, TransformerMixin

def get_col_distribution(X, col_name):
    """Return the empirical distribution of a column's non-missing values.

    The literal string "Missing" is treated as the missing-value placeholder
    and is excluded from the distribution.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing the column.
    col_name : str
        Name of the column to summarise.

    Returns
    -------
    dict
        Maps each observed (non-"Missing") value to its relative frequency.
        The frequencies sum to 1. Empty if no non-missing value is present.
    """
    value_counts = X[col_name].value_counts()

    # Remove the placeholder by label rather than by position: it is not
    # guaranteed to be the most frequent value (or to be present at all).
    observed_counts = value_counts.drop("Missing", errors="ignore")

    # Normalise by the number of observed values so the probabilities sum to
    # one even when the column also contains NaN (which value_counts skips).
    total = observed_counts.sum()
    if total == 0:
        return {}

    return {
        category: count / total
        for category, count in observed_counts.to_dict().items()
    }


# Drops a fixed set of columns (low-variance features picked a priori).
class FeatureRemover(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes the configured columns.

    Parameters
    ----------
    columns_to_drop : list of str
        Column labels to remove from every frame passed to ``transform``.
    """

    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        """Nothing is learned; present only to satisfy the transformer API."""
        return self

    def transform(self, X):
        """Return a new frame without the configured columns."""
        return X.drop(columns=self.columns_to_drop)


class FeatureNameUpdater(BaseEstimator, TransformerMixin):
    """Stateless transformer that renames a single column.

    Parameters
    ----------
    old_name : str
        Current column label.
    new_name : str
        Label the column should carry after ``transform``.
    """

    def __init__(self, old_name, new_name):
        self.old_name = old_name
        self.new_name = new_name

    def fit(self, X, y=None):
        """Nothing is learned; present only to satisfy the transformer API."""
        return self

    def transform(self, X):
        """Return a renamed copy of ``X``.

        The input frame is left untouched: a transformer should not modify
        the caller's data as a side effect. If ``old_name`` is absent the
        frame is returned with its columns unchanged.
        """
        return X.rename(columns={self.old_name: self.new_name})
    
# Imputes 'Missing' placeholders with the column mode or by sampling from the
# column's observed distribution
class MyImputer(BaseEstimator, TransformerMixin):
    """Replace the ``'Missing'`` placeholder in selected columns.

    Parameters
    ----------
    cols_mode_imputation : list of str
        Columns whose placeholders are replaced by the most frequent
        non-missing value seen during ``fit``.
    cols_dist_imputation : list of str
        Columns whose placeholders are replaced by values drawn at random
        from the distribution of non-missing values seen during ``fit``.

    Attributes
    ----------
    modes : dict
        Column name -> value used for mode imputation (filled by ``fit``).
    distributions_of_columns : dict
        Column name -> {value: probability} (filled by ``fit``).
    """

    def __init__(self, cols_mode_imputation, cols_dist_imputation):
        self.modes = {}
        self.distributions_of_columns = {}
        self.cols_mode_imputation = cols_mode_imputation
        self.cols_dist_imputation = cols_dist_imputation

    def fit(self, X, y=None):
        """Learn the modes and value distributions from ``X``."""
        # Start from a clean state so a refit never keeps stale entries.
        self.modes = {}
        self.distributions_of_columns = {}

        for col_name in self.cols_mode_imputation:
            column = X[col_name]
            # The placeholder itself must not win the vote, otherwise the
            # imputation would replace 'Missing' with 'Missing'.
            candidates = column[column != 'Missing'].mode()
            if candidates.empty:
                # Nothing but placeholders: fall back to the raw column mode.
                candidates = column.mode()
            self.modes[col_name] = candidates[0]

        for col_name in self.cols_dist_imputation:
            self.distributions_of_columns[col_name] = get_col_distribution(X, col_name)

        return self

    def transform(self, X):
        """Return a copy of ``X`` with the placeholders imputed.

        Sampling uses NumPy's global random state, so ``np.random.seed``
        makes the result reproducible.
        """
        X_copy = X.copy()

        for col_name in self.cols_mode_imputation:
            X_copy[col_name] = X[col_name].replace('Missing', self.modes[col_name])

        for col_name in self.cols_dist_imputation:
            column_distribution = self.distributions_of_columns[col_name]
            missing_mask = X_copy[col_name] == 'Missing'
            n_missing = int(missing_mask.sum())
            if n_missing == 0:
                continue

            # Draw one value per missing cell. A single draw shared by all
            # cells would collapse every placeholder onto the same category
            # and distort the column's distribution.
            X_copy.loc[missing_mask, col_name] = np.random.choice(
                list(column_distribution.keys()),
                size=n_missing,
                p=list(column_distribution.values()),
            )

        return X_copy
    

class MyEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode nominal columns and ordinal-encode one ordered column.

    Parameters
    ----------
    cat_features_wo_order : list of str
        Nominal columns expanded with ``pandas.get_dummies``.
    cat_feature_with_order : str
        Single column encoded with ``OrdinalEncoder``.
    categories_order : list
        Categories of ``cat_feature_with_order`` from lowest to highest.

    Attributes
    ----------
    ordinal_encoder : OrdinalEncoder
        Encoder for the ordered column (refitted by ``fit``).
    output_columns_ : pandas.Index
        Column layout produced on the data passed to ``fit``.
    """

    def __init__(self, cat_features_wo_order, cat_feature_with_order, categories_order):
        self.cat_features_wo_order = cat_features_wo_order
        self.cat_feature_with_order = cat_feature_with_order
        # Every constructor argument is kept under its own name; otherwise
        # BaseEstimator.get_params (used by clone and repr) raises.
        self.categories_order = categories_order
        self.ordinal_encoder = OrdinalEncoder(categories=[categories_order])

    def fit(self, X, y=None):
        """Fit the ordinal encoder and record the output column layout."""
        # Rebuild from the current parameter so set_params changes apply.
        self.ordinal_encoder = OrdinalEncoder(categories=[self.categories_order])
        self.ordinal_encoder.fit(X[[self.cat_feature_with_order]])

        # Remember the dummy columns seen here so later frames can be
        # aligned to the same layout.
        self.output_columns_ = pd.get_dummies(
            X, columns=self.cat_features_wo_order
        ).columns

        return self

    def transform(self, X, y=None):
        """Return an encoded copy of ``X`` with the columns seen in ``fit``.

        Dummy columns for categories absent from ``X`` are added and filled
        with 0; columns that were not produced during ``fit`` are dropped.
        """
        # get_dummies already returns a new frame, so no explicit copy.
        encoded = pd.get_dummies(X, columns=self.cat_features_wo_order)

        encoded[self.cat_feature_with_order] = self.ordinal_encoder.transform(
            X[[self.cat_feature_with_order]]
        )

        return encoded.reindex(columns=self.output_columns_, fill_value=0)
        

In [41]:
from sklearn.pipeline import Pipeline

# Columns removed before any other processing.
dropped_columns = ['SAVING_ACCOUNT', 'FOREIGN_ACCOUNT', 'DEPOSIT', 'PENSION_FUNDS']

# Columns imputed with the mode vs. by sampling from their distribution.
mode_imputed_columns = ['AREA', 'EDUCATION']
distribution_imputed_columns = ['ECONOMIC_SECTOR', 'EMPLOYEE_NO']

# Nominal columns that get one-hot encoded.
nominal_columns = [
    "PRODUCT",
    "AREA",
    "RESIDENTIAL_PLACE",
    "EDUCATION",
    "MARITAL_STATUS",
    "ECONOMIC_SECTOR",
]

# Employee-count buckets, listed from smallest to largest.
employee_no_order = [
    "between 0-10",
    "between 11-20",
    "between 21-50",
    "between 51-100",
    "between 101-250",
    "between 251-500",
    "between 501-1.000",
    "> 1.000",
]

pipeline = Pipeline(steps=[
    ('feature_remover', FeatureRemover(columns_to_drop=dropped_columns)),
    ('name_updater', FeatureNameUpdater(old_name='BUSINESS AGE', new_name='BUSINESS_AGE')),
    ('imputer', MyImputer(cols_mode_imputation=mode_imputed_columns,
                          cols_dist_imputation=distribution_imputed_columns)),
    ('encoder', MyEncoder(cat_features_wo_order=nominal_columns,
                          cat_feature_with_order='EMPLOYEE_NO',
                          categories_order=employee_no_order)),
])

# Kept as the last expression so the notebook displays the transformed frame.
pipeline.fit_transform(X)

Unnamed: 0,AGE,HOUSEHOLD_MEMBERS,NO_OF_DEPENDENTS,INCOME,WORK_SENIORITY,BUSINESS_AGE,EMPLOYEE_NO,LENGTH_RELATIONSHIP_WITH_CLIENT,DEBIT_CARD,CURRENT_ACCOUNT,...,ECONOMIC_SECTOR_Information and communication,ECONOMIC_SECTOR_Manufacturing,ECONOMIC_SECTOR_Mining and quarrying,ECONOMIC_SECTOR_Other,"ECONOMIC_SECTOR_Professional, scientific and technical activities",ECONOMIC_SECTOR_Public administration and defence,ECONOMIC_SECTOR_Real estate activities,ECONOMIC_SECTOR_Transportation and storage,ECONOMIC_SECTOR_Water supply,ECONOMIC_SECTOR_Wholesale and retail trade
0,65,2,0,1245.0,5,16,2.0,1,0,0,...,False,False,False,False,False,False,False,False,False,True
1,64,2,0,1380.0,5,16,2.0,8,0,0,...,False,False,False,False,False,False,False,False,False,True
2,30,2,0,1131.0,2,6,7.0,1,1,1,...,False,False,False,True,False,False,False,False,False,False
3,39,1,0,1730.0,9,13,1.0,2,0,0,...,False,False,False,False,False,False,False,False,False,False
4,38,1,0,1189.0,5,16,2.0,1,1,1,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15092,31,2,0,1886.0,8,17,7.0,1,1,1,...,False,False,False,False,False,False,False,True,False,False
15093,54,1,0,670.0,36,45,4.0,9,0,0,...,False,False,False,False,False,False,False,False,False,True
15094,45,1,0,4794.0,11,21,7.0,11,0,0,...,False,False,False,False,False,True,False,False,False,False
15095,57,2,0,3733.6,11,21,5.0,1,1,1,...,False,False,False,False,False,False,False,False,False,False
