# Create pipeline for processing data

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

X_train = pd.read_csv('../data/orig_train.csv')
target = X_train['DEFAULT_FLAG']
X_train.drop(columns=['DEFAULT_FLAG'], inplace=True)

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Function to create dictionary containing distribution of a categorical column
def get_col_distribution(X, col_name):
    value_counts = X[col_name].value_counts()
    number_of_missing_values = value_counts["Missing"]
    value_counts_dict = value_counts[1:].to_dict()
    
    # change to probabilities
    for key in value_counts_dict:
        value_counts_dict[key] = value_counts_dict[key] / (len(X) - number_of_missing_values)
    
    return value_counts_dict


# Simple column remover to remove low variance columns
class FeatureRemover(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.drop(self.columns_to_drop,axis=1)

# Update name of the column
class FeatureNameUpdater(BaseEstimator, TransformerMixin):
    def __init__(self, old_name, new_name):
        self.old_name = old_name
        self.new_name = new_name
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X.rename(columns = {self.old_name: self.new_name}, inplace = True)
        
        return X
    
# Imputes missing values with mode and according to distribution of columns
class MyImputer(BaseEstimator, TransformerMixin):
    def __init__(self, cols_mode_imputation, cols_dist_imputation):
        self.modes = {}
        self.distributions_of_columns = {}
        self.cols_mode_imputation = cols_mode_imputation
        self.cols_dist_imputation = cols_dist_imputation
    
    def fit(self, X, y=None):
        for col_name in self.cols_mode_imputation:
            self.modes[col_name] = X[col_name].mode()[0]
        
        for col_name in self.cols_dist_imputation:
            self.distributions_of_columns[col_name] = get_col_distribution(X, col_name)
        
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        
        for col_name in self.cols_mode_imputation:
            X_copy[col_name] = X[col_name].replace('Missing', self.modes[col_name])
        
        for col_name in self.cols_dist_imputation:
            column_distribution = self.distributions_of_columns[col_name]
            
            X_copy[col_name] = X[col_name].replace('Missing',
                                              np.random.choice(list(column_distribution.keys()), 
                                                        p = list(column_distribution.values())))
        
        return X_copy

# Encoder for categorical variables handling both ordered and unordered ones
class MyEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cat_features_wo_order, cat_feature_with_order, categories_order):
        self.cat_features_wo_order = cat_features_wo_order
        self.cat_feature_with_order = cat_feature_with_order
        self.categories_order = categories_order
        self.ordinal_encoder = OrdinalEncoder(categories=[categories_order])

    def fit(self, X, y=None):
        self.ordinal_encoder.fit(X[[self.cat_feature_with_order]])

        return self

    def transform(self, X, y=None):
        X_copy = pd.get_dummies(X, columns=self.cat_features_wo_order)

        X_copy[self.cat_feature_with_order] = self.ordinal_encoder.transform(X[[self.cat_feature_with_order]])

        return X_copy

# scaling the data, based on choice it will either standarize(default) or normalize,
class MyScaler(BaseEstimator, TransformerMixin):
    def __init__(self, standarize=True):
        self.standarize = standarize
        
        if self.standarize is True:
            self.scaler = StandardScaler()
        else:
            self.scaler = MinMaxScaler()

    def fit(self, X, y=None):
        # we want to scale only numerical columns
        X_num_cols = X.select_dtypes(include=['float64', 'int64'])
        
        self.scaler.fit(X_num_cols)
        
        return self

    def transform(self, X, y=None):
        X_num_cols = X.select_dtypes(include=['float64', 'int64'])
        
        # transform numerical columns
        X_num_cols_transformed = self.scaler.transform(X_num_cols)
        
        # change to df to access columns
        X_num_cols_transformed_df = pd.DataFrame(X_num_cols_transformed, columns=X_num_cols.columns, index=X_num_cols.index)

        # change them in X
        X_copy = X.copy()
        for col_name in X_num_cols_transformed_df.columns:
            X_copy[col_name] = X_num_cols_transformed_df[col_name]

        return X_copy
    

class OutlierReplacer(BaseEstimator, TransformerMixin):
    # It will replace outliers from columns based on provided dictionary(for now list of columns)
    # If to_remove_dict[col_name] is true outliers will be removed (and exchanged with a proper quantile)
    # It is possible to adjust parameter k, by default its set to 1.5 as usually it is done
    def __init__(self, columns=None, k=1.5):
        self.columns = columns
        self.k = k
        self.lower_bounds = {}
        self.upper_bounds = {}
        
    def fit(self, X, y=None):
        # Compute lower and upper bounds for each specified column
        if self.columns is None:
            self.columns = X.columns
        for col in self.columns:
            q1 = X[col].quantile(0.25)
            q3 = X[col].quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - self.k * iqr
            upper_bound = q3 + self.k * iqr
            self.lower_bounds[col] = lower_bound
            self.upper_bounds[col] = upper_bound
        return self
    
    def transform(self, X, y=None):
        # Replace outliers in specified columns with calculated bounds
        X_copy = X.copy()
        for col in self.columns:
            lower_bound = self.lower_bounds[col]
            upper_bound = self.upper_bounds[col]
            X_copy[col] = X_copy[col].clip(lower_bound, upper_bound)
        return X_copy
    

In [4]:
from sklearn.pipeline import Pipeline

pipeline_processing = Pipeline([
    # first we remove low variance columns (chosen during EDA) and FINALIZED_LOAN as variable from future
    ('feature_remover', FeatureRemover(['SAVING_ACCOUNT', 'FOREIGN_ACCOUNT', 'DEPOSIT', 'PENSION_FUNDS', 'FINALIZED_LOAN'])), # first 4 columns are removed because of low variance, FINALIZED_LOAN removed as a feature from future
   
    # then we update the name of column BUSINESS AGE to keep it consisten with the rest of columns
    ('name_updater', FeatureNameUpdater('BUSINESS AGE', 'BUSINESS_AGE')),
    
    # we impute missing values with mode or according to distribution (in case when missing value is mode)
    ('imputer', MyImputer(['AREA', 'EDUCATION'], ['ECONOMIC_SECTOR', 'EMPLOYEE_NO'])),
    
    # encoding columns (those in a list) with one-hot encoding as they have no order
    # EMPLOYEE_NO is encoded using OrdinalEncoder with specified order of values
    ('encoder', MyEncoder(["PRODUCT", "AREA", "RESIDENTIAL_PLACE", "EDUCATION", "MARITAL_STATUS",
                           "ECONOMIC_SECTOR"], 'EMPLOYEE_NO', ["between 0-10", "between 11-20",
                                                               "between 21-50", "between 51-100",
                                                               "between 101-250", "between 251-500",
                                                               "between 501-1.000", "> 1.000"])),
    
    # Replace outliers in chosen columns, you can also provide parameter k for the iqr scaling (1.5 default)
    ('outlier_replacer', OutlierReplacer(['LENGTH_RELATIONSHIP_WITH_CLIENT', 'WORK_SENIORITY', 'BUSINESS_AGE', 'INCOME'])),
    
    # scaling numerical data, you can choose whether to standarize(True) or minmax(False) (TODO: fix scaling 0-1 type features)
    ('scaler', MyScaler(standarize=False))
])

X_train_transformed = pipeline_processing.fit_transform(X_train)

From our EDA we remember that some categorical columns had values that appeared very rarely, hence some of the columns after encoding might have near zero variance and should be removed

In [5]:
encoded_columns = X_train_transformed.select_dtypes(include=['bool'])

# dictionary for saving number of true values in particular columns
dict_true_values = {}

for col_name in encoded_columns.columns:
    # calculate number of true values and add to dict
    number_of_true_values = encoded_columns[col_name].sum()
    dict_true_values[col_name] = number_of_true_values

# sort keys by values
keys_sorted = sorted(dict_true_values, key=dict_true_values.get)

for key in keys_sorted:
    print(key, ':', dict_true_values[key], 'out of', len(X_train_transformed))

PRODUCT_D : 13 out of 15097
RESIDENTIAL_PLACE_Rental : 28 out of 15097
EDUCATION_Primary school : 28 out of 15097
PRODUCT_A : 42 out of 15097
ECONOMIC_SECTOR_Real estate activities : 79 out of 15097
ECONOMIC_SECTOR_Electricity and gas : 152 out of 15097
ECONOMIC_SECTOR_Accommodation and food service activities : 159 out of 15097
EDUCATION_Middle school : 195 out of 15097
ECONOMIC_SECTOR_Agriculture, hunting and forestry : 196 out of 15097
ECONOMIC_SECTOR_Mining and quarrying : 205 out of 15097
RESIDENTIAL_PLACE_Other : 246 out of 15097
ECONOMIC_SECTOR_Financial and insurance activities : 274 out of 15097
ECONOMIC_SECTOR_Water supply : 333 out of 15097
ECONOMIC_SECTOR_Professional, scientific and technical activities : 393 out of 15097
ECONOMIC_SECTOR_Education : 460 out of 15097
ECONOMIC_SECTOR_Information and communication : 517 out of 15097
EDUCATION_College : 562 out of 15097
ECONOMIC_SECTOR_Human health and social work activities : 668 out of 15097
ECONOMIC_SECTOR_Transportation an

We can safely delete PRODUCT_D, RESIDENTIAL_PLACE_Rental, EDUCATION_Primary School and PRODUCT_A
(not sure were to set the threshold for removal)

In [6]:
columns_to_drop = ['PRODUCT_D', 'RESIDENTIAL_PLACE_Rental', 'EDUCATION_Primary school', 'PRODUCT_A']
X_train_transformed = X_train_transformed.drop(columns_to_drop, axis=1)

# although we already removed columns we will add this step to pipeline
# so that whole data processing is saved in it
pipeline_processing.steps.append(['encoded_feature_remover', FeatureRemover(columns_to_drop=columns_to_drop)])

# High correlated features

From EDA heat map we know that there were some highly correlated features, let's display them now

In [7]:
def find_correlated_pairs(X, k):
    # Calculate Pearson correlation coefficients
    pearson_corr = X.corr(method='pearson')

    # Calculate Spearman correlation coefficients
    spearman_corr = X.corr(method='spearman')

    # Initialize lists to store correlated pairs
    pearson_correlated_pairs = []
    spearman_correlated_pairs = []
    
    # Initialize set for storing column names
    cols = set()

    # Loop through each pair of features
    for i in range(len(X.columns)):
        for j in range(len(X.columns)):
            if i > j:
                # Check Pearson correlation coefficient
                if abs(pearson_corr.iloc[i, j]) > k:
                    col1, col2 = pearson_corr.columns[i], pearson_corr.columns[j]
                    pearson_correlated_pairs.append((col1, col2, round(pearson_corr.loc[col1, col2], 2)))
                    cols.add(col1)
                    cols.add(col2)
                # Check Spearman correlation coefficient
                if abs(spearman_corr.iloc[i, j]) > k:
                    col1, col2 = spearman_corr.columns[i], pearson_corr.columns[j]
                    spearman_correlated_pairs.append((col1, col2, round(spearman_corr.loc[col1, col2], 2)))
                    cols.add(col1)
                    cols.add(col2)

    return pearson_correlated_pairs, spearman_correlated_pairs, cols


pearson_pairs, spearman_pairs, cols = find_correlated_pairs(X_train_transformed, 0.7)
print("Pearson Correlated Pairs:")
for pair in pearson_pairs:
    print(pair)
print("\nSpearman Correlated Pairs:")
for pair in spearman_pairs:
    print(pair)

Pearson Correlated Pairs:
('NO_OF_DEPENDENTS', 'HOUSEHOLD_MEMBERS', 0.73)
('CURRENT_ACCOUNT', 'DEBIT_CARD', 0.81)
('RESIDENTIAL_PLACE_Owner without mortgage', 'RESIDENTIAL_PLACE_Living with family', -0.85)
('MARITAL_STATUS_married', 'HOUSEHOLD_MEMBERS', 0.78)
('MARITAL_STATUS_single', 'MARITAL_STATUS_married', -0.77)

Spearman Correlated Pairs:
('CURRENT_ACCOUNT', 'DEBIT_CARD', 0.81)
('RESIDENTIAL_PLACE_Owner without mortgage', 'RESIDENTIAL_PLACE_Living with family', -0.85)
('MARITAL_STATUS_married', 'HOUSEHOLD_MEMBERS', 0.88)
('MARITAL_STATUS_single', 'HOUSEHOLD_MEMBERS', -0.71)
('MARITAL_STATUS_single', 'MARITAL_STATUS_married', -0.77)


## Lets check correlation of these features with target

In [14]:
def get_sorted_correlations(X, cols, target):
    # Calculate correlations with target for each column in cols
    correlations = {}
    for col in cols:
        correlation = target.corr(X[col])
        correlations[col] = round(correlation, 2)

    # Sort correlations in descending order
    sorted_correlations = sorted(correlations.items(), key=lambda x: abs(x[1]), reverse=True)

    # Print sorted correlations
    print("Correlations with target (sorted in descending order):")
    for col, correlation in sorted_correlations:
        print(f"{col}: {correlation}")
    
    return sorted_correlations

sorted_correlations = get_sorted_correlations(X_train_transformed, cols, target)

Correlations with target (sorted in descending order):
MARITAL_STATUS_single: 0.16
MARITAL_STATUS_married: -0.14
RESIDENTIAL_PLACE_Living with family: 0.12
HOUSEHOLD_MEMBERS: -0.1
RESIDENTIAL_PLACE_Owner without mortgage: -0.09
CURRENT_ACCOUNT: -0.03
DEBIT_CARD: -0.02
NO_OF_DEPENDENTS: 0.01


## Also lets see feature importance in RandomForestClassifier

In [10]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train_transformed, target)

# Get feature importances from the classifier
feature_importances = rf_classifier.feature_importances_

# Create a DataFrame and sort by importance
importance_df = pd.DataFrame({'Feature': X_train_transformed.columns, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print(importance_df)

                                              Feature  Importance
3                                              INCOME    0.155290
0                                                 AGE    0.130838
5                                        BUSINESS_AGE    0.087166
4                                      WORK_SENIORITY    0.067820
7                     LENGTH_RELATIONSHIP_WITH_CLIENT    0.056225
6                                         EMPLOYEE_NO    0.054252
9                                     CURRENT_ACCOUNT    0.019432
2                                    NO_OF_DEPENDENTS    0.018308
15                                AREA_County capital    0.018191
16                                    AREA_Rural area    0.018139
14                                          PRODUCT_F    0.017673
1                                   HOUSEHOLD_MEMBERS    0.017530
28                               EDUCATION_University    0.017318
23                               EDUCATION_Highschool    0.017079
17        

## Summarize these outcomes

This summarization below contains no new data, it was just created for pracitcal reason to avoid scrolling through cells.

In [25]:
def create_table(X, triple, feature_importances):
    # Extract elements from the triple
    col1, col2, correlation = triple

    # Get correlation with target for each column
    corr_with_target_col1 = round(target.corr(X[col1]), 2)
    corr_with_target_col2 = round(target.corr(X[col2]), 2)

    # Get feature importances for each column
    importance_col1 = feature_importances.loc[feature_importances['Feature'] == col1, 'Importance'].values[0]
    importance_col2 = feature_importances.loc[feature_importances['Feature'] == col2, 'Importance'].values[0]

    # Construct the table as a DataFrame
    table = pd.DataFrame({
        '': ['Correlation with target', 'Feature importance'],
        col1: [corr_with_target_col1, importance_col1],
        col2: [corr_with_target_col2, importance_col2]
    })

    return table



for pair in pearson_pairs:
    print(pair)
    table = create_table(X_train_transformed, pair, importance_df)
    print(table, '\n')

('NO_OF_DEPENDENTS', 'HOUSEHOLD_MEMBERS', 0.73)
                            NO_OF_DEPENDENTS  HOUSEHOLD_MEMBERS
0  Correlation with target          0.010000           -0.10000
1       Feature importance          0.018308            0.01753 

('CURRENT_ACCOUNT', 'DEBIT_CARD', 0.81)
                            CURRENT_ACCOUNT  DEBIT_CARD
0  Correlation with target        -0.030000   -0.020000
1       Feature importance         0.019432    0.015464 

('RESIDENTIAL_PLACE_Owner without mortgage', 'RESIDENTIAL_PLACE_Living with family', -0.85)
                            RESIDENTIAL_PLACE_Owner without mortgage  \
0  Correlation with target                                 -0.090000   
1       Feature importance                                  0.011363   

   RESIDENTIAL_PLACE_Living with family  
0                              0.120000  
1                              0.016366   

('MARITAL_STATUS_married', 'HOUSEHOLD_MEMBERS', 0.78)
                            MARITAL_STATUS_married  HOUSE

In [26]:
### TODO add comments and decide what feature should be removed