In [1]:
# -----------------------------------------------------------
# Dissertation Project: An Empirical Study on the Classification 
# Performance of Deep Learning vs. Gradient Boosting 
# on heterogeneous tabular data
#
# This notebook provides functions for the preprocessing of the 
# lending Club dataset. 
#
# Author: Adam Mabrouk
# Supervisor: Ben Ralph
# Institution: University of Bath
# Created on: 01/01/2024
# Version: 1.0 

# Acknowledgments:

# The cleaning pipeline is based on the literature (not code)
# of Authors: Shi, S., Tse, R., Luo, W., D’Addona, 
# S. and Pau, G., 2022. Machine learning-driven credit risk: 
# a systemic review. Neural Computing and Applications, 34(17), 
# pp.14327-14339.

# Malekipirbazari, M. and Aksakalli, V., 2015. 
# Risk assessment in social lending via random forests. 
# Expert Systems with Applications, 42(10), pp.4621-4631.

# Hatchondo, J.C., Martinez, L. and Sánchez, J.M., 2015. Mortgage defaults. 
# Journal of Monetary Economics, 76, pp.173-190.
# # -----------------------------------------------------------
# Libraries and versions
# ----------------------
# Python version: 3.11.5 
# numpy: 1.24.3
# pandas: 2.0.3

# Import standard libraries for data handling,   
import pandas as pd
import numpy as np

In [2]:
# Numerical embeddings for a later ecperiment

class FeatureEngineer:
    """In the feature engineering class, specific features are processed individually,
    adapting methods in the aformentioned papers above that are specific to the financial 
    domain"""
    def __init__(self, lending_club_file_path):
        """
        Args:
        lending_club_file_path (str).
        """
        self.lending_club_file_path = lending_club_file_path
        self.data = pd.read_csv(self.lending_club_file_path)

    def employment_length(self):
        """ This function removes strings, and rows with NaN values, 
        maps employment length from 0 to 10 (years)."""

        self.data['emp_length'] = self.data['emp_length'].astype(str)
        self.data['emp_length'] = self.data['emp_length'].replace({'< 1 year': '0', '10+ years': '10', ' years': ''}, regex=True)
        self.data['emp_length'] = pd.to_numeric(self.data['emp_length'], errors='coerce')
        self.data['emp_length'] = self.data['emp_length'].apply(lambda x: x if x <= 10 else 10)
        self.data.dropna(subset=['emp_length'], inplace=True)

    def home_ownership(self, apply_ordinal=False):
        """ This function maps the feature home_ownership to contain only 3 values: 'Rent', 'Own', 'Mortgage'.
        These values are then 'optionally' converted to numerical value ("RENT": 0, "MORTGAGE": 1, "OWN": 2) to represent an
        ordinal relationship based on the apply_ordinal flag. The choice is optional for the user depending on which type of 
        encoding technicque they wish to adopt, and for deep learning models, whether they want to apply embeddings after.
        In this example, ordinal is set to False as embeddings will be applied to this feature and ordinal is not determined, 
        although owning your own home suggests you are less likely to go into default (Hatchondo et al., 2015). """

        keep_entries = ['RENT', 'OWN', 'MORTGAGE']
        self.data = self.data[self.data['home_ownership'].isin(keep_entries)]

        # Apply ordinal mapping - either, True or False mentioned above
        if apply_ordinal:
            ordinal_mapping = {'RENT': 0, 'MORTGAGE': 1, 'OWN': 2}
            self.data['home_ownership'] = self.data['home_ownership'].map(ordinal_mapping) 

    def sub_grade_numbers(self):

        """ This function converts the 'sub_grade' feature to numeric values from 1-35 and drops the grade feature."""

        if 'sub_grade' not in self.data.columns:
            raise ValueError("Column 'sub_grade' does not exist")

        # Define the mapping
        sub_grade_map = {}
        sub_grades = ['1', '2', '3', '4', '5']
        grades = ['A', 'B', 'C', 'D', 'E', 'F', 'G']

        num = 35
        for grade in grades:
            for sub_grade in sub_grades:
                sub_grade_map[grade + sub_grade] = num
                num -= 1

        self.data['sub_grade'] = self.data['sub_grade'].map(sub_grade_map)

        self.data.rename(columns={'sub_grade': 'sub_grade_numbers'}, inplace=True)

        if 'grade' in self.data.columns:
            self.data = self.data.drop('grade', axis=1)

    def purpose(self):
        """ This function, organises the instances in the loan 'purpose' column, using 4 categories mentioned below
        'debt_consolidation', 'credit_card', 'home_improvement', and 'other'."""
        self.data['purpose'] = np.where(
            self.data['purpose'].isin(
                ['debt_consolidation', 'credit_card', 'home_improvement']), self.data['purpose'], 'other')

    def install_to_month_income(self):

        """This feature was made to represent the ratio of the borrowers monthly
        repayments to their resepctive monthly income."""
        self.data['install_to_month_income'] = (self.data['installment'] / (self.data['annual_inc'] / 12)).round(2)
        self.data = self.data[self.data['install_to_month_income'] <= 1]

    def rev_bal_to_income(self):

        """ This feature was made to represent the borrowers revolving credit balance to
        their monthly income. """
        self.data['rev_bal_to_income'] = (self.data['revol_bal'] / (self.data['annual_inc'] / 12)).round(2)

    def loan_to_year_income(self):

        """ This feature represents the ratio of the borrowers loan amount to the
        borrowers annual income. """
        self.data['loan_to_year_income'] = (self.data['loan_amnt'] / self.data['annual_inc']).round(2)
        self.data = self.data[self.data['loan_to_year_income'] <= 3]

    def delinq_2yrs(self):

        """ This function sets a threshold at 20 days for delinquencies, removing values above 20. """
        mask = self.data['delinq_2yrs'] <= 20
        self.data = self.data[mask]

    def annual_income(self):
        """ This function sets a threshold of 2,000,000 for annual income, given the sum of money and previous 
        literature for processing this dataset, values outside this range are considered outliers."""
        self.data = self.data[self.data['annual_inc'] <= 2000000]

    def dti(self):
        """ This function sets a threshold of 100 (%) for the debt to
        income ratio, again, values outside of this range are treated as outliers. """

        dti_col = 'dti' 
        self.data = self.data[(self.data[dti_col] >= 0) & (self.data[dti_col] <= 100)]

    def revol_util(self):
        """ This function sets the revolving utility feature
        to  a threshold of 100 (%), values outside of this
        range are treated as outliers """

        self.data = self.data[self.data['revol_util'] <= 100]

    def save_data(self, file_path):
        """ 
        Args:
        file_path (str) cleaned and updated lending club csv file 'lending_club_feature_engineered.csv', 
        ready to be fed into the data loader (notebook 07.) """
        self.data.to_csv(file_path, index=False)

    def features_complete(self):
        """
        This function enables the user to runs all feature engineering methods which have been used in the dataset.
        """
        self.employment_length()
        self.home_ownership() 
        self.sub_grade_numbers()
        self.purpose() 
        self.install_to_month_income()
        self.rev_bal_to_income()
        self.loan_to_year_income()
        self.delinq_2yrs()
        self.annual_income()
        self.dti()
        self.revol_util()

feature = FeatureEngineer("cleaned_data/lending_club_cleaned.csv")
feature.features_complete()
feature.save_data("feature_engineered_model_data/lending_club_feature_engineered.csv")

In [3]:
feature.data.head()

Unnamed: 0,annual_inc,dti,installment,int_rate,loan_amnt,revol_bal,revol_util,open_acc,delinq_2yrs,total_acc,emp_length,home_ownership,purpose,sub_grade_numbers,term,loan_status,install_to_month_income,rev_bal_to_income,loan_to_year_income
0,55000.0,5.91,123.03,13.99,3600.0,2765.0,29.7,7.0,0.0,13.0,10.0,MORTGAGE,debt_consolidation,22,36 months,1,0.03,0.6,0.07
1,65000.0,16.06,820.28,11.99,24700.0,21470.0,19.2,22.0,1.0,38.0,10.0,MORTGAGE,other,25,36 months,1,0.15,3.96,0.38
2,63000.0,10.78,432.66,10.78,20000.0,7869.0,56.2,6.0,0.0,18.0,10.0,MORTGAGE,home_improvement,27,60 months,1,0.08,1.5,0.32
3,104433.0,25.37,289.91,22.45,10400.0,21929.0,64.5,12.0,1.0,35.0,3.0,MORTGAGE,other,10,60 months,1,0.03,2.52,0.1
4,34000.0,10.2,405.18,13.44,11950.0,8822.0,68.4,5.0,0.0,6.0,4.0,RENT,debt_consolidation,23,36 months,1,0.14,3.11,0.35


#### Below is a closer analysis of the 19 features, including target variable after processing. 
In summary 3 additional features were made in this notebook.
1. 'install_to_month_income'
2. 'rev_bal_to_income'
3. 'loan_to_year_income'

In [4]:
feature.data['emp_length'].value_counts().sort_index().to_frame().T


emp_length,0.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0
count,63488,71772,63509,47301,50587,39193,40944,40773,31891,320677


In [5]:
feature.data['home_ownership'].value_counts().sort_index().to_frame().T


home_ownership,MORTGAGE,OWN,RENT
count,385506,73353,311276


In [6]:
feature.data['sub_grade_numbers'].value_counts().sort_index().to_frame().T


sub_grade_numbers,1,2,3,4,5,6,7,8,9,10,...,26,27,28,29,30,31,32,33,34,35
count,432,514,816,1171,1554,2211,2897,3728,4498,5962,...,42869,48820,49422,44123,40466,40253,30697,21519,20920,22024


In [7]:
feature.data['purpose'].value_counts().sort_index().to_frame().T


purpose,credit_card,debt_consolidation,home_improvement,other
count,181656,459873,43729,84877


In [8]:
feature.data['install_to_month_income'].value_counts().sort_index().to_frame().T


install_to_month_income,0.00,0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,...,0.28,0.29,0.30,0.32,0.34,0.35,0.41,0.43,0.45,0.48
count,576,14667,33399,51704,65937,74123,76820,74483,69163,60781,...,7,2,4,2,1,2,1,2,1,1


In [9]:
feature.data['rev_bal_to_income'].value_counts().sort_index().to_frame().T


rev_bal_to_income,0.00,0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,...,72.32,73.04,75.97,77.82,79.15,79.20,83.98,87.15,95.47,110.30
count,2549,865,711,685,691,646,656,708,701,727,...,1,1,1,1,1,1,1,1,1,1


In [10]:
feature.data['loan_to_year_income'].value_counts().sort_index().to_frame().T


loan_to_year_income,0.00,0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,...,0.88,0.90,0.92,0.95,0.96,1.00,1.07,1.13,1.26,1.52
count,19,1033,4646,7035,9809,12435,14822,16273,20367,20091,...,1,1,1,1,1,3,1,1,1,1


In [11]:
feature.data['delinq_2yrs'].value_counts().sort_index().to_frame().T


delinq_2yrs,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,11.0,12.0,13.0,14.0,15.0,16.0,17.0,18.0,19.0,20.0
count,618374,100537,29943,10740,4803,2463,1345,714,433,247,...,98,82,57,43,26,17,9,12,5,3


In [12]:
feature.data['annual_inc'].value_counts().sort_index().to_frame().T


annual_inc,3800.0,4000.0,5000.0,5360.0,5400.0,5674.0,6000.0,6400.0,6500.0,7000.0,...,1510000.0,1600000.0,1650000.0,1700000.0,1750000.0,1800000.0,1848400.0,1900000.0,1950000.0,2000000.0
count,1,1,2,1,1,1,7,1,1,4,...,1,1,1,2,1,2,1,1,1,5


In [13]:
feature.data['dti'].value_counts().sort_index().to_frame().T


dti,0.00,0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,...,63.64,64.99,66.95,67.64,68.41,69.35,83.40,83.64,87.11,90.00
count,202,6,6,4,2,2,6,8,4,2,...,1,1,1,1,1,1,1,1,1,1


In [14]:
feature.data['revol_util'].value_counts().sort_index().to_frame().T


revol_util,0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,...,99.1,99.2,99.3,99.4,99.5,99.6,99.7,99.8,99.9,100.0
count,2326,309,241,242,214,216,172,194,181,158,...,293,247,270,252,235,231,207,204,218,249


In [15]:
feature.data['installment'].value_counts().sort_index().to_frame().T


installment,4.93,14.01,14.77,21.62,23.26,23.36,25.81,25.86,27.85,28.75,...,1404.95,1406.08,1406.45,1407.01,1408.13,1409.28,1409.99,1424.57,1445.46,1447.54
count,1,1,1,1,1,1,1,1,1,1,...,1,1,9,7,5,1,3,2,2,1


In [16]:
feature.data['int_rate'].value_counts().sort_index().to_frame().T


int_rate,5.32,5.93,6.00,6.03,6.24,6.39,6.49,6.62,6.68,6.89,...,26.06,26.77,26.99,27.31,27.49,27.88,27.99,28.14,28.49,28.99
count,10382,1729,236,9912,7051,2482,6200,4817,2387,6633,...,380,364,58,296,45,186,31,1,138,107


In [17]:
feature.data['loan_amnt'].value_counts().sort_index().to_frame().T


loan_amnt,1000.0,1025.0,1050.0,1075.0,1100.0,1125.0,1150.0,1175.0,1200.0,1225.0,...,34775.0,34800.0,34825.0,34850.0,34875.0,34900.0,34925.0,34950.0,34975.0,35000.0
count,2077,6,15,10,61,18,17,12,1080,4,...,6,66,8,9,56,9,5,17,31,31961


In [18]:
feature.data['revol_bal'].value_counts().sort_index().to_frame().T


revol_bal,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,1030826.0,1039903.0,1043860.0,1190046.0,1298783.0,1743266.0,1746716.0,2560703.0,2568995.0,2904836.0
count,1853,31,38,40,31,32,32,29,30,26,...,1,1,1,1,1,1,1,1,1,1


In [19]:
feature.data['open_acc'].value_counts().sort_index().to_frame().T


open_acc,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,...,67.0,68.0,70.0,74.0,75.0,76.0,79.0,82.0,84.0,90.0
count,142,1682,6866,18317,32314,46507,57453,65784,69499,68378,...,1,1,1,2,1,2,1,1,1,1


In [20]:
feature.data['total_acc'].value_counts().sort_index().to_frame().T


total_acc,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,...,135.0,137.0,138.0,140.0,146.0,150.0,151.0,156.0,162.0,169.0
count,20,214,2139,3618,5442,7698,9892,12229,14614,16815,...,1,1,1,1,1,1,1,1,1,1


In [21]:
feature.data['term'].value_counts().sort_index().to_frame().T


term,36 months,60 months
count,572638,197497


In [22]:
feature.data['loan_status'].value_counts().sort_index().to_frame().T


loan_status,0,1
count,142768,627367
