In [1]:
# -----------------------------------------------------------
# Dissertation Project: An Empirical Study on the Classification 
# Performance of Deep Learning vs. Gradient Boosting 
# on heterogeneous tabular data
#
# This notebook provides functions for the preprocessing of the
# Lending Club dataset.
#
# Author: Adam Mabrouk
# Supervisor: Ben Ralph
# Institution: University of Bath
# Created on: 01/01/2024
# Version: 1.0 

# Acknowledgments:

# The cleaning pipeline is based on the literature (not code)
# of Authors: Shi, S., Tse, R., Luo, W., D'Addona,
# S. and Pau, G., 2022. Machine learning-driven credit risk: 
# a systemic review. Neural Computing and Applications, 34(17), 
# pp.14327-14339.

# Malekipirbazari, M. and Aksakalli, V., 2015. 
# Risk assessment in social lending via random forests. 
# Expert Systems with Applications, 42(10), pp.4621-4631.

# Demajo, L.M., Vella, V. and Dingli, A., 2020. 
# Explainable ai for interpretable credit scoring. 
# arXiv preprint arXiv:2012.03749.
# -----------------------------------------------------------
# Libraries and versions
# ----------------------
# Python version: 3.11.5 
# numpy: 1.24.3
# pandas: 2.0.3

# Import standard libraries for data handling,  
from datetime import datetime # filtering dataset. 
import pandas as pd
import numpy as np

In [None]:
class DataCleaner:
    """Clean the Lending Club dataset for binary credit-risk classification.

    The constructor reads the raw CSV and immediately restricts it to the
    issue-date window; ``cleaned_data`` then runs the remaining steps in order.
    Every step rebinds ``self.data`` to a new frame instead of mutating a
    filtered slice in place, which avoids pandas chained-assignment warnings.

    Attributes:
        lending_club_file_path (str): Path of the raw CSV given to the constructor.
        data (pd.DataFrame): The working frame, replaced by each cleaning step.
        initial_column_count (int): Column count right after loading and date filtering.
        duplicates_dropped (int): Rows removed by ``remove_duplicates`` (set by that step).
        columns_with_missing_data (int): Columns removed by
            ``remove_columns_less_than_30_Percent`` (set by that step).
        label_column (str): Name of the target column (set by ``y_class_mapping``).
        label_mapping (dict): Loan status -> binary label (set by ``y_class_mapping``).
    """

    def __init__(self, lending_club_file_path):
        """Read the Lending Club CSV and apply the issue-date filter.

        Args:
            lending_club_file_path (str): Path to the raw file,
                e.g. 'accepted_2007_to_2018Q4.csv'.
        """
        self.lending_club_file_path = lending_club_file_path
        # low_memory=False makes pandas infer each column's dtype from the whole
        # file rather than chunk by chunk (avoids mixed-dtype columns).
        self.data = pd.read_csv(self.lending_club_file_path, low_memory=False)
        self.load_and_filter_data()
        self.initial_column_count = len(self.data.columns)

    def load_and_filter_data(self, start_date='2012-01-01', end_date='2016-01-01'):
        """Keep only loans issued inside ``[start_date, end_date]`` (both inclusive).

        The default window (2012-01-01 to 2016-01-01) was chosen by the author as a
        period of greater economic stability: after the 2007 recession and before
        the COVID pandemic. 'issue_d' is parsed from the 'Mon-YYYY' text format to
        datetimes. If the frame has no 'issue_d' column it is left untouched.

        Args:
            start_date: First issue date to keep; anything ``pd.Timestamp`` accepts.
            end_date: Last issue date to keep; anything ``pd.Timestamp`` accepts.
        """
        if 'issue_d' not in self.data.columns:
            return

        issue_dates = pd.to_datetime(self.data['issue_d'], format='%b-%Y')
        # Rows whose date is missing (NaT) compare False on both sides and are dropped.
        in_window = (issue_dates >= pd.Timestamp(start_date)) & (issue_dates <= pd.Timestamp(end_date))
        # Filter first, then attach the parsed dates: only the (smaller) filtered
        # frame is copied, and the result is a standalone frame rather than a slice.
        self.data = self.data.loc[in_window].assign(issue_d=issue_dates[in_window])

    def remove_duplicates(self):
        """Drop fully duplicated rows and renumber the index from zero.

        The number of removed rows is recorded in ``self.duplicates_dropped``.
        """
        rows_before = len(self.data)
        self.data = self.data.drop_duplicates().reset_index(drop=True)
        self.duplicates_dropped = rows_before - len(self.data)

    def remove_columns_less_than_30_Percent(self, threshold=0.3):
        """Drop columns whose share of missing values is ``threshold`` or more.

        Only columns with a missing fraction strictly below ``threshold`` are kept.
        The number of removed columns is recorded in
        ``self.columns_with_missing_data``.

        Args:
            threshold (float): Missing-value fraction at or above which a column is
                removed. Defaults to 0.3 (30%).
        """
        columns_before = len(self.data.columns)
        missing_fraction = self.data.isnull().mean()
        self.data = self.data.loc[:, missing_fraction < threshold]
        self.columns_with_missing_data = columns_before - len(self.data.columns)

    def select_features(self):
        """Reduce the frame to the features used by the credit-risk models.

        As the focus of this project is tabular data, the type of each feature is
        noted next to its name below.

        Raises:
            KeyError: If any required column is absent (for example because it was
                removed by the missing-value filter).
        """
        chosen_columns = ["annual_inc",    # Continuous
                          "dti",           # Continuous
                          "installment",   # Continuous
                          "int_rate",      # Continuous
                          "loan_amnt",     # Continuous
                          "revol_bal",     # Continuous
                          "revol_util",    # Continuous
                          "open_acc",      # Discrete
                          "delinq_2yrs",   # Discrete
                          "total_acc",     # Discrete
                          "emp_length",    # Categorical
                          "grade",         # Categorical
                          "home_ownership",# Categorical
                          "purpose",       # Categorical
                          "sub_grade",     # Categorical
                          "term",          # Categorical
                          "loan_status"]   # Categorical (target)

        # Fail with an explicit list of what is missing instead of pandas' generic
        # "not in index" message; the exception type (KeyError) is unchanged.
        missing_columns = [name for name in chosen_columns if name not in self.data.columns]
        if missing_columns:
            raise KeyError(f"Required feature columns are missing from the data: {missing_columns}")

        self.data = self.data[chosen_columns]

    def adjust_variables(self):
        """Drop rows whose 'home_ownership' is 'NONE', 'OTHER' or 'ANY'."""
        excluded_ownership = ['NONE', 'OTHER', 'ANY']
        self.data = self.data[~self.data["home_ownership"].isin(excluded_ownership)]

    def adjust_dependent_variable(self):
        """Drop loan statuses that are not specific to loan default, then drop
        every row that still contains a missing value in any column."""
        excluded_statuses = ['Current', 'In Grace Period', 'Late (16-30 days)']
        keep_rows = ~self.data['loan_status'].isin(excluded_statuses)
        # Non-inplace dropna: an in-place drop on a filtered frame is the pattern
        # that triggers pandas' SettingWithCopyWarning.
        self.data = self.data[keep_rows].dropna()

    def select_dependent_variable_values(self):
        """Keep only rows whose loan status denotes a good credit risk
        ('Fully Paid') or a bad one ('Charged Off', 'Default', 'Late (31-120 days)')."""
        modelled_statuses = ['Fully Paid', 'Charged Off', 'Default', 'Late (31-120 days)']
        self.data = self.data[self.data['loan_status'].isin(modelled_statuses)]

    def y_class_mapping(self):
        """Binary-encode the target: 1 for good loans, 0 for bad loans.

        'Fully Paid' loans are considered good credit risk and encoded as 1.
        'Charged Off', 'Default' and 'Late (31-120 days)' loans are considered bad
        credit risk and encoded as 0. Any other status maps to NaN.

        Calling this again on an already-encoded column is a no-op, so re-running
        the step (e.g. re-executing a notebook cell) does not wipe the labels.
        """
        self.label_column = 'loan_status'
        self.label_mapping = {
            'Fully Paid': 1,
            'Charged Off': 0,
            'Late (31-120 days)': 0,
            'Default': 0}

        labels = self.data[self.label_column]
        encoded_values = list(self.label_mapping.values())
        # Mapping 0/1 through a string-keyed dict would turn every label into NaN.
        if len(labels) > 0 and labels.isin(encoded_values).all():
            return

        # assign() returns a new frame, so the write never lands on a filtered slice.
        self.data = self.data.assign(**{self.label_column: labels.map(self.label_mapping)})

    def save_data(self, file_path):
        """Write the current frame to a CSV file without the index column.

        Args:
            file_path (str): Destination of the cleaned CSV. Missing parent
                directories are created first.
        """
        import os  # local import: only this method needs filesystem helpers

        # Only path-like targets have a directory to create; anything else
        # (e.g. an open buffer) is handed to pandas unchanged.
        if isinstance(file_path, (str, os.PathLike)):
            parent_dir = os.path.dirname(os.fspath(file_path))
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

        self.data.to_csv(file_path, index=False)

    def cleaned_data(self):
        """Run every cleaning step, in order, on the date-filtered data.

        The result is left in ``self.data``; nothing is returned.
        """
        self.remove_duplicates()
        self.remove_columns_less_than_30_Percent()
        self.select_features()
        self.adjust_variables()
        self.adjust_dependent_variable()
        self.select_dependent_variable_values()
        self.y_class_mapping()

# Input and output locations for this run, relative to the notebook's working directory.
RAW_DATA_PATH = "raw_datasets/01_LendingClubData_2007_to_2018Q4.csv"
CLEANED_DATA_PATH = "cleaned_data/lending_club_cleaned.csv"

# Load the raw file (date-filtered on construction), run the full cleaning
# pipeline, then persist the result as CSV.
cleaned = DataCleaner(RAW_DATA_PATH)
cleaned.cleaned_data()
cleaned.save_data(CLEANED_DATA_PATH)

In [None]:
# Preview the first rows of the cleaned frame as a quick sanity check of the pipeline output.
cleaned.data.head()