In [1]:
# -----------------------------------------------------------
# Dissertation Project: An Empirical Study on the Classification 
# Performance of Deep Learning vs. Gradient Boosting 
# on heterogeneous tabular data
#
# This notebook provides functions for the preprocessing of the 
# Heloc dataset. 
#
# Author: Adam Mabrouk
# Supervisor: Ben Ralph
# Institution: University of Bath
# Created on: 01/01/2024
# Version: 1.0 

# Acknowledgments:

# The cleaning pipeline is based on the literature (not code)
# of Authors: Shi, S., Tse, R., Luo, W., D’Addona, 
# S. and Pau, G., 2022. Machine learning-driven credit risk: 
# a systemic review. Neural Computing and Applications, 34(17), 
# pp.14327-14339.

# Bonacci, A., Petrillo, P. & Reitano, A., 2023. 
# Credit Risk Prediction - HELOC Case. [pdf] 
# Available at: credit_risk_prediction_heloc_case.pdf [Accessed 01.06.2023].

# Libraries and versions
# ----------------------
# Python version: 3.11.5 
# numpy: 1.24.3
# pandas: 2.0.3

# Import standard libraries for data handling.
import pandas as pd
import numpy as np

In [2]:
class FeatureEngineer:
    """Feature engineering for the cleaned Heloc dataset.

    Based on the literature by Bonacci et al. (2023): six feature adaptations
    taken from the authors' paper (literature only, no code was provided).

    Each adapted column is replaced in ``self.data`` by an ordered categorical
    whose categories are the integer bin codes ``0 .. n_bins - 1``. A value that
    falls outside a column's bin edges becomes NaN, and ``maxdelqever`` (the
    last step of ``features_complete``) drops every row that contains a NaN.
    """

    def __init__(self, heloc_file_path):
        """Load the cleaned dataset into ``self.data``.

        Args:
            heloc_file_path (str): Path to the cleaned CSV, e.g. 'heloc_cleaned.csv'.
        """
        self.heloc_file_path = heloc_file_path
        self.data = pd.read_csv(self.heloc_file_path, low_memory=False)
        # Number of columns at load time, recorded before any feature is rewritten.
        self.initial_column_count = len(self.data.columns)

    def _bin_and_encode(self, column, bins):
        """Replace ``column`` with the ordinal code of the bin each value falls in.

        Intervals are closed on the right, and the first one also includes its
        left edge (``include_lowest=True``): for edges ``[a, b, c]`` the codes
        are 0 for ``[a, b]`` and 1 for ``(b, c]``. Values outside ``[a, c]``
        become NaN.

        Args:
            column (str): Name of the column in ``self.data`` to rewrite.
            bins (list): Monotonically increasing bin edges.
        """
        # Using the integer codes as the cut labels yields the final encoding
        # directly, so no intermediate string labels or mapping dict is needed.
        codes = list(range(len(bins) - 1))
        self.data[column] = pd.cut(self.data[column], bins=bins, labels=codes, include_lowest=True)

    def percenttradesneverdelq(self):
        """
        Bins and encodes the 'PercentTradesNeverDelq' feature into ten 10-point ranges.

        Codes: 0 = 0-10%, 1 = 11-20%, ..., 9 = 91-100%.
        """
        self._bin_and_encode("PercentTradesNeverDelq", [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100])

    def netfractionrevolvingburden(self):
        """
        Categorizes 'NetFractionRevolvingBurden' into levels of financial burden.

        Codes: 0 = 0-40 (low), 1 = 41-80 (medium), 2 = more than 80 (high).
        """
        self._bin_and_encode("NetFractionRevolvingBurden", [0, 40, 80, np.inf])

    def msincemostrecentdelq(self):
        """
        Bins the 'MSinceMostRecentDelq' feature into fourteen 6-month periods.

        Codes: 0 = 0-6 months, 1 = 7-12 months, ..., 13 = 79-84 months.
        Values above 84 become NaN.
        """
        self._bin_and_encode("MSinceMostRecentDelq", [0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84])

    def externalriskestimate(self):
        """
        Categorizes the 'ExternalRiskEstimate' feature into 4 risk levels.

        Codes, as produced by the bin edges below: 0 = [0, 60] high risk,
        1 = (60, 70] medium risk, 2 = (70, 80] low risk, 3 = (80, 100] very low risk.

        NOTE(review): the label strings this method used previously advertised
        0-61 / 62-73 / 74-86 / 87-100, which never matched these edges. The
        edges are what has always been applied and are kept unchanged here;
        confirm the intended thresholds against the source literature.
        """
        self._bin_and_encode("ExternalRiskEstimate", [0, 60, 70, 80, 100])

    def maxdelq2publicreclast12m(self):
        """
        Bins 'MaxDelq2PublicRecLast12M' into 3 types of delinquency behavior.

        Codes: 0 = 0-4 (bad behavior), 1 = 5-6 (unknown delinquency),
        2 = 7 (never delinquent). Values above 7 become NaN.
        """
        self._bin_and_encode("MaxDelq2PublicRecLast12M", [0, 4, 6, 7])

    def maxdelqever(self):
        """
        Categorizes the 'MaxDelqEver' feature into bins reflecting delinquency status,
        then drops every row of the dataset that contains a NaN.

        Codes: 0 = 2-6 (bad behavior), 1 = 7 (unknown delinquency),
        2 = 8 (never delinquent). 9 is excluded in this method, as it has an
        ambiguous meaning; it (like any value below 2) becomes NaN.
        """
        self._bin_and_encode("MaxDelqEver", [2, 6, 7, 8])
        # Row filter for the whole frame, not just this column: it also removes
        # rows whose values fell outside the bins of any column encoded earlier.
        self.data = self.data.dropna()

    def save_data(self, file_path):
        """Write the current dataset to CSV without the index column.

        Args:
            file_path (str): Destination of the feature engineered dataset,
                e.g. 'heloc_feature_engineered.csv'. Missing parent directories
                are created first.
        """
        import os  # local import so this method does not rely on file-level imports

        # Only path-like targets have a parent directory; buffers are passed through.
        if isinstance(file_path, (str, os.PathLike)):
            parent_dir = os.path.dirname(os.fspath(file_path))
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
        self.data.to_csv(file_path, index=False)

    def features_complete(self):
        """
        Runs all six feature engineering methods used for the dataset.

        ``maxdelqever`` runs last on purpose: it drops the rows that any of the
        preceding steps left with a NaN.
        """
        self.percenttradesneverdelq()
        self.netfractionrevolvingburden()
        self.msincemostrecentdelq()
        self.externalriskestimate()
        self.maxdelq2publicreclast12m()
        self.maxdelqever()

# Load the cleaned Heloc data, apply all six feature adaptations, and persist the result.
feature = FeatureEngineer(heloc_file_path="cleaned_data/heloc_cleaned.csv")
feature.features_complete()
feature.save_data(file_path="feature_engineered_model_data/heloc_feature_engineered.csv")

In [3]:
# Preview the first five rows of the engineered dataset.
feature.data.head(n=5)

Unnamed: 0,RiskPerformance,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,...,PercentInstallTrades,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance
0,0,0,144.0,4.0,84.0,20.0,3.0,0.0,8,0,...,43.0,0.0,0.0,0.0,0,74.0,8.0,1.0,1.0,69.0
1,0,1,58.0,15.0,41.0,2.0,4.0,4.0,9,13,...,67.0,0.0,0.0,0.0,0,74.0,0.0,2.0,1.0,0.0
2,0,1,66.0,5.0,24.0,9.0,0.0,0.0,9,13,...,44.0,0.0,4.0,4.0,1,66.0,4.0,2.0,1.0,86.0
3,0,1,169.0,1.0,73.0,28.0,1.0,1.0,9,12,...,57.0,0.0,5.0,4.0,1,83.0,6.0,4.0,3.0,91.0
4,0,3,333.0,27.0,132.0,12.0,0.0,0.0,9,13,...,25.0,0.0,1.0,1.0,1,89.0,3.0,1.0,0.0,80.0


#### Below is a closer analysis of the 6 engineered features, followed by the target variable and additional features of interest after processing.
In summary, 6 features were adjusted in this notebook:
1. percenttradesneverdelq
2. netfractionrevolvingburden
3. msincemostrecentdelq
4. externalriskestimate
5. maxdelq2publicreclast12m
6. maxdelqever

In [4]:
# Rows per encoded PercentTradesNeverDelq bin, ordered by code and shown as one wide row.
(
    feature.data["PercentTradesNeverDelq"]
    .value_counts()
    .sort_index()
    .to_frame()
    .T
)


PercentTradesNeverDelq,0,1,2,3,4,5,6,7,8,9
count,6,5,17,33,104,123,298,714,1465,7093


In [5]:
# Rows per encoded NetFractionRevolvingBurden level, ordered by code and shown as one wide row.
(
    feature.data["NetFractionRevolvingBurden"]
    .value_counts()
    .sort_index()
    .to_frame()
    .T
)


NetFractionRevolvingBurden,0,1,2
count,6090,2892,876


In [6]:
# Rows per encoded MSinceMostRecentDelq period, ordered by code and shown as one wide row.
(
    feature.data["MSinceMostRecentDelq"]
    .value_counts()
    .sort_index()
    .to_frame()
    .T
)


MSinceMostRecentDelq,0,1,2,3,4,5,6,7,8,9,10,11,12,13
count,1706,774,538,430,358,265,248,189,159,145,133,125,82,4706


In [7]:
# Rows per encoded ExternalRiskEstimate risk level, ordered by code and shown as one wide row.
(
    feature.data["ExternalRiskEstimate"]
    .value_counts()
    .sort_index()
    .to_frame()
    .T
)


ExternalRiskEstimate,0,1,2,3
count,1313,3235,3000,2310


In [8]:
# Rows per encoded MaxDelq2PublicRecLast12M category, ordered by code and shown as one wide row.
(
    feature.data["MaxDelq2PublicRecLast12M"]
    .value_counts()
    .sort_index()
    .to_frame()
    .T
)


MaxDelq2PublicRecLast12M,0,1,2
count,2275,3360,4223


In [9]:
# Rows per encoded MaxDelqEver category, ordered by code and shown as one wide row.
(
    feature.data["MaxDelqEver"]
    .value_counts()
    .sort_index()
    .to_frame()
    .T
)


MaxDelqEver,0,1,2
count,5202,132,4524


In [10]:
# Rows per RiskPerformance value (the target variable), ordered by value and shown as one wide row.
(
    feature.data["RiskPerformance"]
    .value_counts()
    .sort_index()
    .to_frame()
    .T
)


RiskPerformance,0,1
count,5126,4732


In [11]:
# Rows per distinct MSinceOldestTradeOpen value, ordered by value and shown as one wide row.
(
    feature.data["MSinceOldestTradeOpen"]
    .value_counts()
    .sort_index()
    .to_frame()
    .T
)


MSinceOldestTradeOpen,2.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,...,566.0,584.0,586.0,589.0,590.0,598.0,603.0,604.0,789.0,803.0
count,1,1,1,3,3,5,2,6,6,7,...,1,1,1,2,1,1,1,1,1,1


In [12]:
# Rows per distinct MSinceMostRecentTradeOpen value, ordered by value and shown as one wide row.
(
    feature.data["MSinceMostRecentTradeOpen"]
    .value_counts()
    .sort_index()
    .to_frame()
    .T
)


MSinceMostRecentTradeOpen,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,138.0,143.0,145.0,152.0,156.0,163.0,178.0,184.0,207.0,227.0
count,105,732,1160,1015,885,754,659,575,463,414,...,1,2,2,1,2,1,1,1,1,1


In [13]:
# Rows per distinct NumTrades60Ever2DerogPubRec value, ordered by value and shown as one wide row.
(
    feature.data["NumTrades60Ever2DerogPubRec"]
    .value_counts()
    .sort_index()
    .to_frame()
    .T
)


NumTrades60Ever2DerogPubRec,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,16.0,17.0,19.0
count,6796,1799,691,244,148,72,37,30,11,8,8,4,3,2,2,1,1,1


In [14]:
# Rows per distinct MSinceMostRecentInqexcl7days value, ordered by value and shown as one wide row.
(
    feature.data["MSinceMostRecentInqexcl7days"]
    .value_counts()
    .sort_index()
    .to_frame()
    .T
)


MSinceMostRecentInqexcl7days,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,16.0,17.0,18.0,19.0,20.0,21.0,22.0,23.0,24.0,25.0
count,5050,614,377,308,268,191,150,139,112,99,...,49,42,34,45,31,32,35,32,8,1852


In [15]:
# Rows per distinct NumInqLast6Mexcl7days value, ordered by value and shown as one wide row.
(
    feature.data["NumInqLast6Mexcl7days"]
    .value_counts()
    .sort_index()
    .to_frame()
    .T
)


NumInqLast6Mexcl7days,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,16.0,17.0,18.0,19.0,20.0,21.0,24.0,29.0,46.0,66.0
count,4073,2527,1448,758,452,232,132,71,59,27,...,4,1,2,2,1,1,2,1,1,1
