In [1]:
# -----------------------------------------------------------
# Dissertation Project: An Empirical Study on the Classification 
# Performance of Deep Learning vs. Gradient Boosting 
# on heterogeneous tabular data
#
# This notebook provides functions for the preprocessing of the 
# Heloc dataset. 
#
# Author: Adam Mabrouk
# Supervisor: Ben Ralph
# Institution: University of Bath
# Created on: 01/01/2024
# Version: 1.0 

# Acknowledgments:

# The cleaning pipeline is based on the following literature (not on its code):
# Shi, S., Tse, R., Luo, W., D’Addona, 
# S. and Pau, G., 2022. Machine learning-driven credit risk: 
# a systemic review. Neural Computing and Applications, 34(17), 
# pp.14327-14339.

# Bonacci, A., Petrillo, P. & Reitano, A., Year of Publication. 
# Credit Risk Prediction - HELOC Case. [pdf] 
# Available at: credit_risk_prediction_heloc_case.pdf [01.06.2023].

# Libraries and versions
# ----------------------
# Python version: 3.11.5 
# numpy: 1.24.3
# pandas: 2.0.3

# Import standard libraries for data handling.
import pandas as pd
import numpy as np

In [2]:
class DataCleaner:
    """Cleans the raw HELOC dataset; feature engineering happens in a second notebook.

    Attributes set by the cleaning steps:
        heloc_file_path: Source passed to ``pd.read_csv``.
        data (pd.DataFrame): The working frame; every step rebinds it.
        initial_column_count (int): Number of columns right after loading.
        duplicates_dropped (int): Rows removed by ``remove_duplicates``.
        columns_with_missing_data (int): Columns removed by
            ``remove_columns_less_than_30_Percent``.
        label_map (dict): Mapping applied to 'RiskPerformance' by ``label_mapping``.
    """

    def __init__(self, heloc_file_path):
        """Load the raw CSV into ``self.data``.

        Args:
            heloc_file_path (str): Path to the raw file, e.g. heloc_dataset_v1.csv.
        """
        self.heloc_file_path = heloc_file_path
        self.data = pd.read_csv(self.heloc_file_path, low_memory=False)
        self.initial_column_count = len(self.data.columns)

    def remove_duplicates(self):
        """Remove fully duplicated rows and reset the index.

        The number of removed rows is stored in ``self.duplicates_dropped``.
        """
        original_rows = len(self.data)
        self.data = self.data.drop_duplicates().reset_index(drop=True)
        self.duplicates_dropped = original_rows - len(self.data)

    def remove_columns_less_than_30_Percent(self, threshold=0.3):
        """Keep only columns whose fraction of missing values is below ``threshold``.

        Args:
            threshold (float): Columns with a null fraction greater than or equal
                to this value are dropped. Defaults to 0.3 (30%).

        The number of dropped columns is stored in
        ``self.columns_with_missing_data``.
        """
        original_columns = len(self.data.columns)
        missing_fraction = self.data.isnull().mean()
        self.data = self.data.loc[:, missing_fraction < threshold]
        self.columns_with_missing_data = original_columns - len(self.data.columns)

        # Reference notes carried over from the original notebook. They were
        # no-op string literals and are not used by any code in this class.
        #
        # excluded_columns:
        #   AverageMInFile, NumTrades90Ever2DerogPubRec, NumTradesOpeninLast12M,
        #   PercentInstallTrades, NumInqLast6M, NumInstallTradesWBalance,
        #   NetFractionInstallBurden, NumRevolvingTradesWBalance,
        #   PercentTradesWBalance, NumSatisfactoryTrades,
        #   NumBank2NatlTradesWHighUtilization
        #
        # chosen_columns:
        #   Continuous:
        #     ExternalRiskEstimate, AverageMInFile, PercentTradesNeverDelq,
        #     PercentInstallTrades, NetFractionRevolvingBurden,
        #     NetFractionInstallBurden, PercentTradesWBalance
        #   Discrete:
        #     MSinceOldestTradeOpen, MSinceMostRecentTradeOpen,
        #     NumSatisfactoryTrades, NumTrades60Ever2DerogPubRec,
        #     NumTrades90Ever2DerogPubRec, MSinceMostRecentDelq,
        #     MaxDelq2PublicRecLast12M, MaxDelqEver, NumTotalTrades,
        #     NumTradesOpeninLast12M, MSinceMostRecentInqexcl7days,
        #     NumInqLast6M, NumInqLast6Mexcl7days, NumRevolvingTradesWBalance,
        #     NumInstallTradesWBalance, NumBank2NatlTradesWHighUtilization
        #   Categorical:
        #     MaxDelq2PublicRecLast12M = 37 features, MaxDelqEver = 35 features
        #   Target variable:
        #     RiskPerformance

    def remove_place_holders(self):
        """Handle the placeholder codes -9, -8 and -7.

        1. -9: replaced with NaN, then every row containing a NaN is dropped
           (this also drops rows that already had missing values).
        2. -8 (float64/int64 columns): replaced with NaN, then filled with the
           column median.
        3. -7 (float64/int64 columns): replaced with the column maximum + 1,
           where the maximum is taken after the -8 imputation.
        """
        # Rebind instead of mutating in place so the result does not depend on
        # whether pandas hands back a view or a copy.
        self.data = self.data.replace(-9, np.nan).dropna()

        numeric_columns = self.data.select_dtypes(include=['float64', 'int64']).columns
        for col in numeric_columns:
            # Work on the column and assign it back explicitly; chained
            # `self.data[col].replace(..., inplace=True)` is not guaranteed to
            # write through to the frame (it does nothing under copy-on-write).
            column = self.data[col].replace(-8, np.nan)
            # NOTE(review): the median is computed while -7 placeholders are
            # still present in the column. Kept as in the original so the
            # cleaned output is unchanged; confirm this ordering is intended.
            column = column.fillna(column.median())
            column = column.replace(-7, column.max() + 1)
            self.data[col] = column

    def label_mapping(self):
        """Map the target label 'RiskPerformance' to 0 and 1.

        1 is used for good credit ('Good') and 0 for bad credit ('Bad'); any
        other label becomes NaN. The mapping is stored in ``self.label_map``
        rather than ``self.label_mapping`` so that it does not shadow this
        method on the instance.

        Note: The AUPRC scores for the minority class are dependent on
        the minority class being either positive or negative, not just minority.
        """
        self.label_map = {'Good': 1, 'Bad': 0}
        self.data['RiskPerformance'] = self.data['RiskPerformance'].map(self.label_map)

    def save_data(self, file_path):
        """Write the current frame to CSV without the index.

        Args:
            file_path (str): Destination, e.g. 'heloc_cleaned.csv'.
        """
        self.data.to_csv(file_path, index=False)

    def clean_data(self):
        """Run all cleaning steps in order.

        Order: duplicate removal, sparse-column removal, placeholder handling,
        then target label mapping.
        """
        self.remove_duplicates()
        self.remove_columns_less_than_30_Percent()
        self.remove_place_holders()
        self.label_mapping()

# Run the full cleaning pipeline on the raw HELOC file and persist the result.
RAW_HELOC_CSV = "raw_datasets/heloc_dataset_v1.csv"
CLEANED_HELOC_CSV = "cleaned_data/heloc_cleaned.csv"

cleaned = DataCleaner(RAW_HELOC_CSV)
cleaned.clean_data()
cleaned.save_data(CLEANED_HELOC_CSV)

In [3]:
# Preview the first five rows of the cleaned frame.
cleaned.data.head(n=5)

Unnamed: 0,RiskPerformance,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,...,PercentInstallTrades,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance
0,0,55.0,144.0,4.0,84.0,20.0,3.0,0.0,83.0,2.0,...,43.0,0.0,0.0,0.0,33.0,74.0,8.0,1.0,1.0,69.0
1,0,61.0,58.0,15.0,41.0,2.0,4.0,4.0,100.0,84.0,...,67.0,0.0,0.0,0.0,0.0,74.0,0.0,2.0,1.0,0.0
2,0,67.0,66.0,5.0,24.0,9.0,0.0,0.0,100.0,84.0,...,44.0,0.0,4.0,4.0,53.0,66.0,4.0,2.0,1.0,86.0
3,0,66.0,169.0,1.0,73.0,28.0,1.0,1.0,93.0,76.0,...,57.0,0.0,5.0,4.0,72.0,83.0,6.0,4.0,3.0,91.0
4,0,81.0,333.0,27.0,132.0,12.0,0.0,0.0,100.0,84.0,...,25.0,0.0,1.0,1.0,51.0,89.0,3.0,1.0,0.0,80.0
