In [1]:
# -----------------------------------------------------------
# Dissertation Project: An Empirical Study on the Classification 
# Performance of Deep Learning vs. Gradient Boosting 
# on heterogeneous tabular data
#
# This module provides functions for data-preprocessing before the 
# data is fed into the models NODE, TabNet, FFNN, and XGBoost. 
#
# Author: Adam Mabrouk
# Supervisor: Ben Ralph
# Institution: University of Bath
# Created on: 01/01/2024
# Version: 1.0 
# -----------------------------------------------------------

In [2]:
# Libraries and versions
# ----------------------
# Python version: 3.11.5 
# numpy: 1.24.3
# pandas: 2.0.3
# imbalanced-learn: 0.12.0
# scikit-learn: 1.4.0
# tensorflow: 2.15.0


# Imports for data handling and file-system access
import numpy as np
import pandas as pd
import os

from Tabular_loader_class import DataSet, DataLoader, DataPreprocessor


2024-01-27 16:55:16.909601: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Lending Club: sub-sampled dataset with oversampling enabled and
# undersampling disabled; categorical columns use label encoding with
# embeddings switched on.
lending_club = DataSet(
    file_path="feature_engineered_model_data/lending_club_feature_engineered.csv",
    output_path="lending_club_model_data",
    y_label="loan_status",
    categorical_columns=[
        "purpose",
        "home_ownership",
        "term",
        "sub_grade_numbers",
    ],
    excluded_columns=[],
    subset_size=8000,
    sub_sample_first=True,
    # Evaluates to roughly 0.9974.
    # NOTE(review): the ratio looks sized for ~1,000-row validation/test sets
    # drawn from ~770,000 rows; with subset_size=8000 and sub_sample_first=True
    # the hold-out sets may be far smaller -- confirm against DataLoader.
    train_split_ratio=1 - 2 * (1_000 / 770_000),
    use_undersampling=False,
    use_oversampling=True,
    encoding_type="label",
    use_embeddings=True,
)

# HELOC: no balancer is applied and, given the size of the dataset, no
# subset_size is needed. Sampling/balancing options are therefore not passed
# here and are left at whatever defaults DataSet provides.
heloc = DataSet(
    file_path="feature_engineered_model_data/heloc_feature_engineered.csv",
    output_path="heloc_model_data",
    y_label="RiskPerformance",
    # Feature columns handed to DataLoader as excluded_columns.
    excluded_columns=[
        "AverageMInFile",
        "NumTrades90Ever2DerogPubRec",
        "NumTradesOpeninLast12M",
        "PercentInstallTrades",
        "NumInqLast6M",
        "NumInstallTradesWBalance",
        "NetFractionInstallBurden",
        "NumRevolvingTradesWBalance",
        "PercentTradesWBalance",
        "NumSatisfactoryTrades",
        "NumBank2NatlTradesWHighUtilization",
    ],
    # Categorical columns, forwarded to the preprocessor for encoding.
    categorical_columns=[
        "MaxDelqEver",
        "MaxDelq2PublicRecLast12M",
    ],
    encoding_type="label",
    train_split_ratio=0.8,
    use_embeddings=True,
)

# Credit Default: 80% training split, oversampling enabled, undersampling
# disabled; categorical columns use label encoding with embeddings switched on.
credit_default = DataSet(
    file_path="feature_engineered_model_data/credit_default_feature_engineered.csv",
    output_path="credit_default_model_data",
    y_label="default_payment_next_month",
    categorical_columns=["SEX", "EDUCATION", "MARRIAGE"],
    train_split_ratio=0.8,
    use_undersampling=False,
    use_oversampling=True,
    encoding_type="label",
    use_embeddings=True,
)

# Adult Income: 80% training split, oversampling enabled, undersampling
# disabled; categorical columns use label encoding with embeddings switched on.
adult_income = DataSet(
    file_path="feature_engineered_model_data/income_evaluation_feature_engineered.csv",
    output_path="income_evaluation_model_data",
    y_label="income",
    categorical_columns=[
        "workclass",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "native-country",
    ],
    train_split_ratio=0.8,
    use_undersampling=False,
    use_oversampling=True,
    encoding_type="label",
    use_embeddings=True,
)

# For every configured dataset: load and split it, fit the preprocessor on the
# training split, transform all three feature splits, and write the six
# resulting splits as CSV files into the dataset's output folder.
for dataset in [lending_club, heloc, credit_default, adult_income]:
    # Forward the dataset's split/sampling configuration to the loader.
    data_loader = DataLoader(dataset.file_path,
                             y_label=dataset.y_label,
                             train_split_ratio=dataset.train_split_ratio,
                             subset_size=dataset.subset_size,
                             categorical_columns=dataset.categorical_columns,
                             excluded_columns=dataset.excluded_columns,
                             use_undersampling=dataset.use_undersampling,
                             use_oversampling=dataset.use_oversampling,
                             sub_sample_first=dataset.sub_sample_first)

    X_train, X_val, X_test, y_train, y_val, y_test = data_loader.get_data()

    # The class imported from Tabular_loader_class is `DataPreprocessor`;
    # the previous `DataProcessor` name is never defined in this notebook and
    # raises NameError on a fresh kernel.
    data_processor = DataPreprocessor(use_embeddings=dataset.use_embeddings,
                                      categorical_columns=dataset.categorical_columns,
                                      encoding_type=dataset.encoding_type)

    # Fit on the training features only, then apply the fitted preprocessor
    # to the training, validation and test features.
    data_processor.fit(X_train)

    X_train = data_processor.transform(X_train)
    X_val = data_processor.transform(X_val)
    X_test = data_processor.transform(X_test)

    folder_name = dataset.output_path
    os.makedirs(folder_name, exist_ok=True)

    # Each split is written to '<output_path>/<split name>.csv' without the
    # index column. to_csv does not modify its source, so no defensive copy
    # is needed before writing.
    splits = {
        'X_train': X_train,
        'X_val': X_val,
        'X_test': X_test,
        'y_train': y_train,
        'y_val': y_val,
        'y_test': y_test,
    }
    for split_name, split_data in splits.items():
        split_data.to_csv(os.path.join(folder_name, f'{split_name}.csv'), index=False)