In [1]:
# -----------------------------------------------------------
# Dissertation Project: An Empirical Study on the Classification 
# Performance of Deep Learning vs. Gradient Boosting 
# on heterogeneous tabular data
#
# This notebook provides functions for the preprocessing of the 
# Adult Income dataset. 
#
# Author: Adam Mabrouk
# Supervisor: Ben Ralph
# Institution: University of Bath
# Created on: 01/01/2024
# Version: 1.0 

# Libraries and versions
# ----------------------
# Python version: 3.11.5 
# numpy: 1.24.3
# pandas: 2.0.3

# Import the libraries used for data handling
import os

import numpy as np
import pandas as pd

In [2]:
class DataCleaner:
    """Clean the Adult Income ("income evaluation") dataset.

    The raw CSV is loaded on construction and kept in ``self.data``. Each
    cleaning step updates that frame; ``clean_data`` runs all steps in the
    required order.

    Columns referred to by the cleaning steps (names as they appear once
    surrounding whitespace has been stripped):

    Continuous variables
        age
        fnlwgt          - "final weight": the number of people the census
                          believes the entry represents. Dropped by
                          ``drop_columns``.
        education-num   - numeric code for the level of education (see the
                          UCI Adult data description); kept in place of the
                          textual ``education`` column.
        capital-gain    - income from investment sources, apart from
                          wages/salary.
        capital-loss    - losses from investment sources, apart from
                          wages/salary.
        hours-per-week  - number of hours worked per week.

    Categorical variables (string valued)
        workclass       - the working class (e.g. State-gov,
                          Self-emp-not-inc).
        education       - the highest level of education achieved. Dropped
                          by ``drop_columns``.
        marital-status, occupation, relationship, race, sex, native-country

    Target
        income          - '>50K' / '<=50K', encoded as 1 / 0 by
                          ``set_binary_value``.
    """

    def __init__(self, income_evaluation_file_path):
        """Load the raw dataset into ``self.data``.

        Args:
            income_evaluation_file_path (str): Location of the raw CSV file.
        """
        self.income_evaluation_file_path = income_evaluation_file_path
        # low_memory=False makes pandas parse the file in one pass, so each
        # column's dtype is inferred from the whole column.
        self.data = pd.read_csv(self.income_evaluation_file_path, low_memory=False)

    def remove_spaces_in_columns_and_values(self):
        """Strip surrounding whitespace from column names and string values.

        Spaces that remain inside a column name are replaced by underscores.
        """
        self.data = self.data.rename(columns=lambda name: name.strip().replace(' ', '_'))
        for col in self.data.columns:
            if self.data[col].dtype == 'object':
                self.data[col] = self.data[col].str.strip()

    def drop_columns(self):
        """Drop the "fnlwgt" and "education" columns.

        "education-num" is kept as the representation of education level.

        Raises:
            KeyError: If either column is not present (for example when the
                method is run a second time on the same instance).
        """
        self.exclude_features = ["fnlwgt", "education"]
        self.data = self.data.drop(columns=self.exclude_features)

    def drop_value_in_columns(self):
        """Remove every row whose occupation is 'Armed-Forces'."""
        self.data = self.data[self.data['occupation'] != 'Armed-Forces']

    def removing_placeholders(self):
        """Replace the '?' placeholder with 'Unknown'.

        Applied to the "workclass", "occupation" and "native-country"
        columns. Only cells that are exactly '?' are changed.
        """
        for column in ('workclass', 'occupation', 'native-country'):
            self.data[column] = self.data[column].replace('?', 'Unknown')

    def set_binary_value(self):
        """Encode the target column "income" as 1 ('>50K') / 0 ('<=50K').

        Labels outside the mapping become NaN. If the column is already
        fully encoded (every value is 0 or 1) it is left untouched, so
        running this step twice does not wipe the labels.
        """
        self.label_column = 'income'
        self.label_mapping = {'>50K': 1, '<=50K': 0}
        labels = self.data[self.label_column]
        encoded_values = list(self.label_mapping.values())
        # Series.map returns NaN for values that are not keys of the mapping,
        # which is what would happen to 0/1 on a second run.
        if len(labels) > 0 and labels.isin(encoded_values).all():
            return
        self.data[self.label_column] = labels.map(self.label_mapping)

    def save_data(self, income_evaluation_file_path):
        """Write the current frame to CSV without the index column.

        Args:
            income_evaluation_file_path (str): Destination, e.g.
                "income_evaluation_feature_engineered.csv". For a local
                path, missing parent directories are created first.
                Anything else accepted by ``DataFrame.to_csv`` (an open
                buffer, a URL) is passed through unchanged.
        """
        if isinstance(income_evaluation_file_path, (str, os.PathLike)):
            target = os.fspath(income_evaluation_file_path)
            # Do not create local folders for remote targets such as "s3://...".
            is_url = isinstance(target, str) and "://" in target
            parent = os.path.dirname(target)
            if parent and not is_url:
                os.makedirs(parent, exist_ok=True)
        self.data.to_csv(income_evaluation_file_path, index=False)

    def clean_data(self):
        """Run every cleaning step in order.

        Whitespace is stripped first because the later steps look columns
        and values up by their stripped names. The steps are then: drop
        unused columns, drop 'Armed-Forces' rows, replace '?' placeholders,
        and encode the target.
        """
        self.remove_spaces_in_columns_and_values()
        self.drop_columns()
        self.drop_value_in_columns()
        self.removing_placeholders()
        self.set_binary_value()

# Load the raw file, run the full cleaning pipeline and write the result to disk.
raw_income_csv = "raw_datasets/income_evaluation.csv"
cleaned_income_csv = "feature_engineered_model_data/income_evaluation_feature_engineered.csv"

clean = DataCleaner(raw_income_csv)
clean.clean_data()
clean.save_data(cleaned_income_csv)

In [3]:
# Preview the first five rows of the cleaned frame.
clean.data.head(n=5)

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


#### Below is a closer analysis of all the features, including the target variable and additional features of interest, after processing.
In summary, apart from standard cleaning of the data, the columns 'education' and 'fnlwgt' were removed.

In [4]:
# Frequency of each age, ordered by age and shown as one wide row.
(
    clean.data["age"]
    .value_counts()
    .sort_index()
    .to_frame()
    .T
)

age,17,18,19,20,21,22,23,24,25,26,...,80,81,82,83,84,85,86,87,88,90
count,395,550,712,753,720,765,875,796,841,785,...,22,20,12,6,10,3,1,1,3,43


In [5]:
# Frequency of each workclass category, ordered alphabetically and shown as one wide row.
(
    clean.data["workclass"]
    .value_counts()
    .sort_index()
    .to_frame()
    .T
)

workclass,Federal-gov,Local-gov,Never-worked,Private,Self-emp-inc,Self-emp-not-inc,State-gov,Unknown,Without-pay
count,951,2093,7,22696,1116,2541,1298,1836,14


In [6]:
# Frequency of each education-num code, ordered by code and shown as one wide row.
(
    clean.data["education-num"]
    .value_counts()
    .sort_index()
    .to_frame()
    .T
)

education-num,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
count,51,168,333,646,514,933,1175,432,10497,7289,1382,1067,5354,1722,576,413


In [7]:
# Frequency of each marital-status category, ordered alphabetically and shown as one wide row.
(
    clean.data["marital-status"]
    .value_counts()
    .sort_index()
    .to_frame()
    .T
)

marital-status,Divorced,Married-AF-spouse,Married-civ-spouse,Married-spouse-absent,Never-married,Separated,Widowed
count,4443,23,14973,418,10677,1025,993


In [8]:
# Frequency of each occupation category, ordered alphabetically and shown as one wide row.
(
    clean.data["occupation"]
    .value_counts()
    .sort_index()
    .to_frame()
    .T
)

occupation,Adm-clerical,Craft-repair,Exec-managerial,Farming-fishing,Handlers-cleaners,Machine-op-inspct,Other-service,Priv-house-serv,Prof-specialty,Protective-serv,Sales,Tech-support,Transport-moving,Unknown
count,3770,4099,4066,994,1370,2002,3295,149,4140,649,3650,928,1597,1843


In [9]:
# Frequency of each relationship category, ordered alphabetically and shown as one wide row.
(
    clean.data["relationship"]
    .value_counts()
    .sort_index()
    .to_frame()
    .T
)

relationship,Husband,Not-in-family,Other-relative,Own-child,Unmarried,Wife
count,13191,8301,979,5067,3446,1568


In [10]:
# Frequency of each race category, ordered alphabetically and shown as one wide row.
(
    clean.data["race"]
    .value_counts()
    .sort_index()
    .to_frame()
    .T
)

race,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White
count,310,1039,3123,271,27809


In [11]:
# Frequency of each sex category, ordered alphabetically and shown as one wide row.
(
    clean.data["sex"]
    .value_counts()
    .sort_index()
    .to_frame()
    .T
)

sex,Female,Male
count,10771,21781


In [12]:
# Frequency of each capital-gain amount, ordered by amount and shown as one wide row.
(
    clean.data["capital-gain"]
    .value_counts()
    .sort_index()
    .to_frame()
    .T
)

capital-gain,0,114,401,594,914,991,1055,1086,1111,1151,...,15831,18481,20051,22040,25124,25236,27828,34095,41310,99999
count,29840,6,2,34,8,5,25,4,1,8,...,6,2,37,1,4,11,34,5,2,159


In [13]:
# Frequency of each capital-loss amount, ordered by amount and shown as one wide row.
(
    clean.data["capital-loss"]
    .value_counts()
    .sort_index()
    .to_frame()
    .T
)

capital-loss,0,155,213,323,419,625,653,810,880,974,...,2547,2559,2603,2754,2824,3004,3683,3770,3900,4356
count,31034,1,4,3,3,12,3,2,6,2,...,4,12,5,2,10,2,2,2,2,3


In [14]:
# Frequency of each hours-per-week value, ordered by hours and shown as one wide row.
(
    clean.data["hours-per-week"]
    .value_counts()
    .sort_index()
    .to_frame()
    .T
)

hours-per-week,1,2,3,4,5,6,7,8,9,10,...,89,90,91,92,94,95,96,97,98,99
count,20,32,39,54,60,64,26,144,18,278,...,2,29,3,1,1,2,5,2,11,85


In [15]:
# Frequency of each native-country category, ordered alphabetically and shown as one wide row.
(
    clean.data["native-country"]
    .value_counts()
    .sort_index()
    .to_frame()
    .T
)

native-country,Cambodia,Canada,China,Columbia,Cuba,Dominican-Republic,Ecuador,El-Salvador,England,France,...,Puerto-Rico,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Unknown,Vietnam,Yugoslavia
count,19,121,75,59,95,70,28,106,90,29,...,114,12,80,51,18,19,29161,583,67,16


In [16]:
# Class balance of the encoded target (0 = '<=50K', 1 = '>50K'), shown as one wide row.
(
    clean.data["income"]
    .value_counts()
    .sort_index()
    .to_frame()
    .T
)

income,0,1
count,24712,7840
