In [11]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import math

In [12]:
CURRENT_FOLDER = os.path.dirname(os.path.dirname(os.path.abspath(__name__)))
RAW_DATA_FOLDER = os.path.join(CURRENT_FOLDER,  "data", "raw")
DATASETS_FOLDER = os.path.join(CURRENT_FOLDER, "data", "datasets")

adult_dataset_path = os.path.join(RAW_DATA_FOLDER, "adult.data")
adult_test_dataset_path = os.path.join(RAW_DATA_FOLDER, "adult.test")

In [13]:
columns = ['age','workclass','fnlwgt','education','education-num','marital-status',
        'occupation','relationship','race','sex','capital-gain', 'capital-loss',
        'hours-per-week', 'native-country','compensation']

adult_data = pd.read_csv(adult_dataset_path, names=columns, sep=", ")
adult_test_data = pd.read_csv(adult_test_dataset_path, names=columns, sep=", ")

  adult_data = pd.read_csv(adult_dataset_path, names=columns, sep=", ")
  adult_test_data = pd.read_csv(adult_test_dataset_path, names=columns, sep=", ")


In [4]:
def clean_adult(data):
    
    # Transform "?" value to NaN value and drop NaN values
    data = data.replace('?', np.nan)
    data.dropna(inplace=True)
    
    # Clean some values
    data['compensation'] = data['compensation'].replace({
        '<=50K.' : '<=50K', 
        '>50K.' : '>50K',
    })
    
    data['compensation'] = data['compensation'].replace({
        '<=50K' : 0, 
        '>50K' : 1,
    })

    # Education-column is already categorized in education-num column
    columns_to_drop = ['education', 'fnlwgt']
    data.drop(columns=columns_to_drop, inplace=True)
        
    # Transform string columns to categorical columns and combine some values
    data['age'] = data['age'].astype(int)
    data['workclass'] = pd.Categorical(data['workclass'])
    data['education-num'] = pd.Categorical(data['education-num'])
    data['marital-status'] = pd.Categorical(data['marital-status'])
    data['occupation'] = pd.Categorical(data['occupation'])
    data['relationship'] = pd.Categorical(data['relationship'])
    data['race'] = pd.Categorical(data['race'])
    data['sex'] = pd.Categorical(data['sex'])
    data['capital-gain'] = data['capital-gain'].astype(int)
    data['capital-loss'] = data['capital-loss'].astype(int)
    data['hours-per-week'] = data['hours-per-week'].astype(int)
    data['native-country'] = pd.Categorical(data['native-country'])
    data['compensation'] = pd.Categorical(data['compensation'])
    
    # Create few categorical features
    data['had-capital-gains'] = (data['capital-gain'] > 0).astype(int)
    data['had-capital-losses'] = (data['capital-loss'] > 0).astype(int)

    # Optionally drop capital-gains column
    # columns_to_drop = ['capital-gain', 'capital-loss']
    # data.drop(columns=columns_to_drop, inplace=True)
    
    # Reset index
    data.reset_index(drop=True, inplace=True)
    
    # Move "compensation" column to last
        data = data.reindex(columns=[col for col in data.columns if col != 'compensation'] + ['compensation'])

    return data

In [6]:
cleaned_adult_train = clean_adult(adult_data)
cleaned_adult_test = clean_adult(adult_test_data)

In [8]:
cleaned_adult_train

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,had-capital-gains,had-capital-losses,compensation
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,1,0,0
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0,0,0
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0,0,0
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0,0,0
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30157,27,Private,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,0,0,0
30158,40,Private,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,0,0,1
30159,58,Private,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0,0,0
30160,22,Private,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,0,0,0


In [9]:
cleaned_adult_test

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,had-capital-gains,had-capital-losses,compensation
0,25,Private,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,0,0,0
1,38,Private,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,0,0,0
2,28,Local-gov,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,0,0,1
3,44,Private,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,1,0,1
4,34,Private,6.0,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15055,33,Private,13.0,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States,0,0,0
15056,39,Private,13.0,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,0,0,0
15057,38,Private,13.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,0,0,0
15058,44,Private,13.0,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,1,0,0


In [10]:
adult_train_path = os.path.join(DATASETS_FOLDER, "cleaned_adult_train_data.csv")
adult_test_path = os.path.join(DATASETS_FOLDER, "cleaned_adult_test_data.csv")

cleaned_adult_train.to_csv(adult_train_path, index=False)
cleaned_adult_test.to_csv(adult_test_path, index=False)