In [10]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import math

In [11]:
CURRENT_FOLDER = os.path.dirname(os.path.dirname(os.path.abspath(__name__)))
RAW_DATA_FOLDER = os.path.join(CURRENT_FOLDER,  "data", "raw")
DATASETS_FOLDER = os.path.join(CURRENT_FOLDER, "data", "datasets")

adult_dataset_path = os.path.join(RAW_DATA_FOLDER, "adult.data")
adult_test_dataset_path = os.path.join(RAW_DATA_FOLDER, "adult.test")

In [12]:
columns = ['age','workclass','fnlwgt','education','education-num','marital-status',
        'occupation','relationship','race','sex','capital-gain', 'capital-loss',
        'hours-per-week', 'native-country','compensation']

adult_data = pd.read_csv(adult_dataset_path, names=columns, sep=", ")
adult_test_data = pd.read_csv(adult_test_dataset_path, names=columns, sep=", ")

  adult_data = pd.read_csv(adult_dataset_path, names=columns, sep=", ")
  adult_test_data = pd.read_csv(adult_test_dataset_path, names=columns, sep=", ")


In [13]:
def clean_adult(data):
    
    # Transform "?" value to NaN value and drop NaN values
    data = data.replace('?', np.nan)
    data.dropna(inplace=True)
    
    # Clean some values
    data['compensation'] = data['compensation'].replace({
        '<=50K.' : '<=50K', 
        '>50K.' : '>50K',
    })

    # Education-column is already categorized in education-num column
    columns_to_drop = ['education', 'fnlwgt']
    data.drop(columns=columns_to_drop, inplace=True)
        
    # Transform string columns to categorical columns and combine some values
    data['age'] = data['age'].astype(int)
    data['workclass'] = pd.Categorical(data['workclass'])
    data['education-num'] = pd.Categorical(data['education-num'])
    data['marital-status'] = pd.Categorical(data['marital-status'])
    data['occupation'] = pd.Categorical(data['occupation'])
    data['relationship'] = pd.Categorical(data['relationship'])
    data['race'] = pd.Categorical(data['race'])
    data['sex'] = pd.Categorical(data['sex'])
    data['capital-gain'] = data['capital-gain'].astype(int)
    data['capital-loss'] = data['capital-loss'].astype(int)
    data['hours-per-week'] = data['hours-per-week'].astype(int)
    data['native-country'] = pd.Categorical(data['native-country'])
    data['compensation'] = pd.Categorical(data['compensation'])
    
    # Create few categorical features
    data['had-capital-gains'] = (data['capital-gain'] > 0).astype(int)
    data['had-capital-losses'] = (data['capital-loss'] > 0).astype(int)

    # Optionally drop capital-gains column
    # columns_to_drop = ['capital-gain', 'capital-loss']
    # data.drop(columns=columns_to_drop, inplace=True)
    
    # Reset index
    data.reset_index(drop=True, inplace=True)
    
    # Move "compensation" column to last
    data = data.reindex(columns=[col for col in data.columns if col != 'compensation'] + ['compensation'])

    return data

In [14]:
cleaned_adult_train = clean_adult(adult_data)
cleaned_adult_test = clean_adult(adult_test_data)

In [15]:
adult_train_path = os.path.join(DATASETS_FOLDER, "cleaned_adult_train_data.csv")
adult_test_path = os.path.join(DATASETS_FOLDER, "cleaned_adult_test_data.csv")

cleaned_adult_train.to_csv(adult_train_path, index=False)
cleaned_adult_test.to_csv(adult_test_path, index=False)