In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer

In [2]:
# Fetch the UCI "Adult" census file and read every field as a string.
# Fields are separated by a comma followed by a space. With invalid_raise=False,
# a line with an inconsistent number of fields is skipped with a warning
# instead of aborting the load.
raw_data = np.genfromtxt(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    dtype=str,
    delimiter=', ',
    invalid_raise=False,
)

In [3]:
# Labels for the 15 fields of adult.data, listed in the order they are passed
# to the DataFrame constructor (source: https://archive.ics.uci.edu/ml/datasets/Adult).
column_names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'educational-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]

In [4]:
# Wrap the parsed string array in a DataFrame, labelling its columns with column_names.
adult_data = pd.DataFrame(data=raw_data, columns=column_names)

In [5]:
# Preview the first ten rows.
adult_data.head(n=10)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [6]:
# Show the dtype of each column.
adult_data.dtypes

age                object
workclass          object
fnlwgt             object
education          object
educational-num    object
marital-status     object
occupation         object
relationship       object
race               object
gender             object
capital-gain       object
capital-loss       object
hours-per-week     object
native-country     object
income             object
dtype: object

For details on how the transformations below are derived, see https://rpubs.com/H_Zhu/235617 and the DiCE helper module:
<br>
https://github.com/interpretml/DiCE/blob/master/dice_ml/utils/helpers.py

In [7]:
# Parse the age, educational-num and hours-per-week strings as 64-bit integers;
# columns not named here keep their current dtype.
adult_data = adult_data.astype(
    dict.fromkeys(['age', 'educational-num', 'hours-per-week'], np.int64)
)

In [8]:
# Show the column dtypes again to check the result of the integer cast.
adult_data.dtypes

age                 int64
workclass          object
fnlwgt             object
education          object
educational-num     int64
marital-status     object
occupation         object
relationship       object
race               object
gender             object
capital-gain       object
capital-loss       object
hours-per-week      int64
native-country     object
income             object
dtype: object

In [9]:
# Collapse the workclass labels into broader groups in a single pass.
# Labels that are not keys of the mapping are left unchanged.
adult_data = adult_data.replace({
    'workclass': {
        # public-sector employers
        'Federal-gov': 'Government',
        'State-gov': 'Government',
        'Local-gov': 'Government',
        # self-employment
        'Self-emp-not-inc': 'Self-Employed',
        'Self-emp-inc': 'Self-Employed',
        # no paid work, or value recorded as '?'
        'Never-worked': 'Other/Unknown',
        'Without-pay': 'Other/Unknown',
        '?': 'Other/Unknown',
    }
})

In [10]:
# Collapse the detailed occupation labels into broader groups.
# The mapping is written group -> members and flattened to label -> group;
# labels that appear in no group are left unchanged.
adult_data = adult_data.replace({
    'occupation': {
        label: group
        for group, labels in {
            'White-Collar': ('Adm-clerical', 'Exec-managerial'),
            'Blue-Collar': ('Craft-repair', 'Farming-fishing', 'Handlers-cleaners',
                            'Machine-op-inspct', 'Transport-moving'),
            'Service': ('Other-service', 'Priv-house-serv', 'Protective-serv', 'Tech-support'),
            'Professional': ('Prof-specialty',),
            'Other/Unknown': ('Unknown', 'Armed-Forces', '?'),
        }.items()
        for label in labels
    }
})

In [11]:
# Merge the three "Married-*" labels into 'Married' and relabel 'Never-married' as 'Single';
# any other marital-status label is left unchanged.
adult_data = adult_data.replace({
    'marital-status': {
        **dict.fromkeys(
            ['Married-civ-spouse', 'Married-AF-spouse', 'Married-spouse-absent'],
            'Married',
        ),
        'Never-married': 'Single',
    }
})

In [12]:
# Relabel 'Black', 'Asian-Pac-Islander' and 'Amer-Indian-Eskimo' as 'Other';
# all remaining race labels are left unchanged.
adult_data = adult_data.replace({
    'race': dict.fromkeys(['Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo'], 'Other')
})

In [13]:
# Keep only the nine columns listed below, in this order; every other column is dropped.
adult_data = adult_data.loc[:, [
    'age',
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'race',
    'gender',
    'hours-per-week',
    'income',
]]

In [14]:
# Encode the target as an integer: '<=50K' -> 0 and '>50K' -> 1.
# The explicit cast pins the column to int64. Without it the resulting dtype
# depends on replace() silently downcasting its object-dtype result, which
# pandas 2.2 deprecates (FutureWarning), so newer releases can leave the
# column as object.
# The cast also fails loudly if any label other than the two above is still
# present. The cell stays safe to re-run: replace() finds nothing to change
# on an already-encoded column and the cast is then a no-op.
adult_data = (
    adult_data
    .replace({'income': {'<=50K': 0, '>50K': 1}})
    .astype({'income': np.int64})
)

In [15]:
# Merge the two associate-degree labels into 'Assoc' and every level from
# 'Preschool' through '12th' into 'School'; other education labels are left unchanged.
adult_data = adult_data.replace({
    'education': {
        **dict.fromkeys(['Assoc-voc', 'Assoc-acdm'], 'Assoc'),
        **dict.fromkeys(
            ['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th'],
            'School',
        ),
    }
})

In [16]:
# Use underscores instead of hyphens in these two column names.
adult_data = adult_data.rename(
    columns={
        'marital-status': 'marital_status',
        'hours-per-week': 'hours_per_week',
    }
)

In [17]:
# Preview the first five rows of the transformed frame.
adult_data.head(n=5)

Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,39,Government,Bachelors,Single,White-Collar,White,Male,40,0
1,50,Self-Employed,Bachelors,Married,White-Collar,White,Male,13,0
2,38,Private,HS-grad,Divorced,Blue-Collar,White,Male,40,0
3,53,Private,School,Married,Blue-Collar,Other,Male,40,0
4,28,Private,Bachelors,Married,Professional,Other,Female,40,0


In [18]:
# Total number of rows in the frame.
len(adult_data.index)

32561

In [24]:
# Boolean mask over the rows: True where a row exactly repeats an earlier row
# (the first occurrence of each distinct row stays False).
bool_series = adult_data.duplicated(keep='first')
bool_series

0        False
1        False
2        False
3        False
4        False
         ...  
32556    False
32557     True
32558    False
32559     True
32560    False
Length: 32561, dtype: bool

In [21]:
# Look at the row at position 32557, one of the rows flagged True by duplicated().
adult_data.iloc[32557]

age                        40
workclass             Private
education             HS-grad
marital_status        Married
occupation        Blue-Collar
race                    White
gender                   Male
hours_per_week             40
income                      1
Name: 32557, dtype: object

In [22]:
# Look at the row at position 32559, another row flagged True by duplicated().
adult_data.iloc[32559]

age                         22
workclass              Private
education              HS-grad
marital_status          Single
occupation        White-Collar
race                     White
gender                    Male
hours_per_week              20
income                       0
Name: 32559, dtype: object

In [19]:
# Collect the rows that exactly repeat an earlier row, then count them.
adult_data_duplicate = adult_data.loc[adult_data.duplicated()]
len(adult_data_duplicate.index)

9057

In [20]:
# Display the repeated rows (they keep their index labels from adult_data).
adult_data_duplicate

Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
170,27,Private,HS-grad,Single,Service,White,Male,40,0
352,33,Private,Bachelors,Married,White-Collar,White,Male,40,1
391,27,Private,Bachelors,Single,Blue-Collar,White,Male,50,0
455,40,Private,Assoc,Married,Blue-Collar,Other,Male,40,1
463,50,Private,School,Married,Blue-Collar,White,Male,40,0
...,...,...,...,...,...,...,...,...,...
32550,43,Self-Employed,Some-college,Married,Blue-Collar,White,Male,50,0
32551,32,Private,School,Married,Blue-Collar,Other,Male,40,0
32555,22,Private,Some-college,Single,Service,White,Male,40,0
32557,40,Private,HS-grad,Married,Blue-Collar,White,Male,40,1


In [19]:
# Save the transformed data, duplicates included, without writing the index column.
adult_data.to_csv(path_or_buf='../datasets/adult_income.csv', index=False)

In [22]:
# Keep only the first occurrence of each distinct row. Despite its name,
# adult_data_dup holds the de-duplicated rows; the index is not reset.
# NOTE(review): rows are compared on the remaining columns only, so rows that
# match here may have differed in columns dropped earlier - confirm that
# removing them is intended.
adult_data_dup = adult_data.drop_duplicates(keep='first')

In [23]:
# Number of rows left after dropping repeated rows.
len(adult_data_dup.index)

23504

In [24]:
# Save the de-duplicated data without writing the index column.
adult_data_dup.to_csv(path_or_buf='../datasets/adult_income_no_duplicate.csv', index=False)