In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Preprocess Adult

Predict whether income exceeds $50K/yr based on census data.

The data comes from the [UCI Machine Learning Repository "Adult" dataset](https://archive.ics.uci.edu/ml/datasets/adult).

Listing of attributes:


- age: continuous.
- workclass: Private, Federal-gov, Without-pay, Never-worked...
- fnlwgt: "The continuous variable fnlwgt represents final weight, which is the number of units in the target population that the responding unit represents."
- education: Bachelors, Some-college, 11th, HS-grad...
- education-num: continuous.
- marital-status: Married-civ-spouse, Divorced, Never-married...
- occupation: Tech-support, Craft-repair, Other-service, Sales...
- relationship: Wife, Own-child, Husband, Not-in-family...
- race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
- sex: Female, Male.
- capital-gain: continuous.
- capital-loss: continuous.
- hours-per-week: continuous.
- native-country: United-States, Cambodia, England...

In [55]:
def entropy(p1, p2):
    """Shannon entropy, in bits, of a two-outcome distribution.

    Computes ``-(p1 * log2(p1) + p2 * log2(p2))``. A probability of exactly
    zero contributes nothing to the sum (the usual ``0 * log2(0) = 0``
    convention), so a certain outcome such as ``entropy(0, 1)`` yields 0.0
    instead of NaN.

    Parameters
    ----------
    p1, p2 : float or array-like
        Probabilities of the two outcomes. They are not validated: the
        function does not check that they lie in [0, 1] or sum to 1.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Entropy in bits (elementwise when arrays are passed).
    """
    def _p_log2_p(p):
        # One term of the sum, with the zero-probability case made safe.
        p = np.asarray(p, dtype=float)
        # Swap exact zeros for 1 before taking the log: log2(1) == 0, so the
        # term becomes 0 * 0 == 0 rather than 0 * -inf == nan.
        return p * np.log2(np.where(p == 0, 1.0, p))

    # Subtracting from 0.0 (instead of negating) keeps the zero-entropy case
    # at +0.0 rather than -0.0; every other value is unchanged.
    return 0.0 - (_p_log2_p(p1) + _p_log2_p(p2))

# Entropy falls as the split moves away from 50/50.
(
    entropy(0.1, 0.9),
    entropy(0.3, 0.7),
    entropy(0.99, 0.01),
)

(0.4689955935892812, 0.8812908992306927, 0.08079313589591118)

## Final preprocess function

In [27]:
def clean_income(df):
    """Name, prune and downcast the raw Adult census table.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame whose columns carry the integer labels 0-14, e.g. as read
        with ``pd.read_csv(..., header=None)``.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``final_weight`` and ``education_num``, with the
        nine listed columns converted to ``category`` and ``age`` /
        ``hours_per_week`` converted to ``uint8``.
    """
    # Positional column names, in file order.
    ordered_names = (
        "age", "workclass", "final_weight", "education", "education_num",
        "marital_status", "occupation", "relationship", "race", "sex",
        "capital_gain", "capital_loss", "hours_per_week", "native_country",
        "income_class",
    )
    # Columns to store as pandas categoricals.
    categorical = (
        'workclass', 'education', 'marital_status', 'occupation',
        'relationship', 'race', 'sex', 'native_country', 'income_class',
    )

    # Map integer label -> name; labels not present in df are simply ignored.
    labelled = df.rename(columns=dict(enumerate(ordered_names)))
    trimmed = labelled.drop(columns=['final_weight', 'education_num'])
    as_categories = trimmed.astype(dict.fromkeys(categorical, 'category'))
    return as_categories.astype({'age': 'uint8', 'hours_per_week': 'uint8'})

In [31]:
# Read the raw file (no header row) and run it through the cleaning function.
income = pd.read_csv("adult.data", header=None)
cleaned_income = clean_income(income)

# Total bytes used by the raw frame vs. the cleaned frame.
tuple(
    frame.memory_usage(deep=True).sum()
    for frame in (income, cleaned_income)
)

(21141576, 889603)

## Explore

In [17]:
# Re-read the raw file; with header=None the columns get integer labels.
income = pd.read_csv("adult.data", header=None)

# Names for the positional columns, in file order.
col_names = [
    "age", "workclass", "final_weight", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "income_class",
]

In [19]:
# Column dtypes after naming the columns and dropping final_weight / education_num.
(
    income
    .rename(columns=dict(enumerate(col_names)))
    .drop(['final_weight', 'education_num'], axis='columns')
    .dtypes
)

age                int64
workclass         object
education         object
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income_class      object
dtype: object

In [24]:
# Keep only the object-typed columns and convert each one to a categorical.
(
    income
    .rename(columns=dict(enumerate(col_names)))
    .drop(['final_weight', 'education_num'], axis='columns')
    .select_dtypes('object')
    .astype(dict.fromkeys(
        ['workclass', 'education', 'marital_status', 'occupation',
         'relationship', 'race', 'sex', 'native_country', 'income_class'],
        'category',
    ))
)

Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,sex,native_country,income_class
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...
32556,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,United-States,<=50K
32557,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States,>50K
32558,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,United-States,<=50K
32559,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,United-States,<=50K


In [25]:
# Summary statistics for the integer columns; the min/max rows show which
# columns fit in a smaller integer type.
(
    income
    .rename(columns=dict(enumerate(col_names)))
    .drop(['final_weight', 'education_num'], axis='columns')
    .select_dtypes('integer')
    .describe()
)

Unnamed: 0,age,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0
mean,38.581647,1077.648844,87.30383,40.437456
std,13.640433,7385.292085,402.960219,12.347429
min,17.0,0.0,0.0,1.0
25%,28.0,0.0,0.0,40.0
50%,37.0,0.0,0.0,40.0
75%,48.0,0.0,0.0,45.0
max,90.0,99999.0,4356.0,99.0


In [26]:
# Cast age and hours_per_week to uint8 and display the resulting frame.
(
    income
    .rename(columns=dict(enumerate(col_names)))
    .drop(['final_weight', 'education_num'], axis='columns')
    .astype({'age': 'uint8', 'hours_per_week': 'uint8'})
)

Unnamed: 0,age,workclass,education,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income_class
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
