In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedShuffleSplit
import time

# Read the raw census extract; `census` is a separate copy so later edits to it
# leave `census_original` exactly as loaded.
census_original = pd.read_csv("data/census-income.csv")
census = census_original.copy()
#census.info()

In [2]:
def feature_selection(data):
    """Return ``data`` without the columns that are excluded from modelling.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in ``features_to_remove``.

    Returns
    -------
    pandas.DataFrame
        A new frame with those columns removed. ``data`` itself is left
        untouched, so the function can be called again on the same input.

    Raises
    ------
    KeyError
        If any of the listed columns is absent from ``data``.
    """
    features_to_remove = ['industry code','occupation code','race','hispanic origin','sex',
                          'region of previous residence','state of previous residence',
                          'detailed household summary in household','migration code-change in msa',
                          'migration code-change in reg','migration code-move within reg',
                          'migration prev res in sunbelt','family members under 18','citizenship',
                          'fill inc questionnaire for veterans admin']
    # Deliberately not in-place: dropping on the caller's object made a second
    # call with the same frame fail with KeyError (columns already removed).
    return data.drop(features_to_remove, axis=1)

In [3]:
def fill_in_missing_values(data):
    """Replace missing entries in selected categorical columns with a placeholder.

    The five columns listed below are overwritten on ``data`` itself; every
    missing value in them becomes the string "Not In Universe". Other columns
    are not touched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the five columns listed in the body.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after modification.
    """
    placeholder = "Not In Universe"
    columns_with_gaps = (
        'class of worker',
        'enroll in edu inst last wk',
        'major occupation code',
        'member of a labor union',
        'reason for unemployment',
    )
    for column_name in columns_with_gaps:
        filled = data[column_name].fillna(placeholder)
        data[column_name] = filled
    return data

In [4]:
def reduce_education(education_data):
    """Collapse detailed education labels into six coarse groups.

    Each item is converted with ``str(...).lower()`` and looked up in a fixed
    table; anything not in the table (including missing values) becomes
    "Other".

    Parameters
    ----------
    education_data : iterable
        Raw education labels.

    Returns
    -------
    list of str
        One group label per input item, in input order.
    """
    college = "College/ Associates Degree"
    postgraduate = "Post-graduates Degree"
    group_by_label = {
        "high school graduate": "High School Graduate",
        "some college but no degree": college,
        "associates degree-occup /vocational": college,
        "associates degree-academic program": college,
        "bachelors degree(ba ab bs)": "Bachelors Degree",
        "masters degree(ma ms meng med msw mba)": postgraduate,
        "prof school degree (md dds dvm llb jd)": postgraduate,
        "doctorate degree(phd edd)": postgraduate,
        "children": "Children",
    }
    return [group_by_label.get(str(label).lower(), "Other") for label in education_data]

def reduce_marital_status(marital_status_data):
    """Collapse marital-status labels into three groups.

    Each item is converted with ``str(...).lower()``. The "married ..." labels
    listed below map to "Married - together", "never married" maps to
    "Never married", and every other value (including missing values, which
    stringify to "nan"/"none") maps to "Married - not together".

    Parameters
    ----------
    marital_status_data : iterable
        Raw marital-status labels.

    Returns
    -------
    list of str
        One group label per input item, in input order.
    """
    married_together = {
        "married-civilian spouse present",
        # The original literal had a stray space before the hyphen. It is kept
        # for backward compatibility, and the unspaced spelling is accepted too
        # so armed-forces spouses are not silently filed under "not together".
        # NOTE(review): the published Census-Income (KDD) attribute list spells
        # this value "Married-A F spouse present" -- confirm against the CSV.
        "married -a f spouse present",
        "married-a f spouse present",
        # NOTE(review): "spouse absent" is grouped with "together" exactly as
        # in the original code -- confirm this is the intended grouping.
        "married-spouse absent",
    }
    marital_status = []
    for item in marital_status_data:
        class_ = str(item).lower()
        if class_ in married_together:
            marital_status.append("Married - together")
        elif class_ == "never married":
            marital_status.append("Never married")
        else:
            marital_status.append("Married - not together")
    return marital_status

def reduce_employment_stat(employment_stat_data):
    """Collapse employment-status labels into four groups.

    Each item is converted with ``str(...).lower()``. Three labels are kept as
    their own group; every other value (including missing values) becomes
    "Part-time/ Unemployed".

    Parameters
    ----------
    employment_stat_data : iterable
        Raw employment-status labels.

    Returns
    -------
    list of str
        One group label per input item, in input order.
    """
    kept_groups = {
        "children or armed forces": "Children or Armed Forces",
        "full-time schedules": "Full-time schedules",
        "not in labor force": "Not in labor force",
    }
    fallback = "Part-time/ Unemployed"
    return [kept_groups.get(str(raw).lower(), fallback) for raw in employment_stat_data]

def reduce_tax_filer_status(tax_filer_data):
    """Collapse tax-filer labels into five groups.

    Each item is converted with ``str(...).lower()``. Four labels are matched
    exactly; every other value (including missing values) becomes
    "Joint - over 65".

    Parameters
    ----------
    tax_filer_data : iterable
        Raw tax-filer status labels.

    Returns
    -------
    list of str
        One group label per input item, in input order.
    """
    named_groups = {
        "joint both under 65": "Joint - under 65",
        "single": "Single",
        "nonfiler": "Nonfiler",
        "head of household": "Head of household",
    }

    def to_group(raw):
        # Anything outside the four named labels falls into the last bucket.
        return named_groups.get(str(raw).lower(), "Joint - over 65")

    return list(map(to_group, tax_filer_data))

def reduce_household_stat(household_data):
    """Collapse detailed household/family labels into six groups.

    Each item is converted with ``str(...).lower()`` and then matched first
    against a table of exact labels and then against a table of label
    prefixes. Anything that matches neither (including missing values)
    becomes "Other".

    Parameters
    ----------
    household_data : iterable
        Raw household/family status labels.

    Returns
    -------
    list of str
        One group label per input item, in input order.
    """
    exact_groups = {
        "householder": "Householder",
        "spouse of householder": "Spouse of Householder",
        "nonfamily householder": "Nonfamily Householder",
        "secondary individual": "Other",
        "child under 18 of rp of unrel subfamily": "Child <18",
    }
    # Grandchildren and under-18 "other relatives" are folded into the child groups.
    prefix_groups = (
        ("child <18", "Child <18"),
        ("child 18+", "Child 18+"),
        ("grandchild <18", "Child <18"),
        ("grandchild 18+", "Child 18+"),
        ("other rel <18", "Child <18"),
    )

    def to_group(raw):
        label = str(raw).lower()
        if label in exact_groups:
            return exact_groups[label]
        for prefix, group in prefix_groups:
            if label.startswith(prefix):
                return group
        return "Other"

    return [to_group(entry) for entry in household_data]

# Final encapsulating function:
def reduce_cardinality_of_categorical_columns(census_data):
    """Apply each ``reduce_*`` helper to its column and write the result back.

    The five columns named below are overwritten on ``census_data`` itself
    with the coarser labels produced by the matching helper.

    Parameters
    ----------
    census_data : pandas.DataFrame
        Frame containing the five columns named in the body.

    Returns
    -------
    pandas.DataFrame
        The same ``census_data`` object, after modification.
    """
    reducers_by_column = (
        ('education', reduce_education),
        ('marital status', reduce_marital_status),
        ('full or part time employment stat', reduce_employment_stat),
        ('tax filer status', reduce_tax_filer_status),
        ('detailed household and family stat', reduce_household_stat),
    )
    for column_name, reducer in reducers_by_column:
        census_data[column_name] = reducer(census_data[column_name])
    return census_data

In [5]:
def combine_parents_self_birth_country(census_data):
    """Merge the three country-of-birth columns into one three-level feature.

    The father, mother and self values are compared after
    ``str(...).lower()``:

    * "All US": all three are 'united-states'
    * "Self US, Parents Other": self is 'united-states' but the three values
      are not all equal
    * "Self Other": self is anything else (including a missing value)

    The result is stored on ``census_data`` as
    'Self and Parents birth country' and the three source columns are dropped
    in place.

    Parameters
    ----------
    census_data : pandas.DataFrame
        Frame containing the three country-of-birth columns.

    Returns
    -------
    pandas.DataFrame
        The same ``census_data`` object, after modification.
    """
    source_columns = ['country of birth father','country of birth mother','country of birth self']
    fathers, mothers, selves = (census_data[name].tolist() for name in source_columns)

    def to_group(father_raw, mother_raw, self_raw):
        father, mother, own = (str(value).lower() for value in (father_raw, mother_raw, self_raw))
        if own != 'united-states':
            return "Self Other"
        if father == mother == own:
            return "All US"
        return "Self US, Parents Other"

    census_data['Self and Parents birth country'] = [
        to_group(father_raw, mother_raw, self_raw)
        for father_raw, mother_raw, self_raw in zip(fathers, mothers, selves)
    ]
    census_data.drop(source_columns, inplace=True, axis=1)
    return census_data

In [6]:
def rename_and_reorder_columns(census_data):
    """Give the columns their final names and move 'income' to the last position.

    Renaming is positional: the i-th column of ``census_data`` receives the
    i-th name in ``renamed_columns``, so the input must already have its 25
    columns in the matching order. The last two columns are then swapped so
    that 'income' comes last.

    Parameters
    ----------
    census_data : pandas.DataFrame
        Frame with exactly ``len(renamed_columns)`` columns.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame with the new labels and order. The input
        frame is left unchanged.

    Raises
    ------
    ValueError
        If the number of columns does not match the number of new names.
    """
    renamed_columns = ['age','worker class','education','wage per hour','enrolled/ in education',
                          'marital status','major industry code','major occupation code','labor union member',
                          'unemployment reason','employment status','capital gains','capital losses',
                          'stock dividends','tax filer status','household status','instance weight',
                          'lived here 1y ago','num worked for employer','own business or self-employed',
                          'veterans benefits','weeks worked in year','year','income','parents-self birth countries']
    if census_data.shape[1] != len(renamed_columns):
        raise ValueError(
            "Length mismatch: expected %d columns to rename, got %d"
            % (len(renamed_columns), census_data.shape[1])
        )
    reordered_columns = renamed_columns[:-2] + [renamed_columns[-1],renamed_columns[-2]]
    # Relabel a copy so the caller's frame keeps its original column names.
    renamed = census_data.copy()
    renamed.columns = renamed_columns
    # The explicit copy detaches the column selection from `renamed`, so later
    # column assignments on the result do not raise SettingWithCopyWarning.
    return renamed[reordered_columns].copy()

In [7]:
def convert_int_category_to_object(column_data):
    """Translate integer category codes into text labels.

    Each item is passed through ``int(...)``; 0 becomes "Not In Universe",
    1 becomes "Yes" and every other integer becomes "No".

    Parameters
    ----------
    column_data : iterable
        Values accepted by ``int(...)``.

    Returns
    -------
    list of str
        One label per input item, in input order.
    """
    label_by_code = {0: "Not In Universe", 1: "Yes"}
    return [label_by_code.get(int(code), "No") for code in column_data]

def convert_all_int_categories_to_objects(census_data):
    """Turn the integer-coded categorical columns into string columns.

    'own business or self-employed' and 'veterans benefits' are replaced with
    the labels produced by ``convert_int_category_to_object``; 'year' is
    replaced with ``str(...)`` of each value. All three are overwritten on
    ``census_data`` itself.

    Parameters
    ----------
    census_data : pandas.DataFrame
        Frame containing the three columns named above.

    Returns
    -------
    pandas.DataFrame
        The same ``census_data`` object, after modification.
    """
    for coded_column in ('own business or self-employed', 'veterans benefits'):
        census_data[coded_column] = convert_int_category_to_object(census_data[coded_column])
    census_data['year'] = list(map(str, census_data['year']))
    return census_data

In [8]:
def apply_all_one_hot_encoding(census_data):
    """One-hot encode every categorical column of the prepared census frame.

    Columns that are not listed below are passed through unchanged by
    ``pd.get_dummies``.

    Parameters
    ----------
    census_data : pandas.DataFrame
        Frame containing all sixteen categorical columns listed in the body.

    Returns
    -------
    pandas.DataFrame
        A new frame in which each listed column is replaced by
        ``<column>_<value>`` indicator columns holding 0/1 as ``uint8``.

    Raises
    ------
    KeyError
        If any of the listed columns is absent from ``census_data``.
    """
    one_hot_columns = ['worker class','education','enrolled/ in education','marital status',
                       'labor union member','unemployment reason','employment status','tax filer status',
                       'household status','lived here 1y ago','own business or self-employed',
                       'veterans benefits','year','parents-self birth countries']
    # Kept as a separate list as in the original; these columns are currently
    # one-hot encoded exactly like the ones above (no hashing is applied).
    high_cardinality_columns = ['major industry code','major occupation code']
    # dtype is pinned so the indicators are 0/1 integers on every pandas
    # version; pandas >= 2.0 would otherwise emit bool columns.
    one_hot_census = pd.get_dummies(
        census_data,
        columns=one_hot_columns + high_cardinality_columns,
        dtype="uint8",
    )
    return one_hot_census

In [13]:
def feature_engineer(census_data):
    """Run the full feature-engineering pipeline on a raw census frame.

    Steps, in order: drop unused columns, fill missing categorical values,
    coarsen categorical labels, merge the birth-country columns, rename and
    reorder columns, convert integer-coded categories to strings, and one-hot
    encode the categorical columns.

    Parameters
    ----------
    census_data : pandas.DataFrame
        Raw census frame with the columns the individual steps expect.

    Returns
    -------
    pandas.DataFrame
        The model-ready, one-hot encoded frame.

    Notes
    -----
    The pipeline works on a copy of ``census_data``. Several steps modify
    their argument in place (for example by dropping columns), which
    previously altered the caller's frame and made a second call on the same
    object fail because the dropped columns no longer existed.
    """
    working = census_data.copy()
    selected = feature_selection(working)
    filled = fill_in_missing_values(selected)
    reduced = reduce_cardinality_of_categorical_columns(filled)
    combined = combine_parents_self_birth_country(reduced)
    renamed = rename_and_reorder_columns(combined)
    converted = convert_all_int_categories_to_objects(renamed)
    encoded = apply_all_one_hot_encoding(converted)
    return encoded

High School Graduate          9589
College/ Associates Degree    8629
Bachelors Degree              6068
Other                         5017
Post-graduates Degree         4339
Children                      3740
Name: education, dtype: int64


Unnamed: 0,age,wage per hour,capital gains,capital losses,stock dividends,instance weight,num worked for employer,weeks worked in year,income,worker class_Federal government,...,major occupation code_Machine operators assmblrs & inspctrs,major occupation code_Not In Universe,major occupation code_Other service,major occupation code_Precision production craft & repair,major occupation code_Private household services,major occupation code_Professional specialty,major occupation code_Protective services,major occupation code_Sales,major occupation code_Technicians and related support,major occupation code_Transportation and material moving
0,48,1200,0,0,0,162.61,1,52,0,0,...,0,0,0,0,0,1,0,0,0,0
1,47,876,0,0,0,1661.53,5,52,0,0,...,0,0,0,0,0,0,0,0,0,0
2,56,500,0,0,0,1500.08,2,32,0,0,...,0,0,0,0,0,0,0,0,0,0
3,66,400,0,0,0,1212.48,2,52,0,0,...,1,0,0,0,0,0,0,0,0,0
4,42,0,0,0,0,949.12,2,52,1,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
#final_data.to_csv("data/census_data_ready_for_models.csv",index=False)

KeyError: 'education'