# CS395 - Final Project
### Classifying Income From Census Data

Date: April 14th, 2019
By: Joshua Swick and Lauren Simms

In [2]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import feature_column
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

## Load and prepare dataset

In [11]:
# Path to the raw census CSV, relative to the notebook's working directory.
census_data_file = "data/census_data.csv"

# Column names in file order. They are passed to read_csv via `names=`, so
# the first line of the file is read as a data row rather than as a header.
headers = [
    'age', 'workclass', 'fnlwgt',
    'education', 'education-num',
    'marital-status', 'occupation', 'relationship',
    'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week',
    'native-country',
    'income',  # classification target
]

dataset = pd.read_csv(filepath_or_buffer=census_data_file, names=headers)

In [12]:
# Sanity-check the load: (rows, columns). A bare last expression is rendered
# by the notebook itself, matching how the other inspection cells display
# their results, so no print() is needed.
dataset.shape

(32561, 15)


In [13]:
# Preview the first rows. Leaving the DataFrame as the cell's last expression
# uses the notebook's table rendering instead of the line-wrapped plain-text
# dump that print() produces for a wide frame.
dataset.head()

   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-gain  capital-loss  hours-per-week  native-country  income  
0          2174             0              40   United-States   <=50

### Convert target column to integer

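We are trying to classify individuals into incomes above or below $50k. The value provided in the income category is a string (`' <=50K'` or `' >50K'`), so we convert it to an integer label: 0 for `<=50K` and 1 for `>50K`.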

In [14]:
# List the distinct raw labels in the target column before re-encoding them
# (the recorded output shows each label carries a leading space).
dataset['income'].unique()

array([' <=50K', ' >50K'], dtype=object)

In [15]:
# Map the raw income strings onto integer class labels: 0 for at most 50K,
# 1 for more than 50K. The keys keep the leading space that the values carry
# in the loaded data.
income_to_label = {' <=50K': 0, ' >50K': 1}

# Series.map leaves NaN wherever a value has no entry in the mapping, which
# lets unexpected labels be detected instead of being silently skipped.
income_labels = dataset['income'].map(income_to_label)
unmapped_values = dataset['income'][income_labels.isna()].unique()
if len(unmapped_values) > 0:
    raise ValueError(
        "Unexpected values in 'income' column: {!r}".format(list(unmapped_values))
    )

# No NaN remains at this point, so the labels can be stored as plain integers.
dataset['income'] = income_labels.astype(int)

### Normalize numeric features

In [16]:
columns_to_normalize = [
    'age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week',
]


def _min_max_scale(column):
    """Linearly rescale one column so its minimum maps to 0 and its maximum to 1."""
    lowest = column.min()
    highest = column.max()
    return (column - lowest) / (highest - lowest)


# Rescale each selected column independently and write the result back in place.
dataset[columns_to_normalize] = dataset[columns_to_normalize].apply(_min_max_scale)

In [17]:
# Numeric feature columns for the min-max-scaled inputs. A feature column's
# key is the name it is looked up under in the input features, so it has to
# equal the DataFrame column name. The dataset columns are hyphenated
# ('education-num', not 'education_num'), so the keys use hyphens as well.
age_feature = tf.feature_column.numeric_column("age")
education_num_feature = tf.feature_column.numeric_column("education-num")
capital_gain_feature = tf.feature_column.numeric_column("capital-gain")
capital_loss_feature = tf.feature_column.numeric_column("capital-loss")
hours_worked_feature = tf.feature_column.numeric_column("hours-per-week")

### Encode categorical features

In [82]:
columns_to_encode = [
    'education',
    'workclass',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country'
]


def encode_in_order_of_appearance(values):
    """Integer-encode categorical values, numbering categories as first seen.

    Args:
        values: pandas Series of category values.

    Returns:
        A ``(codes, mapping)`` tuple. ``codes`` is a Series sharing the index
        of ``values`` and holding the integer code of every entry. ``mapping``
        is a dict from category value to its code: 0 for the first distinct
        value encountered, 1 for the next, and so on.

    Note:
        NaN entries, if any, receive pandas' sentinel code -1 and do not
        appear in ``mapping``.
    """
    codes, categories = pd.factorize(values)
    mapping = {category: code for code, category in enumerate(categories)}
    return pd.Series(codes, index=values.index), mapping


# Encode every categorical column with the same routine instead of one
# hand-written stanza per column, remembering each column's lookup table.
category_mappings = {}
for column in columns_to_encode:
    encoded_column, column_mapping = encode_in_order_of_appearance(dataset[column])
    dataset[column] = encoded_column
    category_mappings[column] = column_mapping

# Expose each lookup table and its category count under the names this cell
# has always defined, so anything that reads them keeps working.
education_types = category_mappings['education']
education_num = len(education_types)
workclass_types = category_mappings['workclass']
workclass_num = len(workclass_types)
marital_status_types = category_mappings['marital-status']
marital_status_num = len(marital_status_types)
occupation_types = category_mappings['occupation']
occupation_num = len(occupation_types)
relationship_types = category_mappings['relationship']
relationship_num = len(relationship_types)
race_types = category_mappings['race']
race_num = len(race_types)
sex_types = category_mappings['sex']
sex_num = len(sex_types)
native_country_types = category_mappings['native-country']
native_country_num = len(native_country_types)

dataset[columns_to_encode].head()

Unnamed: 0,education,workclass,marital-status,occupation,relationship,race,sex,native-country
0,0,0,0,0,0,0,0,0
1,0,1,1,1,1,0,0,0
2,1,2,2,2,0,0,0,0
3,2,2,1,2,1,1,0,0
4,0,2,1,3,2,1,1,1


## Define Training and Test Data

In [83]:
# Separate the target from the model inputs: `labels` is the income column,
# `data` is every other column.
labels = dataset["income"]
data = dataset.drop(columns=["income"])

In [84]:
# Hold out 20% of the rows for testing. Fixing random_state makes the shuffle,
# and therefore every downstream result, repeatable across kernel restarts.
training_data, testing_data, training_labels, testing_labels = train_test_split(
    data, labels, test_size=0.2, random_state=42
)
# Backward-compatible alias: this variable was originally spelled `testing_lables`.
testing_lables = testing_labels

In [85]:
# Report how many rows landed in each partition.
for partition, partition_name in ((training_data, 'train'), (testing_data, 'test')):
    print(len(partition), partition_name + ' examples')

26048 train examples
6513 test examples


In [86]:
# Preview the first rows of the training partition.
training_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,native-country.1
24645,0.150685,2,162298,1,0.533333,0,6,3,0,0,0.0,0.0,0.397959,0,0
19112,0.739726,1,130436,13,0.066667,2,6,0,0,1,0.0,0.0,0.27551,0,0
11592,0.260274,2,181721,12,0.333333,0,8,3,1,0,0.0,0.0,0.602041,0,0
5755,0.534247,2,182460,1,0.533333,1,2,1,0,0,0.0,0.0,0.397959,0,0
31139,0.136986,2,215504,0,0.8,1,5,1,0,0,0.0,0.424242,0.55102,0,0


In [97]:
# Feature-matrix dimensions as (rows, columns).
# NOTE(review): the loaded frame has 15 named columns and 'income' is dropped
# before the split, so a fresh run should report 14 columns here. The recorded
# 15 (and the duplicated 'native-country' column in the recorded preview)
# looks like leftover kernel state — confirm with Restart & Run All.
training_data.shape

(26048, 15)

### Build the model

In [111]:
# Small fully-connected classifier: one 16-unit ReLU hidden layer followed by
# a two-way softmax (one output unit per income class).
# The input width is read from the training frame instead of being hard-coded,
# so the first layer always matches the number of feature columns present.
n_features = training_data.shape[1]
model = keras.Sequential([
    keras.layers.Dense(16, input_shape=(n_features,), activation=tf.nn.relu),
    keras.layers.Dense(2, activation=tf.nn.softmax)
])

In [112]:
# The network ends in a 2-unit softmax and is trained on one-hot labels, so
# categorical cross-entropy is the loss that matches that output encoding.
# (For exactly two classes it yields the same value as the element-wise
# binary cross-entropy used previously.)
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [113]:
# One-hot encode the integer labels so they line up with the model's two
# softmax outputs, then train for five passes over the training partition.
one_hot_training_labels = to_categorical(training_labels)
model.fit(x=training_data, y=one_hot_training_labels, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f8240333d30>