# CS395 - Final Project
###

Date: April 14th, 2019
By: Joshua Swick and Lauren Simms

In [119]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import feature_column
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

## Load and prepare dataset

In [47]:
census_data_file = "data/census_data.csv"
headers = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income'
]
dataset = pd.read_csv(census_data_file, names=headers)

In [34]:
print(dataset.shape)

(32561, 15)


In [102]:
print(dataset.head())

   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-gain  capital-loss  hours-per-week  native-country  income  
0          2174             0              40   United-States   <=50

### Convert target column to integer

We are trying to classify individuals into incomes above or below $50k. The value provided in the income cata

In [103]:
dataset['income'].unique()

array([' <=50K', ' >50K'], dtype=object)

In [104]:
list=[]
for income in dataset['income']:
    if income == ' <=50K':
        list.append(0)
    if income == ' >50K':
        list.append(1)

dataset['income'] = list

### Normalize numeric columns

In [105]:
columns_to_normalize = [
    'age',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week'
]
dataset[columns_to_normalize] = dataset[columns_to_normalize].apply(
    lambda x: (x - x.min()) / (x.max() - x.min())
)

In [106]:
age_feature = tf.feature_column.numeric_column("age")
education_num_feature = tf.feature_column.numeric_column("education_num")
capital_gain_feature = tf.feature_column.numeric_column("capital_gain")
capital_loss_feature = tf.feature_column.numeric_column("capital_loss")
hours_worked_feature = tf.feature_column.numeric_column("hours_per_week")

### Categorical Features

In [107]:
education_feature = tf.feature_column.categorical_column_with_hash_bucket("education",hash_bucket_size=16)
workclass_feature = tf.feature_column.categorical_column_with_hash_bucket("workclass",hash_bucket_size=10)
martial_status_feature = tf.feature_column.categorical_column_with_hash_bucket("martial-status",hash_bucket_size=7)
occupation_feature = tf.feature_column.categorical_column_with_hash_bucket("occupation",hash_bucket_size=14)
relationship_feature = tf.feature_column.categorical_column_with_hash_bucket("relationship",hash_bucket_size=6)
race_feature = tf.feature_column.categorical_column_with_hash_bucket("race",hash_bucket_size=5)
gender_feature = tf.feature_column.categorical_column_with_hash_bucket("gender",hash_bucket_size=2)
native_country_feature = tf.feature_column.categorical_column_with_hash_bucket("native_country",hash_bucket_size=60)

In [108]:
feature_columns = [
    age_feature,
    education_feature,
    workclass_feature,
    martial_status_feature,
    occupation_feature,
    relationship_feature,
    race_feature,
    gender_feature,
    native_country_feature,
    education_num_feature,
    capital_gain_feature,
    capital_loss_feature,
    hours_worked_feature
]

## Define Training and Test Data

In [111]:
data = dataset.drop("income", axis=1)
labels = dataset["income"]

In [112]:
training_data, testing_data, training_labels, testing_lables = train_test_split(data, labels, test_size=0.2)

In [113]:
print(len(training_data), 'train examples')
print(len(testing_data), 'test examples')

26048 train examples
6513 test examples


In [114]:
training_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
9641,0.342466,Private,121055,Some-college,0.6,Never-married,Sales,Not-in-family,White,Male,0.0,0.0,0.44898,United-States
10365,0.219178,Self-emp-not-inc,295621,Bachelors,0.8,Married-civ-spouse,Prof-specialty,Husband,Black,Male,0.0,0.0,0.244898,United-States
2952,0.068493,?,35448,HS-grad,0.533333,Married-civ-spouse,?,Wife,White,Female,0.0,0.0,0.214286,United-States
28367,0.30137,Private,272338,HS-grad,0.533333,Married-civ-spouse,Adm-clerical,Wife,White,Female,0.0,0.0,0.244898,United-States
10802,0.479452,Private,75839,Some-college,0.6,Married-civ-spouse,Tech-support,Husband,White,Male,0.0,0.0,0.397959,United-States


### Build the model

In [134]:
model = keras.Sequential([
    keras.layers.Dense(16, input_shape=training_data.shape, activation=tf.nn.relu),
    keras.layers.Dense(1, activation=tf.nn.softmax)
])

In [136]:
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [137]:
model.fit(
    training_data,
    keras.utils.to_categorical(training_labels),
    epochs=5
)

ValueError: Error when checking input: expected dense_20_input to have 3 dimensions, but got array with shape (26048, 14)