## Earning predictions

The aim of the project was to predict if a person would earn more than 50 000 USD per year. Neural network has been used. Results have been compared with classic machine learning models: random forest and xgboost.

In [2]:
import tensorflow as tf

import pandas as pd
import numpy as np
np.random.seed(2019)

from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV
from sklearn.model_selection import learning_curve, train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

import matplotlib.pyplot as plt
%matplotlib inline

from functools import partial
import os
import warnings
warnings.filterwarnings('ignore')

#### Data loading and preparation

In [7]:
df = pd.read_hdf('train.adult.h5')
df['target'] = df['Target'].factorize()[0]
df = df.fillna(-1)
df = df.drop_duplicates()
df = df.reset_index()

In [8]:
def feat_eng(df):
    cat_feats = df.select_dtypes(include=[np.object]).columns
    for cat_feat in cat_feats:
        df['{0}_cat'.format(cat_feat)] = pd.factorize(df[cat_feat])[0]
    df['White'] = df['Race'].apply(lambda x: 1 if x == 'White' else 0)
    df['Black'] = df['Race'].apply(lambda x: 1 if x == 'Black' else 0)
    df['Other_race'] = df['Race'].apply(
        lambda x: 1 if (x != 'White') & (x != 'Black') else 0)
    df['Extra_hours'] = df['Hours per week'].map(lambda x: 1 if x > 40 else 0)
    df['Extra_hours_num'] = df['Hours per week'].map(
        lambda x: x-40 if x > 40 else 0)
    df['Husband'] = df['Relationship'].apply(
        lambda x: 1 if x == 'Husband' else 0)
    df['Married-civ-spouse'] = df['Martial Status'].apply(
        lambda x: 1 if x == 'Married-civ-spouse' else 0)
    df['Never-married'] = df['Martial Status'].apply(
        lambda x: 1 if x == 'Never-married' else 0)
    df['Country_us'] = df['Country'].apply(
        lambda x: 1 if x == 'United-States' else 0)
    df['Country_other'] = df['Country'].apply(
        lambda x: 1 if (x != 'United-States') else 0)
    df['Occ_white'] = pd.factorize(df[['Occupation_cat', 'White']].apply(
        lambda x: '{0}-{1}'.format(x['Occupation_cat'], x['White']), axis=1))[0]
    df['Occ_other'] = pd.factorize(df[['Occupation_cat', 'Other_race']].apply(
        lambda x: '{0}-{1}'.format(x['Occupation_cat'], x['Other_race']), axis=1))[0]
    df['Productive_age'] = df['Age'].apply(
        lambda x: 1 if (x >= 24) & (x <= 70) else 0)
    df['Master_bachelor'] = df['Education'].apply(
        lambda x: 1 if (x == 'Bachelors') | (x == 'Masters') else 0)
    df['Doctor_prof'] = df['Education'].apply(
        lambda x: 1 if (x == 'Prof-school') | (x == 'Doctorate') else 0)
    df['White_husband'] = df[['Relationship', 'Race']].apply(lambda x: 1 if (
        x['Relationship'] == 'Husband') & (x['Race'] == 'White') else 0, axis=1)
    df['Black_husband'] = df[['Relationship', 'Race']].apply(lambda x: 1 if (
        x['Relationship'] == 'Husband') & (x['Race'] == 'Black') else 0, axis=1)
    df['Occ_sex'] = pd.factorize(df[['Occupation_cat', 'Sex']].apply(
        lambda x: '{0}-{1}'.format(x['Occupation_cat'], x['Sex']), axis=1))[0]
    df['Occ_rel_sex'] = pd.factorize(df[['Occupation_cat', 'Relationship_cat', 'Sex']].apply(
        lambda x: '{0}-{1}-{2}'.format(x['Occupation_cat'], x['Relationship_cat'], x['Sex']), axis=1))[0]
    df['Married_productive'] = pd.factorize(df[['Married-civ-spouse', 'Productive_age']].apply(
        lambda x: '{0}-{1}'.format(x['Married-civ-spouse'], x['Productive_age']), axis=1))[0]
    df['Occ_martial'] = pd.factorize(df[['Occupation_cat', 'Martial Status']].apply(
        lambda x: '{0}-{1}'.format(x['Occupation_cat'], x['Martial Status']), axis=1))[0]
    df['Educ_martial'] = df[['Education', 'Martial Status']].apply(lambda x: 1 if (
        ((x['Education'] == 'Assoc-voc') | (x['Education'] == 'Bachelors')) & (x['Martial Status'] == 'Married-AF-spouse')) else 0, axis=1)
    df['fnlwgt_log'] = np.log2(df['fnlwgt']+1)
    return df


In [9]:
df_full = feat_eng(df)

### Neural network

Neural network has been build in order to compare result with previous models. Data has been transformed and normalized to fulfill requirements of neural network.

In [10]:
feats = ['Age', 'Education-Num', 'Extra_hours',
        'Husband','Married-civ-spouse','Never-married',
        'Occupation_cat','Occ_white','Occ_other',
        'Master_bachelor','Doctor_prof','Productive_age',
        'Capital Gain', 'Capital Loss','Relationship_cat',
        'White_husband','Black_husband',
        'Sex','White','Black','Other_race',
        'Country_us','Country_other','fnlwgt_log',
        'Occ_sex','Occ_rel_sex','Married_productive',
        'Occ_martial','Martial Status_cat', 'Educ_martial']

df_all = df_full[feats]
y_all = df_full['target']

Categorical features for one-hot encoding:

In [11]:
features = [ 'Education-Num', 'Husband', 'Married-civ-spouse',
           'Never-married', 'Occupation_cat', 'Occ_white', 'Occ_other',
           'Master_bachelor', 'Doctor_prof', 'Productive_age', 'Relationship_cat',
           'White_husband', 'Black_husband','Sex', 'White', 'Black', 'Other_race',
           'Country_us', 'Country_other', 'Occ_sex', 'Occ_rel_sex', 'Married_productive',
           'Occ_martial', 'Martial Status_cat', 'Educ_martial']

In [12]:
for feat in features:
    df_all[feat] = df_all[feat].astype('object')
    
cat = df_all.select_dtypes(include=['object']).columns
num = df_all.select_dtypes(exclude=['object']).columns

In [13]:
df_all[num] = StandardScaler().fit_transform(df_all[num])

for feat in cat:
    df_all = pd.concat([df_all, pd.get_dummies(df_all[feat].astype('category'))], axis=1)

In [14]:
for f in cat:
    del df_all[f]

In [15]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32537 entries, 0 to 32536
Columns: 413 entries, Age to 1
dtypes: float64(5), uint8(408)
memory usage: 13.9 MB


Dataframe has been split into train, test and valid set:

In [16]:
X_train, X_t, y_train, y_t = train_test_split(df_all, y_all, test_size=0.3, random_state=2018) 

X_train = X_train.reset_index()
y_train = y_train.reset_index()
X_t = X_t.reset_index()
y_t = y_t.reset_index()

del X_train['index']
del X_t['index']
del y_train['index']
del y_t['index']

X_test, X_valid, y_test, y_valid = train_test_split(X_t, y_t, test_size=0.5, random_state=2018) 

X_test = X_test.reset_index()
y_test = y_test.reset_index()
X_valid = X_valid.reset_index()
y_valid = y_valid.reset_index()

del X_valid['index']
del X_test['index']
del y_test['index']
del y_valid['index']

print(X_train.shape, ' ', y_train.shape)
print(X_valid.shape, ' ', y_valid.shape)
print(X_test.shape, ' ', y_test.shape)

(22775, 413)   (22775, 1)
(4881, 413)   (4881, 1)
(4881, 413)   (4881, 1)


### Deep neural network models

In [3]:
tf.reset_default_graph()

In [4]:
n_inputs = X_train.shape[1]
n_hidden1 = 300
n_hidden2 = 200
n_hidden3 = 100
n_outputs = 2

In [24]:
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y") 

In [25]:
with tf.name_scope("gsn"):
    he_init = tf.contrib.layers.variance_scaling_initializer()
    hidden1 = tf.layers.dense(X, n_hidden1, name="h1",
                              activation=tf.nn.elu, kernel_initializer=he_init)
    hidden2 = tf.layers.dense(hidden1, n_hidden2, name="h2",
                              activation=tf.nn.elu, kernel_initializer=he_init)
    hidden3 = tf.layers.dense(hidden2, n_hidden3, name="h3",
                              activation=tf.nn.elu, kernel_initializer=he_init)
    logits = tf.layers.dense(hidden3, n_outputs, name="out")

In [26]:
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")
    loss_summary = tf.summary.scalar('log_loss', loss)

In [27]:
learning_rate = 0.01

with tf.name_scope("learn"):
    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9, use_nesterov=True)
    training_op = optimizer.minimize(loss)

In [28]:
with tf.name_scope("estimation"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    accuracy_summary = tf.summary.scalar('accuracy', accuracy)

In [29]:
from datetime import datetime

def log_dir(prefix=""):
    now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
    root_logdir = "tf_board"
    if prefix:
        prefix += "-"
    name = prefix + "run-" + now
    return "{}/{}/".format(root_logdir, name)

In [30]:
logdir = log_dir("earning_gsn")
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

In [31]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [32]:
def next_batch(x_=X_train.values, y_=y_train.values.ravel(), batch_size=300, batch_index=2, epocha=1):
    
    tf.set_random_seed(epoch*len(x_)//batch_size + batch_index)
    np.random.seed(epocha*len(x_)//batch_size + batch_index)
    
    indices = np.random.randint(len(x_), size=batch_size) 
    np.random.shuffle(indices)
    
    return x_[indices], y_[indices]

In [35]:
n_epochs = 50
n_batches = 100
batch_s = X_train.shape[0]//n_batches

checkpoint_path = "/tmp/my_gsn_earning.ckpt"
checkpoint_epoch_path = checkpoint_path + ".epoch"
final_model_path = "./my_gsn_earning"

best_loss = np.infty
epochs_without_progress = 0
max_epochs_without_progress = 10

with tf.Session() as sess:
    if os.path.isfile(checkpoint_epoch_path):
        with open(checkpoint_epoch_path, "rb") as f:
            start_epoch = int(f.read())
        print("Learn interrupted. Back to epoch", start_epoch)
        saver.restore(sess, checkpoint_path)
    else:
        start_epoch = 0
        sess.run(init)
        
    for epoch in range(start_epoch, n_epochs):
        for iteration in range(n_batches ):
            X_batch, y_batch = next_batch(batch_index=iteration, epocha=epoch, batch_size=batch_s)
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        
        accuracy_val, loss_val, accuracy_summary_str, loss_summary_str = sess.run([accuracy, loss, accuracy_summary, loss_summary],
                                                                                  feed_dict={X:X_valid.values, y:y_valid.values.ravel()})
        file_writer.add_summary(accuracy_summary_str, epoch)
        file_writer.add_summary(loss_summary_str, epoch)
        if epoch % 2 == 0:
            print("Epoch:", epoch,
                  "\tValidation: {:.3f}%".format(accuracy_val * 100),"\tLoss: {:.5f}".format(loss_val))
            saver.save(sess, checkpoint_path)
            with open(checkpoint_epoch_path, "wb") as f:
                f.write(b"%d" % (epoch + 1))
            if loss_val < best_loss:
                saver.save(sess, final_model_path)
                best_loss = loss_val
            else:
                epochs_without_progress += 2
                if epochs_without_progress > max_epochs_without_progress:
                    print("Early stop")
                    break

Epoch: 0 	Validation: 84.696% 	Loss: 0.33120
Epoch: 2 	Validation: 84.860% 	Loss: 0.32741
Epoch: 4 	Validation: 84.204% 	Loss: 0.32549
Epoch: 6 	Validation: 85.085% 	Loss: 0.31937
Epoch: 8 	Validation: 85.187% 	Loss: 0.31897
Epoch: 10 	Validation: 85.515% 	Loss: 0.31955
Epoch: 12 	Validation: 85.208% 	Loss: 0.31484
Epoch: 14 	Validation: 85.065% 	Loss: 0.31884
Epoch: 16 	Validation: 84.942% 	Loss: 0.31810
Epoch: 18 	Validation: 85.597% 	Loss: 0.31434
Epoch: 20 	Validation: 85.392% 	Loss: 0.31575
Epoch: 22 	Validation: 85.413% 	Loss: 0.31795
Epoch: 24 	Validation: 85.392% 	Loss: 0.31588
Early stop


In [36]:
os.remove(checkpoint_epoch_path)
with tf.Session() as sess:
    saver.restore(sess, final_model_path)
    accuracy_val = accuracy.eval(feed_dict={X: X_test.values, y: y_test.values.ravel()})
print(accuracy_val)

INFO:tensorflow:Restoring parameters from ./my_gsn_earning
0.85576725


Results are comparable, however a little bit worse than using RandomForestClassifier (about 86% of accuracy) and XGBClassifier (about 87% of accuracy). 

### Dropout

In [38]:
tf.reset_default_graph()
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y") 

logdir = log_dir("earning_gsn")
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

training = tf.placeholder_with_default(False, shape=(), name='learn')

dropout_rate = 0.6 
X_drop = tf.layers.dropout(X, dropout_rate, training=training)

with tf.name_scope("gsn"):
    he_init = tf.contrib.layers.variance_scaling_initializer()
    hidden1 = tf.layers.dense(X, n_hidden1, name="h1",
                              activation=tf.nn.elu,kernel_initializer=he_init)
    hidden1_drop = tf.layers.dropout(hidden1, dropout_rate, training=training)

    hidden2 = tf.layers.dense(hidden1_drop, n_hidden2, name="h2",
                              activation=tf.nn.elu,kernel_initializer=he_init)
    hidden2_drop = tf.layers.dropout(hidden2, dropout_rate, training=training)

    hidden3 = tf.layers.dense(hidden2_drop, n_hidden3, name="h3",
                              activation=tf.nn.elu, kernel_initializer=he_init)
    logits = tf.layers.dense(hidden3, n_outputs, name="output")

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")
    loss_summary = tf.summary.scalar('log_loss', loss)
    
learning_rate = 0.01

with tf.name_scope("learn"):
    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                       momentum=0.9, use_nesterov=True)
    training_op = optimizer.minimize(loss)
    
with tf.name_scope("validation"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    accuracy_summary = tf.summary.scalar('accuracy', accuracy)

In [39]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()

n_epochs = 50
n_batches = 100
batch_s=X_train.shape[0] // n_batches

checkpoint_path = "/tmp/my_gsn_earning.ckpt"
checkpoint_epoch_path = checkpoint_path + ".epoch"
final_model_path = "./my_gsn_earning"

best_loss = np.infty
epochs_without_progress = 0
max_epochs_without_progress = 10

with tf.Session() as sess:  
    if os.path.isfile(checkpoint_epoch_path):
        with open(checkpoint_epoch_path, "rb") as f:
            start_epoch = int(f.read())
        print("Learn interrupted. Back to epoch", start_epoch)
        saver.restore(sess, checkpoint_path)
    else:
        start_epoch = 0
        sess.run(init)        
    for epoch in range(start_epoch, n_epochs):
        for iteration in range(n_batches):
            X_batch, y_batch = next_batch(batch_index=iteration,epocha=epoch,batch_size=batch_s)
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        accuracy_val, loss_val, accuracy_summary_str, loss_summary_str = sess.run([accuracy, loss, accuracy_summary, loss_summary],
                                                                                  feed_dict={X:X_valid.values, y:y_valid.values.ravel()})

        file_writer.add_summary(accuracy_summary_str, epoch)
        file_writer.add_summary(loss_summary_str, epoch)
        if epoch % 2 == 0:
            print("Epoch:", epoch,
                  "\tValidation: {:.3f}%".format(accuracy_val * 100),
                  "\tLoss: {:.5f}".format(loss_val))
            saver.save(sess, checkpoint_path)
            with open(checkpoint_epoch_path, "wb") as f:
                f.write(b"%d" % (epoch + 1))
            if loss_val < best_loss:
                saver.save(sess, final_model_path)
                best_loss = loss_val
            else:
                epochs_without_progress += 2
                if epochs_without_progress > max_epochs_without_progress:
                    print("Early stop")
                    break

Epoch: 0 	Validation: 84.532% 	Loss: 0.33293
Epoch: 2 	Validation: 84.778% 	Loss: 0.32966
Epoch: 4 	Validation: 84.163% 	Loss: 0.32873
Epoch: 6 	Validation: 85.003% 	Loss: 0.32100
Epoch: 8 	Validation: 85.208% 	Loss: 0.31999
Epoch: 10 	Validation: 85.290% 	Loss: 0.32120
Epoch: 12 	Validation: 85.474% 	Loss: 0.31580
Epoch: 14 	Validation: 85.310% 	Loss: 0.32002
Epoch: 16 	Validation: 84.860% 	Loss: 0.31989
Epoch: 18 	Validation: 85.720% 	Loss: 0.31539
Epoch: 20 	Validation: 85.351% 	Loss: 0.31734
Epoch: 22 	Validation: 85.146% 	Loss: 0.31961
Epoch: 24 	Validation: 85.433% 	Loss: 0.31667
Early stop


In [40]:
os.remove(checkpoint_epoch_path)
with tf.Session() as sess:
    saver.restore(sess, final_model_path)
    accuracy_val = accuracy.eval(feed_dict={X: X_test.values, y: y_test.values.ravel()})
accuracy_val

INFO:tensorflow:Restoring parameters from ./my_gsn_earning


0.8543331

Results with dropout are not better than using initial model.

### Batch normalization

In [42]:
tf.reset_default_graph()
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y") 
batch_norm_momentum = 0.9

logdir = log_dir("earning_gsn")
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

training = tf.placeholder_with_default(False, shape=(), name='learn')

with tf.name_scope("gsn"):
    he_init = tf.contrib.layers.variance_scaling_initializer()
    my_batch_norm_layer = partial(tf.layers.batch_normalization,
            training=training, momentum=batch_norm_momentum)
    my_dense_layer = partial(tf.layers.dense, kernel_initializer=he_init)

    hidden1 = my_dense_layer(X, n_hidden1, name="h1")
    bn1 = tf.nn.elu(my_batch_norm_layer(hidden1))
    hidden2 = my_dense_layer(bn1, n_hidden2, name="h2")
    bn2 = tf.nn.elu(my_batch_norm_layer(hidden2))
    hidden3 = my_dense_layer(bn2, n_hidden3, name="h3")
    bn3 = tf.nn.elu(my_batch_norm_layer(hidden3))
    logits_before_bn = my_dense_layer(bn3, n_outputs, name="output")
    logits = my_batch_norm_layer(logits_before_bn)    
    
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")
    loss_summary = tf.summary.scalar('log_loss', loss)
    
learning_rate = 0.01

with tf.name_scope("learn"):
    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9, use_nesterov=True)
    training_op = optimizer.minimize(loss)
    
with tf.name_scope("validation"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    accuracy_summary = tf.summary.scalar('accuracy', accuracy)

In [43]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()

n_epochs = 50
n_batches = 100
batch_s=X_train.shape[0] // n_batches

checkpoint_path = "/tmp/my_gsn_earning.ckpt"
checkpoint_epoch_path = checkpoint_path + ".epoch"
final_model_path = "./my_gsn_earning"

best_loss = np.infty
epochs_without_progress = 0
max_epochs_without_progress = 10

extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

with tf.Session() as sess:    
    if os.path.isfile(checkpoint_epoch_path):
        with open(checkpoint_epoch_path, "rb") as f:
            start_epoch = int(f.read())
        print("Learn interrupted. Back to epoch", start_epoch)
        saver.restore(sess, checkpoint_path)
    else:
        start_epoch = 0
        sess.run(init)
    for epoch in range(start_epoch, n_epochs):
        for iteration in range(n_batches):
            X_batch, y_batch = next_batch(batch_index=iteration,epocha=epoch,batch_size=batch_s)
            sess.run([training_op, extra_update_ops], feed_dict={training: True,X: X_batch, y: y_batch})
        accuracy_val, loss_val, accuracy_summary_str, loss_summary_str = sess.run([accuracy, loss, accuracy_summary, loss_summary], 
                                                                                  feed_dict={X:X_valid.values, y:y_valid.values.ravel()})
        file_writer.add_summary(accuracy_summary_str, epoch)
        file_writer.add_summary(loss_summary_str, epoch)
        if epoch % 2 == 0:
            print("Epoch:", epoch,
                  "\tValidation: {:.3f}%".format(accuracy_val * 100),
                  "\tLoss: {:.5f}".format(loss_val))
            saver.save(sess, checkpoint_path)
            with open(checkpoint_epoch_path, "wb") as f:
                f.write(b"%d" % (epoch + 1))
            if loss_val < best_loss:
                saver.save(sess, final_model_path)
                best_loss = loss_val
            else:
                epochs_without_progress += 2
                if epochs_without_progress > max_epochs_without_progress:
                    print("Early stop")
                    break

Epoch: 0 	Validation: 84.798% 	Loss: 0.33731
Epoch: 2 	Validation: 84.737% 	Loss: 0.33159
Epoch: 4 	Validation: 84.634% 	Loss: 0.32991
Epoch: 6 	Validation: 84.675% 	Loss: 0.32647
Epoch: 8 	Validation: 84.942% 	Loss: 0.32500
Epoch: 10 	Validation: 85.065% 	Loss: 0.32630
Epoch: 12 	Validation: 85.187% 	Loss: 0.32355
Epoch: 14 	Validation: 84.962% 	Loss: 0.32935
Epoch: 16 	Validation: 85.310% 	Loss: 0.32451
Epoch: 18 	Validation: 84.839% 	Loss: 0.32873
Epoch: 20 	Validation: 84.614% 	Loss: 0.33626
Epoch: 22 	Validation: 84.614% 	Loss: 0.33357
Early stop


In [44]:
os.remove(checkpoint_epoch_path)
with tf.Session() as sess:
    saver.restore(sess, final_model_path)
    accuracy_val = accuracy.eval(feed_dict={X: X_test.values, y: y_test.values.ravel()})
accuracy_val

INFO:tensorflow:Restoring parameters from ./my_gsn_earning


0.8531039

Batch normalization has no significant impact on result in this case. 