In [22]:
import warnings
import tensorflow as tf
import numpy as np
import pandas as pd
import json

seed = 44

tf.compat.v1.random.set_random_seed(seed)
np.random.seed(seed)

In [25]:
df = pd.read_csv('../data/adult_cleaned.csv')
df

Unnamed: 0,age,education.num,capital.gain,capital.loss,hours.per.week,income,Federal-gov,Local-gov,Never-worked,Private,...,Puerto-Rico,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Unknown_country,Vietnam,Yugoslavia
0,90,9,0,4356,40,<=50K,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,82,9,0,4356,18,<=50K,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,66,10,0,4356,40,<=50K,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,54,4,0,3900,40,<=50K,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,41,10,0,3900,40,<=50K,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,10,0,0,40,<=50K,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
32557,27,12,0,0,38,<=50K,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
32558,40,9,0,0,40,>50K,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
32559,58,9,0,0,40,<=50K,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0


In [26]:
with open('../data/adult_constraints.json', 'r') as f:
    constraints = json.load(f) 
constraints

{'Federal-gov': ['onehot'],
 'Local-gov': ['onehot'],
 'Never-worked': ['onehot'],
 'Private': ['onehot'],
 'Self-emp-inc': ['onehot'],
 'Self-emp-not-inc': ['onehot'],
 'State-gov': ['onehot'],
 'Unknown_workclass': ['onehot'],
 'Without-pay': ['onehot'],
 'Divorced': ['onehot'],
 'Married-AF-spouse': ['onehot'],
 'Married-civ-spouse': ['onehot'],
 'Married-spouse-absent': ['onehot'],
 'Never-married': ['onehot'],
 'Separated': ['onehot'],
 'Widowed': ['onehot'],
 'Adm-clerical': ['onehot'],
 'Armed-Forces': ['onehot'],
 'Craft-repair': ['onehot'],
 'Exec-managerial': ['onehot'],
 'Farming-fishing': ['onehot'],
 'Handlers-cleaners': ['onehot'],
 'Machine-op-inspct': ['onehot'],
 'Other-service': ['onehot'],
 'Priv-house-serv': ['onehot'],
 'Prof-specialty': ['onehot'],
 'Protective-serv': ['onehot'],
 'Sales': ['onehot'],
 'Tech-support': ['onehot'],
 'Transport-moving': ['onehot'],
 'Unknown_occupation': ['onehot'],
 'Amer-Indian-Eskimo': ['onehot', 'non-actionable'],
 'Asian-Pac-Isl

In [30]:

X = df[set(df.columns) - set(['income'])]
Y = df['income']
Y = pd.get_dummies(Y)
Y

Unnamed: 0,<=50K,>50K
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
32556,1,0
32557,1,0
32558,0,1
32559,1,0


In [31]:
from tensorflow import keras


model = tf.keras.Sequential()
model.add(tf.keras.layers.Input((85,)))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(2, activation='softmax'))


model.compile(
    optimizer='adam',
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
    metrics=['accuracy']
)

In [32]:
model.fit(
    X.to_numpy(), 
    Y.to_numpy(),
    epochs=100,
    batch_size=128,
    validation_split=0.3,
    shuffle=True,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(patience=10)
    ]
)

Train on 22792 samples, validate on 9769 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100


<tensorflow.python.keras.callbacks.History at 0x25a5500cc48>

In [33]:
model.save('../models/adult_NN.h5', overwrite=True, save_format='h5')

In [34]:
model.load_weights('../models/adult_NN.h5')

In [35]:
X.to_numpy()[0].shape

(85,)

In [36]:
model.predict(X.to_numpy()[0:2], verbose=1)




array([[0.37581608, 0.6241839 ],
       [0.37379995, 0.62620014]], dtype=float32)

In [37]:
model.predict_proba(X.to_numpy()[0:2])

array([[0.37581608, 0.6241839 ],
       [0.37379995, 0.62620014]], dtype=float32)

RandomForest

In [38]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=seed)

In [39]:
classifier = RandomForestClassifier(random_state=seed, criterion='entropy')
classifier.fit(X_train, y_train)


RandomForestClassifier(criterion='entropy', random_state=44)

In [40]:
classifier.score(X_test, y_test)

0.8485003582761798

In [41]:
from joblib import dump, load

dump(classifier, '../models/adult_RF.joblib')

['../models/adult_RF.joblib']

In [42]:
clf = load('../models/adult_RF.joblib')

In [43]:
print(y_test[0:5])
clf.predict(X_test[0:5])

       <=50K  >50K
13099      1     0
21918      1     0
23550      1     0
6892       0     1
17909      0     1


array([[1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1]], dtype=uint8)