In [1]:
import warnings
import tensorflow as tf
import numpy as np
import pandas as pd
import json
from tensorflow import keras
import sklearn
from sklearn.model_selection import train_test_split
#from modules.CARLA import carla

print(str(tf.config.list_physical_devices()))
seed = 44

tf.random.set_seed(seed)
np.random.seed(seed)
tf.config.run_functions_eagerly(True)

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
with open('../data/adult_constraints.json', 'r') as f:
    constr = json.load(f)

dataset = pd.read_csv("../data/adult_cleaned.csv")

X_train, X_test, Y_train, Y_test = train_test_split(dataset[constr['features_order_after_split']], dataset['income'], test_size=0.2, random_state=seed)
X_train.columns

Index(['age', 'education.num', 'capital.gain', 'capital.loss',
       'hours.per.week', 'workclass_?', 'workclass_Federal-gov',
       'workclass_Local-gov', 'workclass_Never-worked', 'workclass_Private',
       'workclass_Self-emp-inc', 'workclass_Self-emp-not-inc',
       'workclass_State-gov', 'workclass_Without-pay',
       'marital.status_Divorced', 'marital.status_Married-AF-spouse',
       'marital.status_Married-civ-spouse',
       'marital.status_Married-spouse-absent', 'marital.status_Never-married',
       'marital.status_Separated', 'marital.status_Widowed', 'occupation_?',
       'occupation_Adm-clerical', 'occupation_Armed-Forces',
       'occupation_Craft-repair', 'occupation_Exec-managerial',
       'occupation_Farming-fishing', 'occupation_Handlers-cleaners',
       'occupation_Machine-op-inspct', 'occupation_Other-service',
       'occupation_Priv-house-serv', 'occupation_Prof-specialty',
       'occupation_Protective-serv', 'occupation_Sales',
       'occupation_Tech

In [3]:
Y_train

20789     >50K
7665     <=50K
7003     <=50K
3368     <=50K
15072    <=50K
         ...  
16955    <=50K
25773     >50K
27377    <=50K
3491     <=50K
14100    <=50K
Name: income, Length: 26048, dtype: object

In [4]:

X_train = X_train.to_numpy()
Y_train = Y_train
Y_train = pd.get_dummies(Y_train).to_numpy()
Y_train

array([[0, 1],
       [1, 0],
       [1, 0],
       ...,
       [1, 0],
       [1, 0],
       [1, 0]], dtype=uint8)

In [5]:
X_test = X_test.to_numpy()
Y_test = Y_test
Y_test = pd.get_dummies(Y_test).to_numpy()
Y_test

array([[1, 0],
       [1, 0],
       [1, 0],
       ...,
       [1, 0],
       [1, 0],
       [0, 1]], dtype=uint8)

In [6]:
X_train.shape

(26048, 85)

In [7]:
Y_train.shape

(26048, 2)

In [8]:
# sklearn minmaxscaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [9]:
constr['features_count_split_without_target']

85

In [10]:
class_weights = np.round(1 - Y_train.sum(axis=0) / Y_train.sum(), 4)
class_weights

array([0.2397, 0.7603])

In [11]:
tf_class_weights = {0: class_weights[0], 1: class_weights[1]}
tf_class_weights

{0: 0.2397, 1: 0.7603}

In [86]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Input((constr['features_count_split_without_target'],)))
model.add(tf.keras.layers.Dense(128))
model.add(tf.keras.layers.ELU())
#model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(128))
model.add(tf.keras.layers.ELU())
model.add(tf.keras.layers.Dense(128))
model.add(tf.keras.layers.ELU())
#model.add(tf.keras.layers.Dropout(0.3))
#model.add(tf.keras.layers.Dropout(0.1))
model.add(tf.keras.layers.Dense(2))
model.add(tf.keras.layers.Softmax())


model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    #optimizer=tf.keras.optimizers.RMSprop(),
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
    metrics=['accuracy']
)

In [87]:
with tf.device("/GPU:0"):
    model.fit(
        X_train, 
        Y_train,
        epochs=100,
        batch_size=4096,
        validation_data=(X_test, Y_test),
        shuffle=True,
        callbacks=[
            tf.keras.callbacks.EarlyStopping(patience=20)
        ],
        verbose=1,
        class_weight=tf_class_weights
    )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100


In [69]:
model.save('../models/adult_NN.h5', overwrite=True, save_format='h5')

In [70]:
model.save('../models/adult_NN', overwrite=True)

INFO:tensorflow:Assets written to: ../models/adult_NN\assets


In [71]:
tf.data.experimental.enable_debug_mode()
model_test = tf.keras.models.load_model('../models/adult_NN.h5')

array([], shape=(0, 2), dtype=float32)

In [72]:
model.load_weights('../models/adult_NN.h5')

In [73]:
X_test[0].shape

(85,)

In [74]:
model.predict(X_test[0:2], verbose=1)




array([[0.8824439 , 0.11755612],
       [0.98558944, 0.01441056]], dtype=float32)

RandomForest

In [75]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from joblib import dump, load

In [None]:
# best_score = 0


# for crit in ['gini', 'entropy']:
#     for max_depth in [10, 30, None]: 
#         for estimators in [150, 200]:
#             for max_features in ['sqrt', 'log2']:
#                 for min_samples_leaf in [1,2,3]:
#                     for min_samples_split in [2,3,4]:
#                         for min_impurity_decrease in [0, 0.001, 0.01, 0.1]:
#                             classifier = RandomForestClassifier(
#                                 random_state=seed, 
#                                 criterion=crit, 
#                                 n_estimators=estimators,
#                                 verbose=0,
#                                 class_weight='balanced',
#                                 min_impurity_decrease=min_impurity_decrease,
#                                 min_samples_leaf=min_samples_leaf,
#                                 min_samples_split=min_samples_split,
#                                 max_depth=max_depth,
#                                 n_jobs=3,
#                                 max_features=max_features
#                                 )
#                             classifier.fit(X_train, Y_train)
#                             score = classifier.score(X_test, Y_test)
#                             if score > best_score:
#                                 best_score = score
#                                 print(f'New Best! Crit: {crit}, Max_depth: {max_depth}, Max_feat: {max_features}, Min_sampl_leaf: {min_samples_split} \
# MIn_samples_split {min_samples_split}, Min_impurity_dec: {min_impurity_decrease}, Estimators: {estimators} Score: {score}')

In [90]:
classifier = RandomForestClassifier(
                random_state=seed, 
                criterion='gini', 
                n_estimators=110,
                verbose=0,
                class_weight='balanced',
                min_impurity_decrease=0,
                max_depth=None,
                n_jobs=3,
                max_features='sqrt'
            )
classifier.fit(X_train, Y_train)
classifier.score(X_test, Y_test)

0.839705204974666

In [91]:
dump(classifier, '../models/adult_RF.joblib')

['../models/adult_RF.joblib']

In [92]:
clf = load('../models/adult_RF.joblib')

In [93]:
print(Y_test[0:5])
clf.predict(X_test[0:5])

[[1 0]
 [1 0]
 [1 0]
 [0 1]
 [0 1]]


array([[1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1]], dtype=uint8)

In [94]:
import pickle

with open('../models/adult_RF.pkl', 'wb') as f:
    pickle.dump(classifier, f)

with open('../models/adult_RF.pkl', 'rb') as f:
    clf_pickl = pickle.load(f)

pred = clf_pickl.predict(X_test[0:5])
type(pred)

numpy.ndarray

In [95]:
clf_pickl.predict_proba(X_test[0:10])[0]

array([[0.04545455, 0.95454545],
       [0.02727273, 0.97272727],
       [0.        , 1.        ],
       [0.02727273, 0.97272727],
       [0.92727273, 0.07272727],
       [0.35585926, 0.64414074],
       [0.        , 1.        ],
       [0.27272727, 0.72727273],
       [0.78181818, 0.21818182],
       [0.21818885, 0.78181115]])