<a href="https://colab.research.google.com/github/hiydavid/homl-learning/blob/main/census_income.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Classification with Neural Decision Forests
##  United States Census Income Dataset

Resource: [Keras Tutorial](https://keras.io/examples/structured_data/deep_neural_decision_forests/)


---
# Load data & libraries

In [1]:
# load libraries
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers
import math

In [2]:
# column names in file order; both reads below pass header=None, so the names
# are supplied here instead of being taken from the files
CSV_HEADER = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income_bracket",
]

# training split
train_data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    names=CSV_HEADER,
    header=None,
)

# testing split (same columns, same UCI directory)
test_data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
    names=CSV_HEADER,
    header=None,
)

# report (rows, columns) for each split
print(f"Train dataset shape: {train_data.shape}")
print(f"Test dataset shape: {test_data.shape}")

Train dataset shape: (32561, 15)
Test dataset shape: (16282, 15)


In [3]:
# check train data: preview the first rows of the training frame
train_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# check test data: the preview shows that row 0 is not a real record (it holds
# "|1x3 Cross validator" and is otherwise empty) and that every label ends
# with a "." -- both are handled by the cleanup step that follows
test_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,|1x3 Cross validator,,,,,,,,,,,,,,
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.
2,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
3,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.
4,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K.


In [5]:
# Drop the non-record row at the top of the test set. The first line of the
# file ("|1x3 Cross validator") was parsed into a row whose label is missing,
# so filtering on a non-null label removes exactly that row. Unlike slicing
# with [1:], this is safe to re-run: a second execution does not drop a
# genuine record. .copy() yields an independent frame, so the assignment below
# never writes into a slice of the original (no SettingWithCopyWarning).
test_data = test_data[test_data["income_bracket"].notna()].copy()

# remove "." from income_bracket so the test labels match the training labels
# (regex=False: the dot is a literal character, not a wildcard)
test_data["income_bracket"] = test_data["income_bracket"].str.replace(
    ".", "", regex=False
)

In [6]:
# write both splits to local CSV files without a header row or index column
train_data.to_csv("train_data.csv", header=False, index=False)
test_data.to_csv("test_data.csv", header=False, index=False)

---
# Define dataset metadata

In [7]:
# names of the numerical input features
NUMERIC_FEATURE_NAMES = [
    "age",
    "education_num",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
]

NUMERIC_FEATURE_NAMES

['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

In [8]:
# Map every categorical feature to its vocabulary: the sorted distinct values
# observed for that column in the training data.
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    column_name: sorted(list(train_data[column_name].unique()))
    for column_name in (
        "workclass",
        "education",
        "marital_status",
        "occupation",
        "relationship",
        "race",
        "gender",
        "native_country",
    )
}

CATEGORICAL_FEATURES_WITH_VOCABULARY['workclass']

[' ?',
 ' Federal-gov',
 ' Local-gov',
 ' Never-worked',
 ' Private',
 ' Self-emp-inc',
 ' Self-emp-not-inc',
 ' State-gov',
 ' Without-pay']

In [9]:
# columns that exist in the CSVs but are not used as input features
IGNORE_COLUMN_NAMES = [
    "fnlwgt",
]

In [10]:
# names of the categorical features, in the vocabulary dictionary's key order
CATEGORICAL_FEATURE_NAMES = [*CATEGORICAL_FEATURES_WITH_VOCABULARY]
CATEGORICAL_FEATURE_NAMES

['workclass',
 'education',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'gender',
 'native_country']

In [11]:
# every input feature: the numeric names first, then the categorical names
FEATURE_NAMES = [*NUMERIC_FEATURE_NAMES, *CATEGORICAL_FEATURE_NAMES]

In [12]:
# One default per CSV column, in CSV_HEADER order: [0.0] for the numeric and
# ignored columns, ["NA"] for every other column.
COLUMN_DEFAULTS = [
    ["NA"] if column_name not in NUMERIC_FEATURE_NAMES + IGNORE_COLUMN_NAMES else [0.0]
    for column_name in CSV_HEADER
]

COLUMN_DEFAULTS

[[0.0],
 ['NA'],
 [0.0],
 ['NA'],
 [0.0],
 ['NA'],
 ['NA'],
 ['NA'],
 ['NA'],
 ['NA'],
 [0.0],
 [0.0],
 [0.0],
 ['NA'],
 ['NA']]

In [16]:
# The name of the target feature.
TARGET_FEATURE_NAME = "income_bracket"

# The labels of the target feature, written exactly as they appear in the
# data (note the leading space). The experiment section computes
# `num_classes = len(TARGET_LABELS)`; without this definition a fresh-kernel
# run of the notebook fails there with a NameError.
TARGET_LABELS = [" <=50K", " >50K"]

---
# Create tf.data.Dataset objects for training and evaluation

In [18]:
from tensorflow.keras.layers import StringLookup

# Lookup layer that converts the raw income label strings into integer
# indices; it is configured with no mask token and zero out-of-vocabulary
# slots, so only the two listed labels are part of its vocabulary.
target_label_lookup = StringLookup(
    num_oov_indices=0,
    mask_token=None,
    vocabulary=[" <=50K", " >50K"],
)

In [20]:
# function to get data from csv file and save as ds object
def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
    """Build a batched, cached dataset of (features, label index) pairs from a CSV.

    Args:
        csv_file_path: Path of a header-less CSV whose columns follow CSV_HEADER.
        shuffle: Forwarded to ``make_csv_dataset``.
        batch_size: Number of rows per batch.

    Returns:
        The dataset produced by ``make_csv_dataset`` (single epoch), with each
        label batch passed through ``target_label_lookup``, then ``.cache()``.
    """

    def encode_target(features, target):
        # leave the feature dict untouched; map label strings to indices
        return features, target_label_lookup(target)

    csv_batches = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        column_defaults=COLUMN_DEFAULTS,
        label_name=TARGET_FEATURE_NAME,
        num_epochs=1,
        header=False,
        na_value="?",
        shuffle=shuffle,
    )
    return csv_batches.map(encode_target).cache()

---
# Create model inputs

In [26]:
# create input object for model
def create_model_inputs():
    """Create one Keras ``Input`` per entry of FEATURE_NAMES.

    Returns:
        A dict mapping each feature name to an ``Input`` with ``shape=()``;
        the dtype is float32 for names in NUMERIC_FEATURE_NAMES and string
        for all other names.
    """
    return {
        feature_name: layers.Input(
            name=feature_name,
            shape=(),
            dtype=tf.float32 if feature_name in NUMERIC_FEATURE_NAMES else tf.string,
        )
        for feature_name in FEATURE_NAMES
    }

---
# Encode input features

In [21]:
# function to encode inputs
def encode_inputs(inputs):
    """Encode the model inputs and concatenate them into one feature tensor.

    Args:
        inputs: Mapping of feature name -> Keras input tensor.

    Returns:
        The concatenation of all encoded features. Categorical features are
        index-encoded with a ``StringLookup`` and embedded with
        ``int(sqrt(vocabulary size))`` dimensions; all other features are
        passed through, gaining a trailing axis when their last dimension
        is unknown.
    """
    encoded_features = []
    for feature_name in inputs:
        feature_input = inputs[feature_name]

        if feature_name not in CATEGORICAL_FEATURE_NAMES:
            # non-categorical: use the value as-is, adding a trailing axis
            # when the last dimension is undefined
            if feature_input.shape[-1] is None:
                feature_input = tf.expand_dims(feature_input, -1)
            encoded_features.append(feature_input)
            continue

        # categorical: string -> vocabulary index -> embedding vector
        lookup = StringLookup(
            vocabulary=CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name],
            mask_token=None,
            num_oov_indices=0,
        )
        value_index = lookup(feature_input)
        vocabulary_size = lookup.vocabulary_size()
        embedding = layers.Embedding(
            input_dim=vocabulary_size,
            output_dim=int(math.sqrt(vocabulary_size)),
        )
        encoded_features.append(embedding(value_index))

    return layers.concatenate(encoded_features)

---
# Deep Neural Decision Tree

In [22]:
# NDT class object
class NeuralDecisionTree(keras.Model):
    """A single soft decision tree implemented as a Keras model.

    A sigmoid ``Dense`` layer produces one routing probability per tree node.
    ``call`` multiplies these probabilities (or their complements) level by
    level to obtain the probability of reaching each of the ``2 ** depth``
    leaves, then mixes the learned per-leaf class distributions with those
    reach probabilities.
    """

    def __init__(self, depth, num_features, used_features_rate, num_classes):
        """Set up the feature mask, the leaf class weights and the routing layer.

        Args:
            depth: Number of routing levels; the tree has ``2 ** depth`` leaves.
            num_features: Width of the feature vectors passed to ``call``.
            used_features_rate: Fraction of the features this tree uses;
                ``int(num_features * used_features_rate)`` of them are sampled.
            num_classes: Number of output classes.
        """
        super(NeuralDecisionTree, self).__init__()
        self.depth = depth
        self.num_leaves = 2 ** depth
        self.num_classes = num_classes

        # Create a mask for the randomly selected features: the rows of the
        # identity matrix at the sampled indices, i.e. an array of shape
        # [num_used_features, num_features].
        # NOTE(review): the sample comes from NumPy's global RNG and no seed
        # is set in the visible notebook, so the selection differs per run.
        num_used_features = int(num_features * used_features_rate)
        one_hot = np.eye(num_features)
        sampled_feature_indicies = np.random.choice(
            np.arange(num_features), 
            num_used_features, 
            replace=False
        )
        self.used_features_mask = one_hot[sampled_feature_indicies]

        # Initialize the weights of the classes in leaves: a trainable
        # [num_leaves, num_classes] variable drawn from a random normal
        # initializer; call() applies a softmax to it.
        self.pi = tf.Variable(
            initial_value=tf.random_normal_initializer()(
                shape=[self.num_leaves, self.num_classes]
            ),
            dtype="float32",
            trainable=True,
        )

        # Initialize the stochastic routing layer: num_leaves sigmoid units,
        # one routing probability per unit.
        self.decision_fn = layers.Dense(
            units=self.num_leaves, activation="sigmoid", name="decision"
        )

    def call(self, features):
        """Return the class outputs for a batch of feature vectors.

        Args:
            features: Batch of feature vectors; assumed to be 2-D
                ``[batch_size, num_features]`` -- TODO confirm against callers.

        Returns:
            The product of the leaf reach probabilities
            ``[batch_size, num_leaves]`` with ``softmax(self.pi)``
            ``[num_leaves, num_classes]``.
        """
        batch_size = tf.shape(features)[0]

        # Apply the feature mask to the input features: multiplying by the
        # transposed mask keeps only the sampled feature columns.
        features = tf.matmul(
            features, self.used_features_mask, transpose_b=True
        )
        
        # Compute the routing probabilities and add a trailing axis of size 1.
        decisions = tf.expand_dims(
            self.decision_fn(features), axis=2
        )

        # Concatenate the routing probabilities with their complements, so
        # the last axis holds [p, 1 - p] for every unit.
        decisions = layers.concatenate(
            [decisions, 1 - decisions], axis=2
        )

        # mu accumulates the probability of reaching each node of the current
        # level; it starts as a single all-ones entry per example.
        # Decision units are read starting at index 1, so unit 0 is never used.
        mu = tf.ones([batch_size, 1, 1])
        begin_idx = 1
        end_idx = 2
        
        # Traverse the tree in breadth-first order. Level `level` consumes the
        # decision units [2 ** level, 2 ** (level + 1)).
        for level in range(self.depth):
            # Duplicate every node's probability along the last axis, then
            # scale the two copies by that node's [p, 1 - p] pair.
            mu = tf.reshape(mu, [batch_size, -1, 1])
            mu = tf.tile(mu, (1, 1, 2))
            level_decisions = decisions[
                :, begin_idx:end_idx, :
            ]
            mu = mu * level_decisions
            # The next level has twice as many nodes as the current one.
            begin_idx = end_idx
            end_idx = begin_idx + 2 ** (level + 1)

        # Flatten to one reach probability per leaf.
        mu = tf.reshape(mu, [batch_size, self.num_leaves])
        # Softmax of the leaf weights -- presumably over the class axis (the
        # function's default axis), giving one class distribution per leaf.
        probabilities = keras.activations.softmax(self.pi)
        # Weight every leaf's class values by the probability of reaching it.
        outputs = tf.matmul(mu, probabilities)
        return outputs

---
# Deep Neural Decision Forest

In [23]:
# NDF class object
class NeuralDecisionForest(keras.Model):
    """An ensemble of ``NeuralDecisionTree`` models whose outputs are averaged."""

    def __init__(self, num_trees, depth, num_features, used_features_rate, num_classes):
        """Create ``num_trees`` trees that share the same hyperparameters.

        Args:
            num_trees: Number of trees in the ensemble.
            depth: Depth passed to every tree.
            num_features: Width of the feature vectors passed to ``call``.
            used_features_rate: Fraction of features each tree samples.
            num_classes: Number of output classes.
        """
        super(NeuralDecisionForest, self).__init__()

        # Keep the class count on the instance: call() needs it to size the
        # accumulator and must not depend on a module-level `num_classes`
        # variable that may be missing or hold a different value.
        self.num_classes = num_classes
        self.ensemble = []

        # Initialize the ensemble by adding NeuralDecisionTree instances.
        # Each tree will have its own randomly selected input features to use.
        for _ in range(num_trees):
            self.ensemble.append(
                NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)
            )

    def call(self, inputs):
        """Return the mean of the trees' outputs for a batch of features.

        Args:
            inputs: Batch of feature vectors, forwarded unchanged to every tree.

        Returns:
            A ``[batch_size, num_classes]`` tensor: the sum of all tree
            outputs divided by the number of trees.
        """
        # Initialize the outputs: a [batch_size, num_classes] matrix of zeros.
        batch_size = tf.shape(inputs)[0]
        outputs = tf.zeros([batch_size, self.num_classes])

        # Aggregate the outputs of trees in the ensemble.
        for tree in self.ensemble:
            outputs += tree(inputs)

        # Divide the outputs by the ensemble size to get the average.
        outputs /= len(self.ensemble)
        return outputs

In [39]:
# training hyperparameters read by run_experiment
learning_rate = 0.01
batch_size = 265
num_epochs = 10
hidden_units = [64, 64]


# function to experiment models
def run_experiment(model):
    """Compile `model`, train it on train_data.csv and report test accuracy.

    Args:
        model: A Keras model; it is compiled and fitted in place.

    Prints progress messages and the accuracy measured on test_data.csv.
    Returns nothing.
    """
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    loss = keras.losses.SparseCategoricalCrossentropy()
    metrics = [keras.metrics.SparseCategoricalAccuracy()]
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    # training phase
    print("Start training the model...")
    train_dataset = get_dataset_from_csv(
        "train_data.csv", shuffle=True, batch_size=batch_size
    )
    model.fit(train_dataset, epochs=num_epochs)
    print("Model training finished")

    # evaluation phase: evaluate() yields the loss followed by the accuracy
    print("Evaluating the model on the test data...")
    test_dataset = get_dataset_from_csv("test_data.csv", batch_size=batch_size)
    _loss, accuracy = model.evaluate(test_dataset)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

---
# Experiment 1: train a decision tree model

In [40]:
# settings for the single-tree experiment
num_trees = 10
depth = 10
used_features_rate = 1.0
num_classes = len(TARGET_LABELS)


def create_tree_model():
    """Build a Keras model that feeds the encoded inputs into one NeuralDecisionTree.

    Returns:
        A ``keras.Model`` mapping the input dict to the tree's outputs; the
        encoded features are batch-normalized before reaching the tree.
    """
    model_inputs = create_model_inputs()
    encoded = encode_inputs(model_inputs)
    normalized = layers.BatchNormalization()(encoded)
    tree = NeuralDecisionTree(
        depth=depth,
        num_features=normalized.shape[1],
        used_features_rate=used_features_rate,
        num_classes=num_classes,
    )
    return keras.Model(inputs=model_inputs, outputs=tree(normalized))


tree_model = create_tree_model()
run_experiment(tree_model)

Start training the model...
Epoch 1/10


  [n for n in tensors.keys() if n not in ref_input_names])


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model training finished
Evaluating the model on the test data...
Test accuracy: 84.73%


---
# Experiment 2: train a forest model

In [41]:
# settings for the forest experiment
num_trees = 25
depth = 5
used_features_rate = 0.5


def create_forest_model():
    """Build a Keras model that feeds the encoded inputs into a NeuralDecisionForest.

    Returns:
        A ``keras.Model`` mapping the input dict to the forest's outputs; the
        encoded features are batch-normalized before reaching the forest.
    """
    model_inputs = create_model_inputs()
    encoded = encode_inputs(model_inputs)
    normalized = layers.BatchNormalization()(encoded)
    forest = NeuralDecisionForest(
        num_trees=num_trees,
        depth=depth,
        num_features=normalized.shape[1],
        used_features_rate=used_features_rate,
        num_classes=num_classes,
    )
    return keras.Model(inputs=model_inputs, outputs=forest(normalized))


forest_model = create_forest_model()
run_experiment(forest_model)

Start training the model...
Epoch 1/10


  [n for n in tensors.keys() if n not in ref_input_names])


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model training finished
Evaluating the model on the test data...
Test accuracy: 85.69%
