In [30]:
import tensorflow as tf
import pandas as pd

from IPython.display import clear_output

## Load the titanic dataset

In [2]:
# Training split: read the CSV, then detach the label column.
# `pop` removes 'survived' from the frame and returns it.
train_df = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/train.csv')
train_y = train_df.pop('survived')

# Evaluation split: same treatment.
eval_df = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/eval.csv')
eval_y = eval_df.pop('survived')

In [3]:
# Set TensorFlow's global random seed to a fixed value (123).
tf.random.set_seed(123)

## Create feature columns and input functions

__NOTE__:

`categorical_column_with_vocabulary_list` maps each `string` value to a `numeric` ID that identifies its position in the vocabulary, e.g. ['cat', 'dog', ...] becomes [0, 1, ...].

`indicator_column` is used to transform the `numeric` values into one-hot encoded `Dense` values.

In [4]:
# Columns treated as categorical features (one-hot encoded further down).
CATEGORICAL_COLUMNS = [
    'sex',
    'n_siblings_spouses',
    'parch',
    'class',
    'deck',
    'embark_town',
    'alone',
]

# Columns treated as numeric features.
NUMERIC_COLUMNS = [
    'age',
    'fare',
]

def one_hot_cat_columns(feature_name, vocab):
    """Build a one-hot (indicator) feature column for a categorical feature.

    Args:
        feature_name: Key of the feature in the input features.
        vocab: Values the feature can take; passed on as the vocabulary list.

    Returns:
        An indicator column wrapping a vocabulary-list categorical column.
    """
    categorical = tf.feature_column.categorical_column_with_vocabulary_list(
        key=feature_name, vocabulary_list=vocab
    )
    return tf.feature_column.indicator_column(categorical)

# Categorical features: each column's vocabulary is the set of distinct values
# found in the training frame, and the column is one-hot encoded.
feature_columns = [
    one_hot_cat_columns(name, train_df[name].unique())
    for name in CATEGORICAL_COLUMNS
]

# Numeric features follow the categorical ones and are read as float32.
feature_columns += [
    tf.feature_column.numeric_column(name, dtype=tf.float32)
    for name in NUMERIC_COLUMNS
]

In [27]:
# View the transformation on the first training row.
example = dict(train_df.head(1))

# Indicator (one-hot) column over the vocabulary 'First' / 'Second' / 'Third'.
class_fc = tf.feature_column.indicator_column(
    tf.feature_column.categorical_column_with_vocabulary_list(
        'class', ('First', 'Second', 'Third')
    )
)

# Show the raw value next to its dense one-hot encoding.
print('Feature value: "{}"'.format(example['class'].iloc[0]))
print('One-hot encoded: ', tf.keras.layers.DenseFeatures([class_fc])(example).numpy())

Feature value: "Third"
One-hot encoded:  [[0. 0. 1.]]


In [29]:
# Number of training labels; used below as both the shuffle buffer size and the batch size.
NUM_EXAMPLES = len(train_y)

def make_input_fn(X, y, num_epochs=None, shuffle=True):
    """Return a zero-argument function that builds a `tf.data.Dataset`.

    Args:
        X: Feature table exposing `to_dict('list')` (e.g. a DataFrame).
        y: Labels paired with the features.
        num_epochs: Forwarded to `Dataset.repeat`; for training, leave it as
            `None` so the dataset is cycled as many times as needed.
        shuffle: Whether to shuffle with a buffer of `NUM_EXAMPLES` elements.

    Returns:
        A callable that, when invoked, yields the dataset batched in chunks
        of `NUM_EXAMPLES` elements.
    """

    def input_fn():
        features = X.to_dict('list')
        ds = tf.data.Dataset.from_tensor_slices((features, y))
        if shuffle:
            ds = ds.shuffle(NUM_EXAMPLES)
        # Repeat first, then batch: in-memory training does not use
        # mini-batching, so every batch holds NUM_EXAMPLES elements.
        return ds.repeat(num_epochs).batch(NUM_EXAMPLES)

    return input_fn

# Training input: relies on the defaults (shuffled, num_epochs=None).
train_input_fn = make_input_fn(X=train_df, y=train_y)
# Evaluation input: a single, unshuffled pass over the evaluation split.
eval_input_fn = make_input_fn(X=eval_df, y=eval_y, shuffle=False, num_epochs=1)

## Train and evaluate the model

In [35]:
# The input function batches by NUM_EXAMPLES, so one batch is used per tree layer.
n_batches = 1

# Binary boosted-trees classifier over the feature columns built earlier.
est = tf.estimator.BoostedTreesClassifier(
    n_classes=2,
    n_batches_per_layer=n_batches,
    feature_columns=feature_columns,
)

# Clear whatever the constructor printed to the cell output.
clear_output()

In [36]:
# Fit the estimator on the training input, capped at 100 steps.
est.train(train_input_fn, max_steps=100)

# Compute metrics on the evaluation input.
result = est.evaluate(eval_input_fn)

# Clear the training/evaluation logs, then display the metrics as the cell's output.
clear_output()
result

{'accuracy': 0.8181818,
 'accuracy_baseline': 0.625,
 'auc': 0.86917657,
 'auc_precision_recall': 0.8533196,
 'average_loss': 0.41656974,
 'label/mean': 0.375,
 'loss': 0.41656974,
 'precision': 0.7802198,
 'prediction/mean': 0.3715717,
 'recall': 0.7171717,
 'global_step': 100}