In [1]:
import tensorflow as tf

In [2]:
# Public locations of the Titanic CSV splits (the "eval" file is used as the test split).
TRAIN_DATA_URL, TEST_DATA_URL = (
    "https://storage.googleapis.com/tf-datasets/titanic/train.csv",
    "https://storage.googleapis.com/tf-datasets/titanic/eval.csv",
)

In [3]:
# Fetch each split and keep the local file path that get_file hands back.
train_filepath = tf.keras.utils.get_file(fname='train.csv', origin=TRAIN_DATA_URL)
test_filepath = tf.keras.utils.get_file(fname='test.csv', origin=TEST_DATA_URL)

## Load data

In [4]:
def get_dataset(filepath, **kwargs):
    """Build a batched CSV dataset whose label is the 'survived' column.

    Args:
        filepath: Path (or file pattern) of the CSV file(s) to read.
        **kwargs: Extra keyword arguments forwarded to
            `tf.data.experimental.make_csv_dataset` (for example
            `column_names`, `select_columns`, `column_defaults`). A key that
            matches one of the defaults below replaces that default.

    Returns:
        The dataset created by `tf.data.experimental.make_csv_dataset`.
    """
    options = {
        'batch_size': 32,
        'label_name': 'survived',
        'na_value': '?',  # Additional string to recognize as NA/NaN
        'num_epochs': 1,  # Single pass over the file; set the epoch count in model.fit instead
        'ignore_errors': True,
    }
    # Caller-supplied values win, so overriding e.g. batch_size no longer
    # fails with "got multiple values for keyword argument".
    options.update(kwargs)
    return tf.data.experimental.make_csv_dataset(file_pattern=filepath, **options)

In [5]:
# Load both splits with identical loader settings (train first, then test).
raw_train_dataset, raw_test_dataset = (
    get_dataset(path) for path in (train_filepath, test_filepath)
)

In [6]:
## Illustrate other parameters in make_csv_dataset:

# 1) Supply the column names explicitly.
CSV_COLUMNS = [
    'survived', 'sex', 'age', 'n_siblings_spouses', 'parch',
    'fare', 'class', 'deck', 'embark_town', 'alone',
]
temp_dataset = get_dataset(train_filepath, column_names=CSV_COLUMNS)

# 2) Load only a subset of the columns, with one default value per selected
#    column. Note that `temp_dataset` is rebound here, so only this second
#    dataset remains after the cell has run.
SELECT_COLUMNS = ['survived', 'age', 'n_siblings_spouses', 'parch', 'fare']
DEFAULTS = [0, 0.0, 0.0, 0.0, 0.0]
temp_dataset = get_dataset(
    train_filepath,
    select_columns=SELECT_COLUMNS,
    column_defaults=DEFAULTS,
)

## Data preprocessing

In [7]:
# The CSV dataset stores each batch's features in a dict keyed by column
# name, for example:
# <
# PrefetchDataset shapes:
#     (OrderedDict([
#         (age, (None,)),
#         (n_siblings_spouses, (None,)),
#         (parch, (None,)),
#         (fare, (None,))
#     ]),
#      (None,)),
# types:
#     (OrderedDict([
#         (age, tf.float32),
#         (n_siblings_spouses, tf.float32),
#         (parch, tf.float32),
#         (fare, tf.float32)]),
#      tf.int32)
# >
# The per-column tensors therefore have to be packed together so that every
# sample carries all of its features in one tensor.
def pack(features, label):
    """Stack all feature columns into a single tensor along a new last axis.

    Args:
        features: Mapping from column name to that column's tensor.
        label: Label tensor; passed through unchanged.

    Returns:
        A `(packed_features, label)` tuple.
    """
    columns = list(features.values())
    packed = tf.stack(columns, axis=-1)
    return packed, label

### Continuous data

If you have mixed datatypes, you may want to separate out these simple numeric fields. The `tf.feature_column` API can handle them, but this incurs some overhead and should be avoided unless really necessary.

So define a more general preprocessor that selects a list of numeric features and packs them into a single column:

In [8]:
class PackNumericFeatures:
    """Callable that merges selected numeric feature columns into one 'numeric' entry.

    Intended to be passed to `Dataset.map`; it receives an already batched
    `(features, labels)` pair.
    """

    def __init__(self, names):
        """Remember which feature columns to pack.

        Args:
            names: Keys of the columns to pack, in the order they should
                appear along the last axis of the packed tensor.
        """
        self.names = names

    def __call__(self, features, labels):
        """Replace the named columns with a single stacked float32 tensor.

        Args:
            features: Dict-like mapping (e.g. an OrderedDict) from column
                name to tensor.
            labels: Label tensor; returned unchanged.

        Returns:
            `(features, labels)` where `features` is a copy of the input
            mapping without the named columns and with a new 'numeric' key.

        Raises:
            KeyError: If one of the configured names is missing from `features`.
        """
        # Work on a shallow copy: popping from the caller's mapping would make
        # a second call on the same batch fail with KeyError.
        features = features.copy()
        numeric_features = [
            tf.cast(features.pop(name), tf.float32) for name in self.names
        ]
        features['numeric'] = tf.stack(numeric_features, axis=-1)

        return features, labels

In [9]:
NUMERIC_FEATURES = ['age', 'n_siblings_spouses', 'parch', 'fare']

# The packer only stores the column names, so one instance can serve both splits.
pack_numeric_features = PackNumericFeatures(NUMERIC_FEATURES)
packed_train_data = raw_train_dataset.map(pack_numeric_features)
packed_test_data = raw_test_dataset.map(pack_numeric_features)

__Data Normalization__

In [10]:
import pandas as pd

# Summary statistics are computed from the training split only.
train_numeric_frame = pd.read_csv(train_filepath)[NUMERIC_FEATURES]
desc = train_numeric_frame.describe()

# One value per numeric feature, in NUMERIC_FEATURES order.
MEAN = desc.loc['mean'].values
STD = desc.loc['std'].values

In [11]:
def normalize_numeric_data(mean, std):
    """Create a normalizer bound to the given statistics.

    Args:
        mean: Value(s) subtracted from the data.
        std: Value(s) the centered data is divided by.

    Returns:
        A one-argument function computing `(data - mean) / std`.
    """
    def _normalize_numeric_data(data):
        """Center `data` on `mean`, then scale it by `std`."""
        centered = data - mean
        return centered / std

    return _normalize_numeric_data

In [12]:
# Bind the training-set statistics once and hand the resulting function to the column.
numeric_normalizer = normalize_numeric_data(MEAN, STD)

numeric_column = tf.feature_column.numeric_column(
    key='numeric',  # Same key that PackNumericFeatures writes into the features dict
    shape=(len(NUMERIC_FEATURES),),  # One slot per packed numeric feature
    normalizer_fn=numeric_normalizer,
)
numeric_columns = [numeric_column]

In [13]:
# The dataset's features are an OrderedDict, so:
#   1. the feature columns fetch their values from it by `key`;
#   2. the DenseFeatures layer converts them into a dense tensor
#      (concatenating all of the dense tensors it is given).
numeric_layer = tf.keras.layers.DenseFeatures(feature_columns=numeric_columns)

### Categorical data

Use the `tf.feature_column` API to create a collection with a `tf.feature_column.indicator_column` for each categorical column.

In [14]:
# Vocabulary for each categorical feature. The position of a value inside its
# list is the index of its one-hot slot; values missing from the list are not
# assigned a slot.
CATEGORIES = {
    'sex': ['male', 'female'],
    'class': ['First', 'Second', 'Third'],
    'deck': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
    # Spelled 'Southampton' as in the CSV; the former 'Southhampton' never matched any row.
    'embark_town': ['Cherbourg', 'Southampton', 'Queenstown'],
    'alone': ['y', 'n'],
}

In [15]:
# Wrap every vocabulary-backed categorical column in an indicator (one-hot)
# column, keeping the iteration order of CATEGORIES.
categorical_columns = [
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key=name,
            vocabulary_list=vocabulary,
        )
    )
    for name, vocabulary in CATEGORIES.items()
]

In [16]:
# Dense layer that applies all of the categorical indicator columns.
categorical_layer = tf.keras.layers.DenseFeatures(feature_columns=categorical_columns)

In [None]:
# Take one batch of packed training data to inspect the layer's output.
example_batch, labels_batch = next(iter(packed_train_data))

# The 20 is the combined width of all the one-hot encodings above,
# concatenated together (2 + 3 + 10 + 3 + 2).
# The 32 is just the batch size.
categorical_layer(example_batch).shape

### Combined preprocessing layer

In [19]:
# One DenseFeatures layer over every feature column, categorical and numeric alike.
all_feature_columns = [*categorical_columns, *numeric_columns]
preprocessing_layer = tf.keras.layers.DenseFeatures(all_feature_columns)

## Build the model

In [20]:
# Preprocessing layer followed by two hidden layers. The final Dense(1) has no
# activation, so the model outputs a raw logit per sample.
model = tf.keras.Sequential([
    preprocessing_layer,
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1),
])
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    # The outputs are logits, so the decision boundary is 0 (probability 0.5).
    # The plain 'accuracy' string would threshold the logits at 0.5 instead.
    # The metric keeps the name 'accuracy' so logs and history keys are unchanged.
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)],
)

## Train, evaluate, and predict

In [24]:
# Only the training data is shuffled; the test data is used as-is.
SHUFFLE_BUFFER_SIZE = 500
train_data = packed_train_data.shuffle(buffer_size=SHUFFLE_BUFFER_SIZE)
test_data = packed_test_data

In [25]:
# Train for a single pass over the shuffled training data.
model.fit(x=train_data, epochs=1)

Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


<tensorflow.python.keras.callbacks.History at 0x7f95901a3750>

In [137]:
# The model takes the raw feature dict as input: the numeric feature
# normalization and the categorical feature preprocessing are both done
# inside the preprocessing layer.
model.inputs

{'alone': <tf.Tensor 'alone:0' shape=(None, 1) dtype=string>,
 'class': <tf.Tensor 'class:0' shape=(None, 1) dtype=string>,
 'deck': <tf.Tensor 'deck:0' shape=(None, 1) dtype=string>,
 'embark_town': <tf.Tensor 'embark_town:0' shape=(None, 1) dtype=string>,
 'numeric': <tf.Tensor 'numeric:0' shape=(None, 4) dtype=float32>,
 'sex': <tf.Tensor 'sex:0' shape=(None, 1) dtype=string>}