In [7]:
import numpy as np
import tensorflow as tf
#!pip install tensorflow_datasets
import tensorflow_datasets as tfds

In [8]:
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

train_file_path = tf.keras.utils.get_file("train.csv", TRAIN_DATA_URL)
test_file_path = tf.keras.utils.get_file("eval.csv", TEST_DATA_URL)

In [9]:
# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)

In [10]:
!head {train_file_path}

'head' is not recognized as an internal or external command,
operable program or batch file.


In [11]:
# CSV columns in the input file.
with open(train_file_path, 'r') as f:
    names_row = f.readline()


CSV_COLUMNS = names_row.rstrip('\n').split(',')
print(CSV_COLUMNS)

['survived', 'sex', 'age', 'n_siblings_spouses', 'parch', 'fare', 'class', 'deck', 'embark_town', 'alone']


In [13]:
LABELS = [0, 1]
LABEL_COLUMN = 'survived'

FEATURE_COLUMNS = [column for column in CSV_COLUMNS if column != LABEL_COLUMN]

In [44]:
def get_dataset(file_path):
    dataset = tf.data.experimental.make_csv_dataset(
      file_path,
      batch_size=12, # Artificially small to make examples easier to show.
      label_name=LABEL_COLUMN,
      na_value="?",
      num_epochs=1,
      ignore_errors=True)
    return dataset

raw_train_data = get_dataset(train_file_path)
raw_test_data = get_dataset(test_file_path)

In [45]:
examples, labels = next(iter(raw_train_data)) # Just the first batch.
print("EXAMPLES: \n", examples, "\n")
print("LABELS: \n", labels)

EXAMPLES: 
 OrderedDict([('sex', <tf.Tensor: id=14580, shape=(12,), dtype=string, numpy=
array([b'male', b'male', b'female', b'male', b'female', b'female',
       b'male', b'female', b'female', b'male', b'male', b'male'],
      dtype=object)>), ('age', <tf.Tensor: id=14572, shape=(12,), dtype=float32, numpy=
array([28., 35., 28., 46., 41.,  9., 28., 22., 28., 28., 28., 37.],
      dtype=float32)>), ('n_siblings_spouses', <tf.Tensor: id=14578, shape=(12,), dtype=int32, numpy=array([0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0, 2])>), ('parch', <tf.Tensor: id=14579, shape=(12,), dtype=int32, numpy=array([0, 0, 0, 0, 2, 2, 1, 2, 0, 0, 0, 0])>), ('fare', <tf.Tensor: id=14577, shape=(12,), dtype=float32, numpy=
array([ 7.896,  7.05 ,  7.229, 79.2  , 20.212, 31.275, 15.246, 49.5  ,
        7.75 ,  7.896,  8.05 ,  7.925], dtype=float32)>), ('class', <tf.Tensor: id=14574, shape=(12,), dtype=string, numpy=
array([b'Third', b'Third', b'Third', b'First', b'Third', b'Third',
       b'Third', b'First', b'Third'

In [46]:
CATEGORIES = {
    'sex': ['male', 'female'],
    'class' : ['First', 'Second', 'Third'],
    'deck' : ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
    'embark_town' : ['Cherbourg', 'Southhampton', 'Queenstown'],
    'alone' : ['y', 'n']
}

In [47]:
def process_categorical_data(data, categories):
    """Returns a one-hot encoded tensor representing categorical values."""
  
    # Remove leading ' '.
    data = tf.strings.regex_replace(data, '^ ', '')
    # Remove trailing '.'.
    data = tf.strings.regex_replace(data, r'\.$', '')
  
    # ONE HOT ENCODE
    # Reshape data from 1d (a list) to a 2d (a list of one-element lists)
    data = tf.reshape(data, [-1, 1])
    # For each element, create a new list of boolean values the length of categories,
    # where the truth value is element == category label
    data = tf.equal(categories, data)
    # Cast booleans to floats.
    data = tf.cast(data, tf.float32)
  
    # The entire encoding can fit on one line:
    # data = tf.cast(tf.equal(categories, tf.reshape(data, [-1, 1])), tf.float32)
    return data

In [48]:
class_tensor = examples['class']
class_tensor

<tf.Tensor: id=14574, shape=(12,), dtype=string, numpy=
array([b'Third', b'Third', b'Third', b'First', b'Third', b'Third',
       b'Third', b'First', b'Third', b'Third', b'Third', b'Third'],
      dtype=object)>

In [49]:

class_categories = CATEGORIES['class']
class_categories

['First', 'Second', 'Third']

In [50]:
processed_class = process_categorical_data(class_tensor, class_categories)
processed_class

<tf.Tensor: id=14599, shape=(12, 3), dtype=float32, numpy=
array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]], dtype=float32)>

In [51]:
print("Size of batch: ", len(class_tensor.numpy()))
print("Number of category labels: ", len(class_categories))
print("Shape of one-hot encoded tensor: ", processed_class.shape)

Size of batch:  12
Number of category labels:  3
Shape of one-hot encoded tensor:  (12, 3)


In [52]:
def process_continuous_data(data, mean):
    # Normalize data
    data = tf.cast(data, tf.float32) * 1/(2*mean)
    return tf.reshape(data, [-1, 1])

In [53]:
MEANS = {
    'age' : 29.631308,
    'n_siblings_spouses' : 0.545455,
    'parch' : 0.379585,
    'fare' : 34.385399
}

In [54]:
age_tensor = examples['age']
age_tensor

<tf.Tensor: id=14572, shape=(12,), dtype=float32, numpy=
array([28., 35., 28., 46., 41.,  9., 28., 22., 28., 28., 28., 37.],
      dtype=float32)>

In [55]:
process_continuous_data(age_tensor, MEANS['age'])

<tf.Tensor: id=14608, shape=(12, 1), dtype=float32, numpy=
array([[0.472],
       [0.591],
       [0.472],
       [0.776],
       [0.692],
       [0.152],
       [0.472],
       [0.371],
       [0.472],
       [0.472],
       [0.472],
       [0.624]], dtype=float32)>

In [57]:
def preprocess(features, labels):
  
    # Process categorial features.
    for feature in CATEGORIES.keys():
        features[feature] = process_categorical_data(features[feature],
                                                 CATEGORIES[feature])

    # Process continuous features.
    for feature in MEANS.keys():
        features[feature] = process_continuous_data(features[feature],
                                                MEANS[feature])
  
    # Assemble features into a single tensor.
    features = tf.concat([features[column] for column in FEATURE_COLUMNS], 1)
  
    return features, labels

In [58]:
train_data = raw_train_data.map(preprocess).shuffle(500)
test_data = raw_test_data.map(preprocess)

In [59]:
examples, labels = next(iter(train_data))

examples, labels

(<tf.Tensor: id=14775, shape=(12, 24), dtype=float32, numpy=
 array([[1.   , 0.   , 0.81 , 0.   , 0.   , 0.386, 1.   , 0.   , 0.   ,
         0.   , 0.   , 0.   , 0.   , 1.   , 0.   , 0.   , 0.   , 0.   ,
         0.   , 0.   , 0.   , 0.   , 1.   , 0.   ],
        [1.   , 0.   , 0.405, 0.   , 0.   , 0.189, 0.   , 1.   , 0.   ,
         0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
         0.   , 0.   , 0.   , 0.   , 1.   , 0.   ],
        [1.   , 0.   , 0.472, 0.   , 0.   , 0.123, 0.   , 0.   , 1.   ,
         0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
         0.   , 0.   , 0.   , 1.   , 1.   , 0.   ],
        [0.   , 1.   , 0.692, 0.   , 6.586, 0.577, 0.   , 0.   , 1.   ,
         0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
         0.   , 0.   , 0.   , 0.   , 0.   , 1.   ],
        [1.   , 0.   , 0.27 , 0.   , 0.   , 0.117, 0.   , 0.   , 1.   ,
         0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
         0.

In [61]:
def get_model(input_dim, hidden_units=[100]):
    """Create a Keras model with layers.

  Args:
    input_dim: (int) The shape of an item in a batch. 
    labels_dim: (int) The shape of a label.
    hidden_units: [int] the layer sizes of the DNN (input layer first)
    learning_rate: (float) the learning rate for the optimizer.

  Returns:
    A Keras model.
    """
    inputs = tf.keras.Input(shape=(input_dim,))
    x = inputs

    for units in hidden_units:
        x = tf.keras.layers.Dense(units, activation='relu')(x)
    outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

    model = tf.keras.Model(inputs, outputs)
 
    return model

In [62]:
input_shape, output_shape = train_data.output_shapes

input_dimension = input_shape.dims[1] # [0] is the batch size

In [63]:
model = get_model(input_dimension)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'])

model.fit(train_data, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20


Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1fd1dfe3dd8>

In [64]:
test_loss, test_accuracy = model.evaluate(test_data)
print('Test Loss {0}, Test Accuracy {1}\n'.format(test_loss, test_accuracy))

     22/Unknown - 0s 133ms/step - loss: 0.7832 - accuracy: 0.583 - 0s 70ms/step - loss: 0.6111 - accuracy: 0.666 - 0s 50ms/step - loss: 0.5632 - accuracy: 0.72 - 0s 39ms/step - loss: 0.5418 - accuracy: 0.70 - 0s 33ms/step - loss: 0.5353 - accuracy: 0.71 - 0s 29ms/step - loss: 0.5366 - accuracy: 0.70 - 0s 26ms/step - loss: 0.4842 - accuracy: 0.75 - 0s 24ms/step - loss: 0.4696 - accuracy: 0.75 - 0s 22ms/step - loss: 0.4918 - accuracy: 0.73 - 0s 21ms/step - loss: 0.4808 - accuracy: 0.74 - 0s 20ms/step - loss: 0.4635 - accuracy: 0.75 - 0s 19ms/step - loss: 0.4848 - accuracy: 0.75 - 0s 18ms/step - loss: 0.4870 - accuracy: 0.74 - 0s 17ms/step - loss: 0.4804 - accuracy: 0.74 - 0s 17ms/step - loss: 0.4710 - accuracy: 0.75 - 0s 16ms/step - loss: 0.4771 - accuracy: 0.76 - 0s 16ms/step - loss: 0.4687 - accuracy: 0.75 - 0s 16ms/step - loss: 0.4636 - accuracy: 0.76 - 0s 15ms/step - loss: 0.4563 - accuracy: 0.77 - 0s 15ms/step - loss: 0.4508 - accuracy: 0.77 - 0s 15ms/step - loss: 0.4454 - accuracy:

In [65]:
predictions = model.predict(test_data)

# Show some results
for prediction, survived in zip(predictions[:10], list(test_data)[0][1][:10]):
  print("Predicted survival: {:.2%}".format(prediction[0]),
        " | Actual outcome: ",
        ("SURVIVED" if bool(survived) else "DIED"))


Predicted survival: 67.53%  | Actual outcome:  DIED
Predicted survival: 37.30%  | Actual outcome:  DIED
Predicted survival: 99.11%  | Actual outcome:  SURVIVED
Predicted survival: 79.45%  | Actual outcome:  DIED
Predicted survival: 13.62%  | Actual outcome:  DIED
Predicted survival: 80.49%  | Actual outcome:  DIED
Predicted survival: 32.00%  | Actual outcome:  SURVIVED
Predicted survival: 13.41%  | Actual outcome:  DIED
Predicted survival: 13.02%  | Actual outcome:  DIED
Predicted survival: 58.34%  | Actual outcome:  SURVIVED
