In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow import keras
from keras import layers

In [2]:
train_data_all = pd.read_csv("../data/train.csv")
test_data_all = pd.read_csv("../data/test.csv")

In [3]:
# replace missing values in training data

def filter_df(df):

    df_filtered = pd.DataFrame()

    for column in df.columns:
        dtype = df[column].dtype

        # use -1 for numerical data, "NA" for strings (objects)
        if dtype == "int64" or dtype == "float64":
            val = -1
        else:
            val = "NA"
        df[column] = df[column].fillna(val)

    # ignore name, and ticket (for now)
    df_filtered = df.drop(columns=["Name", "Ticket"])

    # convert age to float for treatment as continuous variable
    df_filtered["Age"] = df_filtered["Age"].astype('float64')

    return df_filtered

In [4]:
train_data_filtered = filter_df(train_data_all).drop(columns="PassengerId")

test_data_filtered = filter_df(test_data_all)
test_ids = test_data_filtered.pop("PassengerId")

In [5]:
# split into training and validation datasets

val_frac = .2
val_df = train_data_filtered.sample(frac=0.2)
train_df = train_data_filtered.drop(val_df.index)

In [6]:
# convert pd DataFrame to tf Dataset labelled with answers
def dataframe_to_dataset(df):
    
    df = df.copy()
    labels = df.pop("Survived")
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
    ds = ds.shuffle(buffer_size=len(df))

    return ds

In [7]:
# convert to Datasets
train_ds = dataframe_to_dataset(train_df)
val_ds = dataframe_to_dataset(val_df)

# batch Datasets
batch_size = 32
train_ds = train_ds.batch(batch_size)
val_ds = val_ds.batch(batch_size)

In [8]:
# convert test data
test_ds = tf.data.Dataset.from_tensor_slices(dict(test_data_filtered))

In [9]:
from tensorflow.keras.layers import StringLookup

def encode_string_feature(feature, name, feature_ds):

    lookup = StringLookup(output_mode="one_hot")
    lookup.adapt(feature_ds)

    return lookup(feature)

In [10]:
from tensorflow.keras.layers import IntegerLookup

def encode_integer_feature(feature, name, feature_ds):

    lookup = IntegerLookup(output_mode="one_hot")
    lookup.adapt(feature_ds)

    return lookup(feature)

In [11]:
from tensorflow.keras.layers import Normalization

def encode_float_feature(feature, name, feature_ds):

    normalizer = Normalization()
    normalizer.adapt(feature_ds)
    
    return normalizer(feature)

In [12]:
def encode_feature(feature, name, dataset):

    # construct dataset with only given feature
    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

    # build encoder based on feature dtype
    dtype = feature_ds.element_spec.dtype

    if dtype == tf.string:
        encoded_feature = encode_string_feature(feature, name, feature_ds)
    elif dtype == tf.int64:
        encoded_feature = encode_integer_feature(feature, name, feature_ds)
    elif dtype == tf.float64:
        encoded_feature = encode_float_feature(feature, name, feature_ds)
    else:
        print("Unexpected datatype: " + str(dtype))
        encoded_feature = "AAAAaaaAA"

    return encoded_feature


In [13]:
# build input layers based on feature specs from ds

all_inputs = dict()

features = train_ds.element_spec[0]

for name, spec in features.items():
    all_inputs[name] = keras.Input(
        shape=(1,),
        name=name,
        dtype=spec.dtype
    )

In [14]:
# build individual encoding layers for each feature

encodings = dict()

for name, spec in features.items():
    encodings[name] = encode_feature(
        all_inputs[name],
        name,
        train_ds
    )

# concatenate feature encodings

all_features = layers.concatenate(encodings.values())

2024-03-07 16:35:41.139577: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [15]:
# build dense network from encoded features to predicted survival

x = layers.Dense(128, activation="relu")(all_features)
x = layers.Dropout(0.2)(x)

x = layers.Dense(32, activation="relu")(x)

inputs = list(all_inputs.values())
output = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs, output)
model.compile("adam", "binary_crossentropy", metrics=["accuracy"])

In [16]:
# train model
model.fit(train_ds, epochs=3)

Epoch 1/3


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x17ef0ab60>

In [21]:
# Create callback to save the best model
best_model_path = "best_model.h5"
best_model_callback = tf.keras.callbacks.ModelCheckpoint(best_model_path, monitor='val_accuracy', save_best_only=True, mode='max')

# Train the model
history = model.fit(train_ds, epochs=10, validation_data=val_ds, callbacks=[best_model_callback])

# Load the best model
best_model = tf.keras.models.load_model(best_model_path)

Epoch 1/10
 1/23 [>.............................] - ETA: 0s - loss: 0.5726 - accuracy: 0.6875

NotImplementedError: Save or restore weights that is not an instance of `tf.Variable` is not supported in h5, use `save_format='tf'` instead. Received a model or layer IntegerLookup with weights [<keras.layers.preprocessing.index_lookup.VocabWeightHandler object at 0x17ef0b970>]

In [22]:
history = model.fit(train_ds, epochs=10, validation_data=val_ds)

# Get the epoch with the best validation accuracy
best_epoch = np.argmax(history.history['val_accuracy']) + 1

# Save the model with the best validation accuracy
model.save('best_model_epoch_{}'.format(best_epoch))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: best_model_epoch_8/assets


INFO:tensorflow:Assets written to: best_model_epoch_8/assets


In [17]:
test_ds = tf.data.Dataset.from_tensor_slices(dict(test_data_filtered))

In [18]:
predictions = model.predict(dict(test_data_filtered))



In [19]:
labelled_predictions = pd.DataFrame()
labelled_predictions["PassengerId"] = test_ids
labelled_predictions["Survived"] = [round(pred[0]) for pred in predictions]

In [20]:
labelled_predictions.to_csv("../submissions/deep_submission.csv", index=False)