In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow import keras
from keras import layers

2023-07-13 14:28:30.939310: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-13 14:28:31.053442: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-13 14:28:31.054406: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the raw train and test tables; both paths are relative to the notebook's working directory.
train_data_all = pd.read_csv("../data/train.csv")
test_data_all = pd.read_csv("../data/test.csv")

In [3]:
# replace missing values and drop unused columns (applied to both the training and test data)

def filter_df(df):
    """Return a cleaned copy of ``df`` with missing values filled in.

    Missing entries are replaced with ``-1`` in numeric columns and with the
    string ``"NA"`` in every other column. The ``Name`` and ``Ticket`` columns
    are dropped (ignored for now), and ``Age`` is cast to ``float64`` so it is
    treated as a continuous variable.

    The input frame is never modified: all work happens on a copy, so the
    function can be re-run on the same raw frame and gives the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table; must contain ``Name``, ``Ticket`` and ``Age`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame without ``Name``/``Ticket`` and without missing values.

    Raises
    ------
    KeyError
        If ``Name``, ``Ticket`` or ``Age`` is not a column of ``df``.
    """
    # work on a copy so the caller's raw frame keeps its NaNs
    filled = df.copy()

    for column in filled.columns:
        # use -1 for numerical data, "NA" for strings (objects)
        if pd.api.types.is_numeric_dtype(filled[column].dtype):
            fill_value = -1
        else:
            fill_value = "NA"
        filled[column] = filled[column].fillna(fill_value)

    # ignore name, and ticket (for now)
    df_filtered = filled.drop(columns=["Name", "Ticket"])

    # convert age to float for treatment as continuous variable
    df_filtered["Age"] = df_filtered["Age"].astype("float64")

    return df_filtered

In [4]:
# training rows: clean, then discard the identifier column
train_data_filtered = filter_df(train_data_all)
train_data_filtered = train_data_filtered.drop(columns=["PassengerId"])

# test rows: clean, then move the identifier out of the feature frame;
# the ids are kept separately so predictions can be labelled with them
test_data_filtered = filter_df(test_data_all)
test_ids = test_data_filtered.pop("PassengerId")

In [5]:
# split into training and validation datasets

# fraction of the labelled rows held out for validation
val_frac = .2
# fixed seed so the same rows are held out on every Restart & Run All
split_seed = 42

val_df = train_data_filtered.sample(frac=val_frac, random_state=split_seed)
# everything that was not sampled for validation is used for training
train_df = train_data_filtered.drop(val_df.index)

In [6]:
# convert pd DataFrame to tf Dataset labelled with answers
def dataframe_to_dataset(df):
    """Build a shuffled tf.data.Dataset of (features, label) pairs from a DataFrame.

    The "Survived" column is split off as the label and the remaining columns
    are passed as a dict of per-column features. The caller's frame is left
    untouched because the column is popped from a copy.
    """
    features = df.copy()
    target = features.pop("Survived")

    # the shuffle buffer is as large as the frame, so every row is in the buffer
    n_rows = len(features)
    slices = tf.data.Dataset.from_tensor_slices((dict(features), target))
    return slices.shuffle(buffer_size=n_rows)

In [7]:
# wrap each split as a Dataset and group its elements into fixed-size batches
batch_size = 32
train_ds = dataframe_to_dataset(train_df).batch(batch_size)
val_ds = dataframe_to_dataset(val_df).batch(batch_size)

2023-07-13 14:28:34.268357: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-07-13 14:28:34.345082: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [8]:
# convert test data
# features only (no label component); the dataset is not shuffled or batched here
test_ds = tf.data.Dataset.from_tensor_slices(dict(test_data_filtered))

In [9]:
from tensorflow.keras.layers import StringLookup

def encode_string_feature(feature, name, feature_ds):
    """One-hot encode a string-valued input.

    A StringLookup layer is adapted on `feature_ds` and then applied to
    `feature`; the result of that call is returned. `name` is accepted for
    signature parity with the other encoders but is not used.
    """
    string_encoder = StringLookup(output_mode="one_hot")
    string_encoder.adapt(feature_ds)
    encoded = string_encoder(feature)
    return encoded

In [10]:
from tensorflow.keras.layers import IntegerLookup

def encode_integer_feature(feature, name, feature_ds):
    """One-hot encode an integer-valued input.

    An IntegerLookup layer is adapted on `feature_ds` and then applied to
    `feature`; the result of that call is returned. `name` is accepted for
    signature parity with the other encoders but is not used.
    """
    integer_encoder = IntegerLookup(output_mode="one_hot")
    integer_encoder.adapt(feature_ds)
    encoded = integer_encoder(feature)
    return encoded

In [11]:
from tensorflow.keras.layers import Normalization

def encode_float_feature(feature, name, feature_ds):
    """Normalize a float-valued input.

    A Normalization layer is adapted on `feature_ds` and then applied to
    `feature`; the result of that call is returned. `name` is accepted for
    signature parity with the other encoders but is not used.
    """
    scaler = Normalization()
    scaler.adapt(feature_ds)
    scaled = scaler(feature)
    return scaled

In [12]:
def encode_feature(feature, name, dataset):
    """Encode one input feature with an encoder chosen from its dtype.

    Parameters
    ----------
    feature
        The Keras input tensor for this feature.
    name : str
        Key of the feature inside the dataset's feature dict.
    dataset
        Dataset of ``(features, label)`` elements; the values stored under
        ``name`` are used to adapt the encoder.

    Returns
    -------
    The encoded feature: one-hot for ``tf.string`` and ``tf.int64`` features,
    normalized for ``tf.float64`` features.

    Raises
    ------
    TypeError
        If the feature's dtype is none of the three supported ones. Failing
        here names the offending feature instead of handing a placeholder
        value to the layers built afterwards.
    """
    # construct dataset with only given feature
    feature_ds = dataset.map(lambda x, y: x[name])
    # add a trailing axis to every element before adapting the encoder on it
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

    # build encoder based on feature dtype
    dtype = feature_ds.element_spec.dtype

    if dtype == tf.string:
        return encode_string_feature(feature, name, feature_ds)
    if dtype == tf.int64:
        return encode_integer_feature(feature, name, feature_ds)
    if dtype == tf.float64:
        return encode_float_feature(feature, name, feature_ds)

    raise TypeError(f"Unexpected datatype for feature {name!r}: {dtype}")


In [13]:
# build input layers based on feature specs from ds

# the first component of each dataset element is the feature dict
features = train_ds.element_spec[0]

# one single-valued Keras input per feature, named and typed after its spec
all_inputs = {
    feature_name: keras.Input(shape=(1,), name=feature_name, dtype=feature_spec.dtype)
    for feature_name, feature_spec in features.items()
}

In [14]:
# build individual encoding layers for each feature

encodings = dict()

# only the feature names are needed here: encode_feature derives the dtype from the dataset
for name in features:
    encodings[name] = encode_feature(
        all_inputs[name],
        name,
        train_ds
    )

# concatenate feature encodings
# (materialise the dict view: layers.concatenate is documented to take a list of tensors)

all_features = layers.concatenate(list(encodings.values()))

2023-07-13 14:28:43.494890: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype double and shape [713]
	 [[{{node Placeholder/_0}}]]
2023-07-13 14:28:43.495264: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_5' with dtype int64 and shape [713]
	 [[{{node Placeholder/_5}}]]
2023-07-13 14:28:43.616405: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_4' with dtype int64 and shape [713]
	 [

In [15]:
# dense network mapping the concatenated encodings to a predicted survival value

# hidden stack: 128 units -> dropout -> 32 units
x = layers.Dense(128, activation="relu")(all_features)
x = layers.Dropout(0.2)(x)
x = layers.Dense(32, activation="relu")(x)

# single sigmoid unit as the output
output = layers.Dense(1, activation="sigmoid")(x)

inputs = list(all_inputs.values())
model = keras.Model(inputs=inputs, outputs=output)
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [16]:
# train model
# pass the held-out split so loss/accuracy on unseen rows is reported after every epoch
model.fit(train_ds, epochs=50, validation_data=val_ds)

Epoch 1/50


2023-07-13 14:28:46.427272: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_6' with dtype string and shape [713]
	 [[{{node Placeholder/_6}}]]
2023-07-13 14:28:46.427756: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_3' with dtype double and shape [713]
	 [[{{node Placeholder/_3}}]]


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f88c0640c90>

In [46]:
# NOTE(review): same assignment as the test_ds built in cell In [8]; the predict call
# in the next cell passes dict(test_data_filtered) directly rather than this dataset.
test_ds = tf.data.Dataset.from_tensor_slices(dict(test_data_filtered))

In [47]:
# run inference on the test features, passed as a dict of columns keyed by feature name
# (presumably one row of output per test passenger, since pred[0] is read later; confirm the shape)
predictions = model.predict(dict(test_data_filtered))



In [49]:
# pair every test passenger id with the model output rounded to the nearest integer
labelled_predictions = pd.DataFrame(
    {
        "PassengerId": test_ids,
        "Survived": [round(pred[0]) for pred in predictions],
    }
)

In [88]:
# write the submission file without the DataFrame index (path is relative to the notebook's working directory)
labelled_predictions.to_csv("../submissions/deep_submission.csv", index=False)