# Spaceship Titanic Kaggle Competition

In [189]:
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow import keras
from keras import layers

### Read data

In [194]:
# load the raw train / test tables from CSV

train_path, test_path = "../data/train.csv", "../data/test.csv"
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

In [195]:
# drop identifier columns and break "Cabin" ("a/b/c") into three separate columns

train_cabin = train_df["Cabin"].str.split("/", expand=True)
train_cabin.columns = ["Cabin_Deck", "Cabin_Num", "Cabin_Side"]

# the split parts share train_df's index, so a plain join appends them at the end
train_df = train_df.drop(columns=["PassengerId", "Name", "Cabin"]).join(train_cabin)

In [196]:
# keep the passenger ids aside: they are needed to label the predictions later
test_ids = test_df["PassengerId"]

test_cabin = test_df["Cabin"].str.split("/", expand=True)
test_cabin.columns = ["Cabin_Deck", "Cabin_Num", "Cabin_Side"]

# the split parts share test_df's index, so a plain join appends them at the end
test_df = test_df.drop(columns=["PassengerId", "Name", "Cabin"]).join(test_cabin)

In [197]:
# fill missing values

def filter_missing(df):
    """Return a copy of ``df`` with missing values replaced by a per-dtype default.

    The fill value is chosen from each column's dtype:

    * ``object``  -> the string ``"0"``
    * ``float64`` -> ``0``
    * ``bool``    -> ``False``

    Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is NOT modified; the work is done on a copy so that
        re-running the calling cell (or reusing the raw frame) stays safe.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.
    """
    filled = df.copy()

    for column in filled.columns:

        dtype = filled[column].dtype

        if dtype == "object":
            filled[column] = filled[column].fillna(value="0")
        elif dtype == "float64":
            filled[column] = filled[column].fillna(value=0)
        elif dtype == "bool":
            filled[column] = filled[column].fillna(value=False)

    return filled

In [198]:
# fill the gaps in both tables with the same rules
train_df, test_df = (filter_missing(frame) for frame in (train_df, test_df))

In [199]:
# cast the flag columns to strings and the target to integers

flag_columns = ["CryoSleep", "VIP"]

train_df[flag_columns] = train_df[flag_columns].astype(str)
test_df[flag_columns] = test_df[flag_columns].astype(str)

# only the training table carries the target column
train_df["Transported"] = train_df["Transported"].astype(int)

In [200]:
# split into training and validation datasets

val_frac = .2
split_seed = 42  # fixed seed so the same rows land in the validation set on every run

# sample the validation rows, then remove exactly those rows (by index) from training
val_df = train_df.sample(frac=val_frac, random_state=split_seed)
train_df = train_df.drop(val_df.index)

In [201]:
# convert pd DataFrame to tf Dataset labelled with answers

def dataframe_to_dataset(df):
    """Turn a labelled DataFrame into a shuffled ``tf.data.Dataset``.

    The "Transported" column is split off (cast to int) and used as the label;
    the remaining columns become a dict of per-column features. Each dataset
    element is a ``(features_dict, label)`` pair. The input frame is copied
    first, so the caller's ``df`` keeps its "Transported" column.
    """
    features = df.copy()
    targets = features.pop("Transported").astype(int)

    # buffer as large as the table -> the whole dataset is shuffled
    n_rows = len(features)
    return tf.data.Dataset.from_tensor_slices((dict(features), targets)).shuffle(
        buffer_size=n_rows
    )

In [202]:
# build the train / validation datasets and group them into batches

batch_size = 32

train_ds = dataframe_to_dataset(train_df).batch(batch_size)
val_ds = dataframe_to_dataset(val_df).batch(batch_size)

In [203]:
# wrap the (unlabelled) test table as a dataset of per-column feature dicts
test_columns = dict(test_df)
test_ds = tf.data.Dataset.from_tensor_slices(test_columns)

### Encode input features

In [204]:
from tensorflow.keras.layers import StringLookup

def encode_categorical_feature(feature, name, feature_ds):
    """One-hot encode a string feature.

    A ``StringLookup`` layer in one-hot mode is adapted on ``feature_ds`` and
    then applied to ``feature``; the resulting tensor is returned.
    ``name`` is accepted for signature parity with the other encoders and is
    not used here.
    """
    string_index = StringLookup(output_mode="one_hot")
    string_index.adapt(feature_ds)  # learn the vocabulary from the data

    one_hot = string_index(feature)
    return one_hot

In [205]:
from tensorflow.keras.layers import Normalization

def encode_float_feature(feature, name, feature_ds):
    """Normalize a numeric feature.

    A ``Normalization`` layer is adapted on ``feature_ds`` and then applied to
    ``feature``; the resulting tensor is returned.
    ``name`` is accepted for signature parity with the other encoders and is
    not used here.
    """
    scaler = Normalization()
    scaler.adapt(feature_ds)  # learn the statistics from the data

    scaled = scaler(feature)
    return scaled

In [206]:
def encode_feature(feature, name, dataset):
    """Build the preprocessing for one input feature and apply it to ``feature``.

    Parameters
    ----------
    feature
        Symbolic input (e.g. a ``keras.Input``) the encoder is applied to.
    name : str
        Key of the feature inside the dataset's feature dict.
    dataset : tf.data.Dataset
        Dataset of ``(features_dict, label)`` elements used to adapt the encoder.

    Returns
    -------
    The encoded tensor: one-hot for string features, normalized for
    floating-point features.

    Raises
    ------
    TypeError
        If the feature's dtype is neither string nor floating point. Failing
        here names the offending feature instead of handing a placeholder
        value to the model-building code.
    """
    # construct dataset with only given feature, with a trailing axis of size 1
    feature_ds = dataset.map(lambda x, y: tf.expand_dims(x[name], -1))

    # build encoder based on feature dtype
    dtype = feature_ds.element_spec.dtype

    if dtype == tf.string:
        return encode_categorical_feature(feature, name, feature_ds)
    if dtype.is_floating:
        return encode_float_feature(feature, name, feature_ds)

    raise TypeError(
        f"Cannot encode feature {name!r}: unsupported dtype {dtype} "
        "(expected a string or floating-point feature)"
    )

### Build model from encoded features

In [207]:
# create one keras Input per feature, using the dtype reported by the dataset spec

features = train_ds.element_spec[0]

all_inputs = {
    name: keras.Input(shape=(1,), name=name, dtype=spec.dtype)
    for name, spec in features.items()
}

In [208]:
# attach a dedicated encoder to every input, adapted on the training dataset

encodings = {
    name: encode_feature(all_inputs[name], name, train_ds)
    for name in features
}

2023-07-05 17:22:42.701852: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_7' with dtype string and shape [6954]
	 [[{{node Placeholder/_7}}]]
2023-07-05 17:22:42.702849: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_6' with dtype double and shape [6954]
	 [[{{node Placeholder/_6}}]]
2023-07-05 17:22:43.051003: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_6' with dtype double and shape [6954

In [209]:
# build dense network from encoded features to the predicted probability of "Transported"

# pass a real list: a dict view is not guaranteed to be accepted as a list of tensors
all_features = layers.concatenate(list(encodings.values()))

x = layers.Dense(128, activation="relu")(all_features)
x = layers.Dropout(0.2)(x)

x = layers.Dense(32, activation="relu")(x)

# single sigmoid unit -> binary classification output in [0, 1]
output = layers.Dense(1, activation="sigmoid")(x)

inputs = list(all_inputs.values())
model = keras.Model(inputs, output)
model.compile("adam", "binary_crossentropy", metrics=["accuracy"])

In [210]:
# keras.utils.plot_model(model, rankdir="LR")

In [211]:
# train model; evaluate on the held-out validation split after every epoch

# keep the History object so the per-epoch metrics can be inspected afterwards
history = model.fit(train_ds, validation_data=val_ds, epochs=10)

Epoch 1/10


2023-07-05 17:22:52.939703: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_5' with dtype string and shape [6954]
	 [[{{node Placeholder/_5}}]]
2023-07-05 17:22:52.940680: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_4' with dtype string and shape [6954]
	 [[{{node Placeholder/_4}}]]


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fe13e6180d0>

### Make predictions for test data

In [220]:
# feed the test table to the model as a dict of per-column inputs
test_inputs = dict(test_df)
predictions = model.predict(test_inputs)



In [223]:
# Pair every passenger id with its predicted label.
# The sigmoid output is thresholded at 0.5 in one vectorised step; this selects the
# same rows as rounding each value, but yields booleans, the dtype the target has in
# the raw training data (NOTE(review): confirm against the competition's sample submission).
labelled_predictions = pd.DataFrame({
    "PassengerId": test_ids,
    "Transported": predictions[:, 0] > 0.5,
})

In [224]:
# write the submission file without the DataFrame index column
submission_path = "../submissions/first_submission.csv"
labelled_predictions.to_csv(submission_path, index=False)