# Spaceship Titanic Kaggle Competition

In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
import keras_tuner

from tensorflow import keras
from keras import layers

2023-07-11 16:18:44.894219: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-11 16:18:44.942853: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-11 16:18:44.943537: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Read data

In [48]:
# load the raw train and test CSV files into DataFrames

train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train.csv", "../data/test.csv")
)

In [49]:
# keep only the model inputs and split "Cabin" into its three parts

cabin_columns = ["Cabin_Deck", "Cabin_Num", "Cabin_Side"]
cabin_parts = train_df["Cabin"].str.split("/", expand=True)

# "Cabin" is replaced by its parts; "PassengerId" and "Name" are not used
train_df = train_df.drop(columns=["PassengerId", "Name", "Cabin"])
train_df[cabin_columns] = cabin_parts

In [50]:
# keep the passenger ids aside; they are needed to label the submission
test_ids = test_df["PassengerId"]

# split "Cabin" into its three parts, then drop the columns not fed to the model
cabin_columns = ["Cabin_Deck", "Cabin_Num", "Cabin_Side"]
cabin_parts = test_df["Cabin"].str.split("/", expand=True)

test_df = test_df.drop(columns=["Name", "PassengerId", "Cabin"])
test_df[cabin_columns] = cabin_parts

In [51]:
# cast the cabin number to float64 in both frames so it is handled
# as a numeric feature rather than as a string
for frame in (train_df, test_df):
    frame["Cabin_Num"] = frame["Cabin_Num"].astype("float64")

In [52]:
# fill missing values

def filter_missing(df):
    """Return a copy of ``df`` with missing values replaced per column dtype.

    The replacement value depends on the dtype of each column:

    * ``object``  -> the string ``"0"``
    * ``float64`` -> ``0.0``
    * ``bool``    -> ``False``

    Columns of any other dtype are left untouched.

    Args:
        df: DataFrame whose missing values should be filled. It is not
            modified; all filling happens on a copy.

    Returns:
        A new DataFrame with the same columns as ``df``.
    """
    # work on a copy so the caller's frame is never silently mutated
    filled = df.copy()

    # dtype name -> value used to replace missing entries of that dtype
    fill_values = {"object": "0", "float64": 0.0, "bool": False}

    for column in filled.columns:
        dtype = filled[column].dtype

        for dtype_name, fill_value in fill_values.items():
            if dtype == dtype_name:
                filled[column] = filled[column].fillna(value=fill_value)
                break

    return filled

In [53]:
# replace the missing values in both frames
train_df, test_df = (filter_missing(frame) for frame in (train_df, test_df))

In [54]:
# cast the CryoSleep / VIP flags to strings in both frames

flag_columns = ["CryoSleep", "VIP"]
for frame in (train_df, test_df):
    frame[flag_columns] = frame[flag_columns].astype(str)

# the label column is cast to an integer
train_df["Transported"] = train_df["Transported"].astype(int)

In [55]:
# split into training and validation datasets

val_frac = .2
val_seed = 42  # fixed seed so the split is identical on every run

# hold out `val_frac` of the rows for validation; the remaining rows
# (everything whose index was not sampled) stay in the training frame
val_df = train_df.sample(frac=val_frac, random_state=val_seed)
train_df = train_df.drop(val_df.index)

In [56]:
# persist the three prepared frames as pickle files

for frame, pickle_path in (
    (train_df, "../data/train_df.pkl"),
    (val_df, "../data/val_df.pkl"),
    (test_df, "../data/test_df.pkl"),
):
    frame.to_pickle(pickle_path)

In [57]:
# reload the prepared frames from their pickle files
train_df, val_df, test_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "../data/train_df.pkl",
        "../data/val_df.pkl",
        "../data/test_df.pkl",
    )
)

In [58]:
# convert pd DataFrame to tf Dataset labelled with answers

def dataframe_to_dataset(df):
    """Build a shuffled ``tf.data.Dataset`` of ``(features, label)`` pairs.

    The ``"Transported"`` column is cast to ``int`` and used as the label;
    every remaining column becomes one entry of the feature dict. The
    frame passed in is left unchanged.

    Args:
        df: DataFrame that contains a ``"Transported"`` column.

    Returns:
        An unbatched dataset, shuffled with a buffer as large as the
        number of rows in ``df``.
    """
    feature_columns = df.drop(columns="Transported")
    targets = df["Transported"].astype(int)

    pairs = tf.data.Dataset.from_tensor_slices((dict(feature_columns), targets))

    return pairs.shuffle(buffer_size=len(feature_columns))

In [59]:
# build the train and val datasets and group them into batches

batch_size = 32
train_ds = dataframe_to_dataset(train_df).batch(batch_size)
val_ds = dataframe_to_dataset(val_df).batch(batch_size)

In [60]:
# build an unlabelled dataset from the test frame, one entry per column
test_features = {column: test_df[column] for column in test_df.columns}
test_ds = tf.data.Dataset.from_tensor_slices(test_features)

### Encode input features

In [6]:
from tensorflow.keras.layers import StringLookup

def encode_categorical_feature(feature, name, feature_ds):
    """One-hot encode ``feature`` with a ``StringLookup`` adapted on ``feature_ds``.

    Args:
        feature: input the adapted lookup layer is applied to.
        name: feature name; accepted for a uniform encoder signature but
            not used here.
        feature_ds: dataset the lookup layer is adapted on.

    Returns:
        The result of calling the adapted lookup layer on ``feature``.
    """
    one_hot_lookup = StringLookup(output_mode="one_hot")
    one_hot_lookup.adapt(feature_ds)

    encoded = one_hot_lookup(feature)
    return encoded

In [7]:
from tensorflow.keras.layers import Normalization

def encode_float_feature(feature, name, feature_ds):
    """Normalize ``feature`` with a ``Normalization`` layer adapted on ``feature_ds``.

    Args:
        feature: input the adapted normalization layer is applied to.
        name: feature name; accepted for a uniform encoder signature but
            not used here.
        feature_ds: dataset the normalization layer is adapted on.

    Returns:
        The result of calling the adapted normalization layer on ``feature``.
    """
    scaler = Normalization()
    scaler.adapt(feature_ds)

    scaled = scaler(feature)
    return scaled

In [8]:
def encode_feature(feature, name, dataset):
    """Encode one input feature with an encoder chosen by the feature's dtype.

    ``tf.string`` features go through ``encode_categorical_feature`` and
    ``tf.float64`` features through ``encode_float_feature``.

    Args:
        feature: input the selected encoder is applied to.
        name: key of the feature inside the feature dict of ``dataset``.
        dataset: dataset of ``(features, label)`` elements; the encoder is
            adapted on the values stored under ``name``.

    Returns:
        The encoded feature produced by the selected encoder.

    Raises:
        TypeError: if the feature's dtype is neither ``tf.string`` nor
            ``tf.float64``.
    """
    # construct dataset with only the given feature, adding a trailing axis
    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

    # pick the encoder based on the feature dtype
    dtype = feature_ds.element_spec.dtype

    if dtype == tf.string:
        return encode_categorical_feature(feature, name, feature_ds)
    if dtype == tf.float64:
        return encode_float_feature(feature, name, feature_ds)

    # fail here, naming the offending feature, instead of handing back a
    # placeholder value that would only break later when the model is built
    raise TypeError(
        f"Unexpected datatype for feature {name!r}: {dtype} "
        "(supported: tf.string, tf.float64)"
    )

### Build model from encoded features

In [9]:
# create one Keras input per feature, taking names and dtypes
# from the feature part of the training dataset's element spec

features = train_ds.element_spec[0]

all_inputs = {
    name: keras.Input(shape=(1,), name=name, dtype=spec.dtype)
    for name, spec in features.items()
}

In [10]:
# encode every input feature, adapting each encoder on the training dataset

encodings = {
    name: encode_feature(all_inputs[name], name, train_ds)
    for name in features
}

2023-07-11 16:19:00.649242: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_2' with dtype double and shape [6954]
	 [[{{node Placeholder/_2}}]]
2023-07-11 16:19:00.649669: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_6' with dtype double and shape [6954]
	 [[{{node Placeholder/_6}}]]
2023-07-11 16:19:00.952667: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_7' with dtype string and shape [6954

In [40]:
# build dense network with tunable hyperparameters

def build_model(hp):
    """Build and compile a dense binary classifier over the encoded features.

    The network concatenates every tensor in the module-level ``encodings``
    dict, applies a dropout of 0.1, then ``num_layers`` blocks of
    ``Dense(relu)`` followed by a dropout of 0.4, and ends in a single
    sigmoid unit. The model inputs are the tensors in the module-level
    ``all_inputs`` dict.

    Args:
        hp: hyperparameter container providing ``Int``; it supplies
            ``num_layers`` (1 to 3) and ``units_{i}`` for each layer
            (from 16 up to at most 256, in steps of 32).

    Returns:
        A ``keras.Model`` compiled with Adam (learning rate 0.005),
        binary cross-entropy loss and the accuracy metric.
    """
    # concatenate takes a list of tensors, so materialise the dict view
    x = layers.concatenate(list(encodings.values()))

    x = layers.Dropout(rate=.1)(x)

    for i in range(hp.Int("num_layers", 1, 3)):
        x = layers.Dense(
            units=hp.Int(f"units_{i}", min_value=16, max_value=256, step=32),
            activation="relu"
        )(x)

        x = layers.Dropout(rate=.4)(x)

    inputs = list(all_inputs.values())
    output = layers.Dense(1, activation="sigmoid")(x)

    model = keras.Model(inputs, output)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=.005),
        loss="binary_crossentropy",
        metrics=["accuracy"]
    )

    return model

### Tune hyperparameters

In [41]:
# configure the Bayesian hyperparameter search
#   build_model exposes the number of dense layers (1-3)
#   and the units per layer (16-240, in steps of 32)

tuner_settings = dict(
    hypermodel=build_model,
    objective="val_accuracy",
    max_trials=25,
    executions_per_trial=1,
    overwrite=True,
    directory="hyperparameters",
    project_name="spaceship-titanic",
)
tuner = keras_tuner.BayesianOptimization(**tuner_settings)

In [43]:
# run the hyperparameter search, logging to the "tb_logs" TensorBoard directory

tensorboard_logger = keras.callbacks.TensorBoard("tb_logs")
tuner.search(
    train_ds,
    epochs=5,
    validation_data=val_ds,
    callbacks=[tensorboard_logger],
)

Trial 25 Complete [00h 00m 06s]
val_accuracy: 0.8004600405693054

Best val_accuracy So Far: 0.8004600405693054
Total elapsed time: 00h 02m 48s
INFO:tensorflow:Oracle triggered exit


In [44]:
# print the hyperparameters and score of the top-scoring trials
tuner.results_summary()

Results summary
Results in hyperparameters/spaceship-titanic
Showing 10 best trials
Objective(name="val_accuracy", direction="max")

Trial 02 summary
Hyperparameters:
num_layers: 2
units_0: 208
units_1: 176
units_2: 112
Score: 0.8004600405693054

Trial 24 summary
Hyperparameters:
num_layers: 2
units_0: 112
units_1: 16
units_2: 208
Score: 0.8004600405693054

Trial 16 summary
Hyperparameters:
num_layers: 2
units_0: 112
units_1: 16
units_2: 208
Score: 0.7998849749565125

Trial 07 summary
Hyperparameters:
num_layers: 2
units_0: 80
units_1: 208
units_2: 16
Score: 0.7975848317146301

Trial 18 summary
Hyperparameters:
num_layers: 2
units_0: 176
units_1: 144
units_2: 112
Score: 0.7975848317146301

Trial 23 summary
Hyperparameters:
num_layers: 2
units_0: 112
units_1: 16
units_2: 208
Score: 0.7970097661018372

Trial 03 summary
Hyperparameters:
num_layers: 3
units_0: 208
units_1: 144
units_2: 208
Score: 0.796434760093689

Trial 09 summary
Hyperparameters:
num_layers: 1
units_0: 176
units_1: 16
un

### Train model using best hyperparameters on full dataset

In [45]:
# rebuild the model from the top-ranked hyperparameter set
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
model = build_model(best_hps)

# train on the training batches followed by the validation batches
full_train_ds = train_ds.concatenate(val_ds)
model.fit(full_train_ds, epochs=10)

Epoch 1/10


2023-07-11 16:54:53.836566: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_9' with dtype double and shape [6954]
	 [[{{node Placeholder/_9}}]]
2023-07-11 16:54:53.837468: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_31' with dtype double and shape [1739]
	 [[{{node Placeholder/_31}}]]


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fd2d06d58d0>

### Make predictions for test data

In [46]:
# run the trained model on the test frame, passed as a dict keyed by column name
predictions = model.predict(dict(test_df))



In [63]:
# round the first value of each prediction row to a boolean and
# pair it with the matching test passenger id
transported_flags = [bool(round(row[0])) for row in predictions]

labelled_predictions = pd.DataFrame(
    {"PassengerId": test_ids, "Transported": transported_flags}
)

In [64]:
# write the labelled predictions to CSV, leaving out the DataFrame index
labelled_predictions.to_csv("../submissions/tuned_single.csv", index=False)