# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.regularizers import L2
from tensorflow.keras.losses import BinaryCrossentropy
from keras.callbacks import Callback

2023-07-21 10:26:05.526577: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load data

In [2]:
df = pd.read_csv("../data/train.csv")

In [3]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
df.shape

(8693, 14)

In [5]:
df.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

# Cleanup and transform the data


In [6]:
def transform_data(df, test_data=False):
    df.drop('Name', axis=1, inplace=True)
    if(test_data):
        df.dropna(subset=['Cabin', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Age'], inplace=True)
    else:
        df.dropna(subset=['Transported', 'Cabin', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Age'], inplace=True)
    # df['RoomService'].fillna(df['RoomService'].median(), inplace=True)
    # df['FoodCourt'].fillna(df['FoodCourt'].median(), inplace=True)
    # df['ShoppingMall'].fillna(df['ShoppingMall'].median(), inplace=True)
    # df['Spa'].fillna(df['Spa'].median(), inplace=True)
    # df['VRDeck'].fillna(df['VRDeck'].median(), inplace=True)
    # df['Age'].fillna(df['VRDeck'].mean(), inplace=True)
    # df['RoomService'].fillna(-1, inplace=True)
    # df['FoodCourt'].fillna(-1, inplace=True)
    # df['ShoppingMall'].fillna(-1, inplace=True)
    # df['Spa'].fillna(-1, inplace=True)
    # df['VRDeck'].fillna(-1, inplace=True)
    # df['Age'].fillna(-1, inplace=True)
    one_hot_home = pd.get_dummies(df['HomePlanet'], prefix='HomePlanet_').astype(int)
    df.drop('HomePlanet', axis=1, inplace=True)
    df = df.join(one_hot_home)
    one_hot_dest = pd.get_dummies(df['Destination'], prefix='Destination_').astype(int)
    df.drop('Destination', axis=1, inplace=True)
    df = df.join(one_hot_dest)
    df[['Deck', 'Num', 'Side']] = df['Cabin'].str.split('/', expand=True)
    df.drop('Cabin', axis=1, inplace=True)
    df['Num'] = pd.to_numeric(df['Num'], errors='coerce')
    one_hot_deck = pd.get_dummies(df['Deck'], prefix='Deck').astype(int)
    df.drop('Deck', axis=1, inplace=True)
    df = df.join(one_hot_deck)
    one_hot_side = pd.get_dummies(df['Side'], prefix='Side_').astype(int)
    df.drop('Side', axis=1, inplace=True)
    df = df.join(one_hot_side)
    one_hot_vip = pd.get_dummies(df['VIP'], prefix='VIP_').astype(int)
    df.drop('VIP', axis=1, inplace=True)
    df = df.join(one_hot_vip)
    one_hot_sleep = pd.get_dummies(df['CryoSleep'], prefix='CryoSleep_').astype(int)
    df.drop('CryoSleep', axis=1, inplace=True)
    df = df.join(one_hot_sleep)
    if(test_data == False):
        df['Transported'] = pd.to_numeric(df['Transported'], errors='coerce')
        col_transported = df['Transported'].astype(int)
        df.drop(columns='Transported', inplace=True)
        df.insert(0, 'Transported', col_transported)
    # Polynomials
    df['RoomService^2'] = pow(df['RoomService'], 2)
    df['RoomService^3'] = pow(df['RoomService'], 3)
    df['RoomService^4'] = pow(df['RoomService'], 4)
    df['RoomService^5'] = pow(df['RoomService'], 5)
    df['FoodCourt^2'] = pow(df['FoodCourt'], 2)
    df['FoodCourt^3'] = pow(df['FoodCourt'], 3)
    df['FoodCourt^4'] = pow(df['FoodCourt'], 4)
    df['FoodCourt^5'] = pow(df['FoodCourt'], 5)
    df['ShoppingMall^2'] = pow(df['ShoppingMall'], 2)
    df['ShoppingMall^3'] = pow(df['ShoppingMall'], 3)
    df['ShoppingMall^4'] = pow(df['ShoppingMall'], 4)
    df['ShoppingMall^5'] = pow(df['ShoppingMall'], 5)
    df['Spa^2'] = pow(df['Spa'], 2)
    df['Spa^3'] = pow(df['Spa'], 3)
    df['Spa^4'] = pow(df['Spa'], 4)
    df['Spa^5'] = pow(df['Spa'], 5)
    df['VRDeck^2'] = pow(df['VRDeck'], 2)
    df['VRDeck^3'] = pow(df['VRDeck'], 3)
    df['VRDeck^4'] = pow(df['VRDeck'], 4)
    df['VRDeck^5'] = pow(df['VRDeck'], 5)
    # Sum
    df['sum-1'] = df['VRDeck'] + df['Spa']
    df['sum-2'] = df['VRDeck'] + df['ShoppingMall']
    df['sum-3'] = df['VRDeck'] + df['FoodCourt']
    df['sum-4'] = df['VRDeck'] + df['RoomService']
    df['sum-5'] = df['Spa'] + df['ShoppingMall']
    df['sum-6'] = df['Spa'] + df['FoodCourt']
    df['sum-7'] = df['Spa'] + df['RoomService']
    df['sum-8'] = df['ShoppingMall'] + df['FoodCourt']
    df['sum-9'] = df['ShoppingMall'] + df['RoomService']
    df['sum-10'] = df['FoodCourt'] + df['RoomService']
    df['sum-11'] = df['VRDeck'] + df['Spa'] + df['ShoppingMall'] + df['FoodCourt'] + df['RoomService']
    return df

In [7]:
df = transform_data(df)

In [8]:
df.shape

(7441, 60)

In [9]:
df.head()

Unnamed: 0,Transported,PassengerId,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet__Earth,HomePlanet__Europa,...,sum-2,sum-3,sum-4,sum-5,sum-6,sum-7,sum-8,sum-9,sum-10,sum-11
0,0,0001_01,39.0,0.0,0.0,0.0,0.0,0.0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0002_01,24.0,109.0,9.0,25.0,549.0,44.0,1,0,...,69.0,53.0,153.0,574.0,558.0,658.0,34.0,134.0,118.0,736.0
2,0,0003_01,58.0,43.0,3576.0,0.0,6715.0,49.0,0,1,...,49.0,3625.0,92.0,6715.0,10291.0,6758.0,3576.0,43.0,3619.0,10383.0
3,0,0003_02,33.0,0.0,1283.0,371.0,3329.0,193.0,0,1,...,564.0,1476.0,193.0,3700.0,4612.0,3329.0,1654.0,371.0,1283.0,5176.0
4,1,0004_01,16.0,303.0,70.0,151.0,565.0,2.0,1,0,...,153.0,72.0,305.0,716.0,635.0,868.0,221.0,454.0,373.0,1091.0


In [10]:
df.dtypes

Transported                     int64
PassengerId                    object
Age                           float64
RoomService                   float64
FoodCourt                     float64
ShoppingMall                  float64
Spa                           float64
VRDeck                        float64
HomePlanet__Earth               int64
HomePlanet__Europa              int64
HomePlanet__Mars                int64
Destination__55 Cancri e        int64
Destination__PSO J318.5-22      int64
Destination__TRAPPIST-1e        int64
Num                             int64
Deck_A                          int64
Deck_B                          int64
Deck_C                          int64
Deck_D                          int64
Deck_E                          int64
Deck_F                          int64
Deck_G                          int64
Deck_T                          int64
Side__P                         int64
Side__S                         int64
VIP__False                      int64
VIP__True   

# Split data into Training and Cross Validation sets

In [11]:
# Shuffle data & create a DF without the id
df.sample(frac=1).reset_index(drop=True)

# Create new dataframe without passenger id
df_no_passenger_id = df.drop('PassengerId', axis=1)

In [12]:
def normalize_data(df):
    # split dataframe into float and non-float columns
    df_float = df.select_dtypes(include=['float64'])
    df_int = df.select_dtypes(exclude=['float64'])

    # convert float dataframe to numpy array
    float_array = df_float.to_numpy()

    # normalize the float numpy array
    normalizer = tf.keras.layers.Normalization(axis=-1)
    normalizer.adapt(float_array)
    normalized_array = normalizer(float_array)

    # convert normalized numpy array back to dataframe
    df_normalized = pd.DataFrame(normalized_array, index=df_float.index, columns=df_float.columns)

    # concatenate normalized dataframe back with int dataframe
    df_new = pd.concat([df_int, df_normalized], axis=1)
    return df_new

In [13]:
df_new = normalize_data(df_no_passenger_id)

In [14]:
df_new.head()

Unnamed: 0,Transported,HomePlanet__Earth,HomePlanet__Europa,HomePlanet__Mars,Destination__55 Cancri e,Destination__PSO J318.5-22,Destination__TRAPPIST-1e,Num,Deck_A,Deck_B,...,sum-2,sum-3,sum-4,sum-5,sum-6,sum-7,sum-8,sum-9,sum-10,sum-11
0,0,0,1,0,0,0,1,0,0,1,...,-0.374747,-0.353759,-0.411028,-0.374802,-0.356861,-0.407607,-0.368794,-0.434729,-0.3949,-0.523452
1,1,1,0,0,0,0,1,0,0,0,...,-0.32045,-0.329515,-0.291269,0.066224,-0.103169,0.092123,-0.349428,-0.288823,-0.327984,-0.264056
2,0,0,1,0,0,0,1,0,1,0,...,-0.336188,1.304476,-0.339016,4.784596,4.321876,4.724882,1.668039,-0.387908,1.657371,3.135938
3,0,0,1,0,0,0,1,0,1,0,...,0.069073,0.321428,-0.259959,2.468053,1.739955,2.120664,0.573298,-0.030766,0.332667,1.30078
4,1,1,0,0,0,0,1,1,0,0,...,-0.254349,-0.320823,-0.172292,0.175329,-0.068162,0.251612,-0.242916,0.059609,-0.183378,-0.138939


In [15]:
Y = df_new.to_numpy()[:, 0, np.newaxis]
X = df_new.to_numpy()[:, 1:]

X_train, X_cv, Y_train, Y_cv = train_test_split(X, Y, test_size=0.20, random_state=1)

In [16]:
X_train.shape

(5952, 58)

In [17]:
Y_train.shape

(5952, 1)

In [18]:
X_cv.shape

(1489, 58)

In [19]:
Y_cv.shape

(1489, 1)

# Build a Neural Network

In [20]:
# Create a custom callback to display output every x epochs
class EveryXEpochs(Callback):
    def __init__(self, x):
        super(EveryXEpochs, self).__init__()
        self.x = x

    def on_epoch_end(self, epoch, logs=None):
        if (epoch + 1) % self.x == 0:
            print(f"Epoch {epoch + 1}: loss = {logs['loss']:.4f}, acc = {logs['accuracy']:.4f}")

In [21]:
verbose_callback = EveryXEpochs(x=100)

In [23]:
layer_1 = Dense(units=64,  activation='relu',    name='layer1', kernel_regularizer=L2(0.0001))
layer_2 = Dense(units=32,  activation='relu',    name='layer2', kernel_regularizer=L2(0.0001))
layer_3 = Dense(units=64,  activation='relu',    name='layer3', kernel_regularizer=L2(0.0001))
layer_4 = Dense(units=32,  activation='relu',    name='layer4', kernel_regularizer=L2(0.0001))
layer_5 = Dense(units=1,   activation='linear',  name='output', kernel_regularizer=L2(0.0001)) 

# applied to achieve consistent results
tf.random.set_seed(1234)  

model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    layer_1, 
    layer_2,
    layer_3,
    layer_4,
    layer_5
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),
    loss=BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'] 
)

model.fit(X_train, Y_train, epochs=1000, callbacks=[verbose_callback], verbose=0)

Epoch 100: loss = 0.4527, acc = 0.7829
Epoch 200: loss = 0.4411, acc = 0.7863
Epoch 300: loss = 0.4330, acc = 0.7893
Epoch 400: loss = 0.4279, acc = 0.7893
Epoch 500: loss = 0.4213, acc = 0.7912
Epoch 600: loss = 0.4210, acc = 0.7928
Epoch 700: loss = 0.4138, acc = 0.7932
Epoch 800: loss = 0.4076, acc = 0.7960
Epoch 900: loss = 0.4099, acc = 0.7969
Epoch 1000: loss = 0.4047, acc = 0.8001


<keras.src.callbacks.History at 0x13dd6ab90>

# Use model to predict

In [24]:
# Inference - use model to predict
result_train = np.where(tf.nn.sigmoid(model.predict(X_train)) > 0.5, 1, 0)
result_cv    = np.where(tf.nn.sigmoid(model.predict(X_cv)) > 0.5, 1, 0)



In [25]:
r_train = np.concatenate((result_train, Y_train), axis=1)
r_cv    = np.concatenate((result_cv, Y_cv), axis=1)

In [26]:
diff_train = r_train[:, 1] - r_train[:, 0]
diff_cv    = r_cv[:, 1] - r_cv[:, 0]

# count the number of zeros
J_train = 100 - (np.count_nonzero(diff_train == 0) / r_train.shape[0]) * 100
J_cv    = 100 - (np.count_nonzero(diff_cv == 0) / r_cv.shape[0]) * 100

In [27]:
print(J_train, J_cv)

18.313172043010752 19.140362659503012


# Create output file with Test Data predictions

In [42]:
df_test = pd.read_csv("../data/test.csv")
df_test = transform_data(df_test, test_data=True)
df.head()

Unnamed: 0,PassengerId,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet__Earth,HomePlanet__Europa,HomePlanet__Mars,...,sum-2,sum-3,sum-4,sum-5,sum-6,sum-7,sum-8,sum-9,sum-10,sum-11
0,0013_01,27.0,0.0,0.0,0.0,0.0,0.0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0018_01,19.0,0.0,9.0,0.0,2823.0,0.0,1,0,0,...,0.0,9.0,0.0,2823.0,2832.0,2823.0,9.0,0.0,9.0,2832.0
2,0019_01,31.0,0.0,0.0,0.0,0.0,0.0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0021_01,38.0,0.0,6652.0,0.0,181.0,585.0,0,1,0,...,585.0,7237.0,585.0,181.0,6833.0,181.0,6652.0,0.0,6652.0,7418.0
4,0023_01,20.0,10.0,0.0,635.0,0.0,0.0,1,0,0,...,635.0,0.0,10.0,635.0,0.0,10.0,635.0,645.0,10.0,645.0


In [49]:
df_no_passenger_id_test = df_test.drop('PassengerId', axis=1)
df_new_test = normalize_data(df_no_passenger_id_test)
X_test = df_new_test.to_numpy()
result_test = np.where(tf.nn.sigmoid(model.predict(X_test)) > 0.5, 1, 0)
results_for_submission = np.concatenate((df_test['PassengerId'].to_numpy().reshape(-1, 1), np.where(result_test == 1, True, False)), axis=1)



In [50]:
df = pd.DataFrame(results_for_submission, columns=['PassengerId', 'Transported'])

In [51]:
df.to_csv("../data/submission.csv", index=False)