# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.regularizers import L2
from tensorflow.keras.losses import BinaryCrossentropy

2023-07-20 17:28:25.946761: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load data

In [2]:
df = pd.read_csv("../data/train.csv")

In [3]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
df.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

# Cleanup and transform the data
* First we can drop the following columns since those cannot possibly contribute to being transported to alternate dimension:
  * Name
  * HomePlanet
  * Destination
  * Age
  * VIP
* Next, we can get rid of a row if the following data is missing, because those columns tell us something about the possible location of a passenger on the ship:
  * PassengerId
  * CryoSleep
  * Cabin
  * RoomService
  * FoodCourt
  * ShoppingMall
  * Spa
  * VRDeck
* We can also get rid of a row if we do not know whether or not the passenger was Transported.
* We will create a couple of new columns:
  * Deck, Num and IsPort will replace Cabin
  * Group will be a new column based om the digits before _ character in PassengerId column
* Non-numeric values will be mapped to numeric values

In [5]:
df.shape

(8693, 14)

In [6]:
def transform_data(df):
    df.drop('Name', axis=1, inplace=True)
    df.dropna(subset=['Transported', 'Cabin'], inplace=True)
    df['RoomService'].fillna(df['RoomService'].median(), inplace=True)
    df['FoodCourt'].fillna(df['FoodCourt'].median(), inplace=True)
    df['ShoppingMall'].fillna(df['ShoppingMall'].median(), inplace=True)
    df['Spa'].fillna(df['Spa'].median(), inplace=True)
    df['VRDeck'].fillna(df['VRDeck'].median(), inplace=True)
    df['Age'].fillna(df['VRDeck'].mean(), inplace=True)
    one_hot_home = pd.get_dummies(df['HomePlanet'], prefix='HomePlanet_').astype(int)
    df.drop('HomePlanet', axis=1, inplace=True)
    df = df.join(one_hot_home)
    one_hot_dest = pd.get_dummies(df['Destination'], prefix='Destination_').astype(int)
    df.drop('Destination', axis=1, inplace=True)
    df = df.join(one_hot_dest)
    df[['Deck', 'Num', 'Side']] = df['Cabin'].str.split('/', expand=True)
    df.drop('Cabin', axis=1, inplace=True)
    df['Num'] = pd.to_numeric(df['Num'], errors='coerce')
    one_hot_deck = pd.get_dummies(df['Deck'], prefix='Deck').astype(int)
    df.drop('Deck', axis=1, inplace=True)
    df = df.join(one_hot_deck)
    one_hot_side = pd.get_dummies(df['Side'], prefix='Side_').astype(int)
    df.drop('Side', axis=1, inplace=True)
    df = df.join(one_hot_side)
    one_hot_vip = pd.get_dummies(df['VIP'], prefix='VIP_').astype(int)
    df.drop('VIP', axis=1, inplace=True)
    df = df.join(one_hot_vip)
    one_hot_sleep = pd.get_dummies(df['CryoSleep'], prefix='CryoSleep_').astype(int)
    df.drop('CryoSleep', axis=1, inplace=True)
    df = df.join(one_hot_sleep)
    df['Transported'] = pd.to_numeric(df['Transported'], errors='coerce')
    col_transported = df['Transported'].astype(int)
    df.drop(columns='Transported', inplace=True)
    df.insert(0, 'Transported', col_transported)
    # Polynomials
    df['RoomService^2'] = pow(df['RoomService'], 2)
    df['RoomService^3'] = pow(df['RoomService'], 3)
    df['RoomService^4'] = pow(df['RoomService'], 4)
    df['RoomService^5'] = pow(df['RoomService'], 5)
    df['FoodCourt^2'] = pow(df['FoodCourt'], 2)
    df['FoodCourt^3'] = pow(df['FoodCourt'], 3)
    df['FoodCourt^4'] = pow(df['FoodCourt'], 4)
    df['FoodCourt^5'] = pow(df['FoodCourt'], 5)
    df['ShoppingMall^2'] = pow(df['ShoppingMall'], 2)
    df['ShoppingMall^3'] = pow(df['ShoppingMall'], 3)
    df['ShoppingMall^4'] = pow(df['ShoppingMall'], 4)
    df['ShoppingMall^5'] = pow(df['ShoppingMall'], 5)
    df['Spa^2'] = pow(df['Spa'], 2)
    df['Spa^3'] = pow(df['Spa'], 3)
    df['Spa^4'] = pow(df['Spa'], 4)
    df['Spa^5'] = pow(df['Spa'], 5)
    df['VRDeck^2'] = pow(df['VRDeck'], 2)
    df['VRDeck^3'] = pow(df['VRDeck'], 3)
    df['VRDeck^4'] = pow(df['VRDeck'], 4)
    df['VRDeck^5'] = pow(df['VRDeck'], 5)
    # Sum
    df['sum-1'] = df['VRDeck'] + df['Spa']
    df['sum-2'] = df['VRDeck'] + df['ShoppingMall']
    df['sum-3'] = df['VRDeck'] + df['FoodCourt']
    df['sum-4'] = df['VRDeck'] + df['RoomService']
    df['sum-5'] = df['Spa'] + df['ShoppingMall']
    df['sum-6'] = df['Spa'] + df['FoodCourt']
    df['sum-7'] = df['Spa'] + df['RoomService']
    df['sum-8'] = df['ShoppingMall'] + df['FoodCourt']
    df['sum-9'] = df['ShoppingMall'] + df['RoomService']
    df['sum-10'] = df['FoodCourt'] + df['RoomService']
    df['sum-11'] = df['VRDeck'] + df['Spa'] + df['ShoppingMall'] + df['FoodCourt'] + df['RoomService']
    return df

In [7]:
df = transform_data(df)

In [8]:
df.shape

(8494, 60)

In [9]:
df.head()

Unnamed: 0,Transported,PassengerId,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet__Earth,HomePlanet__Europa,...,sum-2,sum-3,sum-4,sum-5,sum-6,sum-7,sum-8,sum-9,sum-10,sum-11
0,0,0001_01,39.0,0.0,0.0,0.0,0.0,0.0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0002_01,24.0,109.0,9.0,25.0,549.0,44.0,1,0,...,69.0,53.0,153.0,574.0,558.0,658.0,34.0,134.0,118.0,736.0
2,0,0003_01,58.0,43.0,3576.0,0.0,6715.0,49.0,0,1,...,49.0,3625.0,92.0,6715.0,10291.0,6758.0,3576.0,43.0,3619.0,10383.0
3,0,0003_02,33.0,0.0,1283.0,371.0,3329.0,193.0,0,1,...,564.0,1476.0,193.0,3700.0,4612.0,3329.0,1654.0,371.0,1283.0,5176.0
4,1,0004_01,16.0,303.0,70.0,151.0,565.0,2.0,1,0,...,153.0,72.0,305.0,716.0,635.0,868.0,221.0,454.0,373.0,1091.0


In [10]:
df.dtypes

Transported                     int64
PassengerId                    object
Age                           float64
RoomService                   float64
FoodCourt                     float64
ShoppingMall                  float64
Spa                           float64
VRDeck                        float64
HomePlanet__Earth               int64
HomePlanet__Europa              int64
HomePlanet__Mars                int64
Destination__55 Cancri e        int64
Destination__PSO J318.5-22      int64
Destination__TRAPPIST-1e        int64
Num                             int64
Deck_A                          int64
Deck_B                          int64
Deck_C                          int64
Deck_D                          int64
Deck_E                          int64
Deck_F                          int64
Deck_G                          int64
Deck_T                          int64
Side__P                         int64
Side__S                         int64
VIP__False                      int64
VIP__True   

# Split data into Training and Cross Validation sets

In [11]:
# Shuffle data & create a DF without the id
df.sample(frac=1).reset_index(drop=True)

# Create new dataframe without passenger id
df_no_passenger_id = df.drop('PassengerId', axis=1)

In [12]:
# split dataframe into float and non-float columns
df_float = df_no_passenger_id.select_dtypes(include=['float64'])
df_int = df_no_passenger_id.select_dtypes(exclude=['float64'])

# convert float dataframe to numpy array
float_array = df_float.to_numpy()

# normalize the float numpy array
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(float_array)
normalized_array = normalizer(float_array)

# convert normalized numpy array back to dataframe
df_normalized = pd.DataFrame(normalized_array, index=df_float.index, columns=df_float.columns)

# concatenate normalized dataframe back with int dataframe
df_new = pd.concat([df_int, df_normalized], axis=1)

In [13]:
df_new.head()

Unnamed: 0,Transported,HomePlanet__Earth,HomePlanet__Europa,HomePlanet__Mars,Destination__55 Cancri e,Destination__PSO J318.5-22,Destination__TRAPPIST-1e,Num,Deck_A,Deck_B,...,sum-2,sum-3,sum-4,sum-5,sum-6,sum-7,sum-8,sum-9,sum-10,sum-11
0,0,0,1,0,0,0,1,0,0,1,...,-0.366447,-0.346492,-0.400146,-0.370638,-0.35055,-0.402966,-0.363414,-0.432204,-0.388922,-0.514771
1,1,1,0,0,0,0,1,0,0,0,...,-0.312646,-0.321938,-0.281688,0.081481,-0.089213,0.109485,-0.343448,-0.282858,-0.319986,-0.250892
2,0,0,1,0,0,0,1,0,1,0,...,-0.32824,1.332932,-0.328916,4.91853,4.469211,4.860165,1.736533,-0.384279,1.72533,3.207858
3,0,0,1,0,0,0,1,0,1,0,...,0.073323,0.337323,-0.250719,2.543721,1.809467,2.18966,0.60787,-0.018716,0.360618,1.340986
4,1,1,0,0,0,0,1,1,0,0,...,-0.247148,-0.313135,-0.164005,0.19333,-0.05315,0.273033,-0.233636,0.073789,-0.171012,-0.123614


In [14]:
Y = df_new.to_numpy()[:, 0, np.newaxis]
X = df_new.to_numpy()[:, 1:]

X_train, X_cv, Y_train, Y_cv = train_test_split(X, Y, test_size=0.20, random_state=1)

In [15]:
X_train.shape

(6795, 58)

In [16]:
Y_train.shape

(6795, 1)

In [17]:
X_cv.shape

(1699, 58)

In [18]:
Y_cv.shape

(1699, 1)

# Build a Neural Network

In [19]:
layer_1 = Dense(units=64,  activation='relu',    name='layer1', kernel_regularizer=L2(0.00001))
layer_2 = Dense(units=128, activation='relu',    name='layer2', kernel_regularizer=L2(0.00001))
layer_3 = Dense(units=32,  activation='relu',    name='layer3', kernel_regularizer=L2(0.00001))
layer_4 = Dense(units=16,  activation='relu',    name='layer4', kernel_regularizer=L2(0.00001))
layer_5 = Dense(units=1,   activation='linear',  name='output', kernel_regularizer=L2(0.00001)) 

# applied to achieve consistent results
tf.random.set_seed(1234)  

model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    layer_1, 
    layer_2,
    layer_3,
    layer_4,
    layer_5
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=BinaryCrossentropy(from_logits=True)
)

model.fit(X_train, Y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x1458996d0>

# Use model to predict

In [24]:
# Inference - use model to predict
result_train = np.where(tf.nn.sigmoid(model.predict(X_train)) > 0.5, 1, 0)
result_cv    = np.where(tf.nn.sigmoid(model.predict(X_cv)) > 0.5, 1, 0)



In [25]:
r_train = np.concatenate((result_train, Y_train), axis=1)
r_cv    = np.concatenate((result_cv, Y_cv), axis=1)

In [26]:
diff_train = r_train[:, 1] - r_train[:, 0]
diff_cv    = r_cv[:, 1] - r_cv[:, 0]

# count the number of zeros
J_train = 100 - (np.count_nonzero(diff_train == 0) / r_train.shape[0]) * 100
J_cv    = 100 - (np.count_nonzero(diff_cv == 0) / r_cv.shape[0]) * 100

In [27]:
print(J_train, J_cv)

20.485651214128026 22.012948793407887
