# Goal
Using the passenger information from *Spaceship Titanic*, predict which passengers were transported to a different dimension. \
Prediction is done using logistic regression, implemented as a small neural network with a sigmoid output.

# Table of Contents
1. Loading the data
2. Preprocessing
3. Defining the model
4. Model training
5. Model testing

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
import os

# Print the full path of every file found under the Kaggle input directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Loading the data

In [3]:
# Read the train and test CSV files into dataframes
df_train = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
df_test = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')

Take a look at the data

In [4]:
# Display the raw training dataframe
df_train

In [5]:
# Count the missing values in each column of the training dataframe
df_train.isnull().sum()

In [6]:
# Display the raw test dataframe
df_test

In [7]:
# Count the missing values in each column of the test dataframe
df_test.isnull().sum()

The training data has 14 columns, including the label 'Transported'. \
The label is binary (True/False), so **logistic regression** will be used to predict the status of 'Transported'.

Not all columns are likely to correlate with the label. \
Name, HomePlanet, and Destination are very unlikely to correlate. \
RoomService, FoodCourt, ShoppingMall, Spa, and VRDeck have a questionable correlation. \
A cross-correlation matrix will help to get a sense of the data. The features will also need to be encoded and normalized.

# Preprocessing
Much of the data needs to be transformed into something machine-readable. \
Categorical features will be one-hot encoded and numerical features will be normalized.

<br>

In [8]:
def encodePassengerId(df):
    '''
    Replaces PassengerId with a normalized group number

    The group number is the first 4 characters of each PassengerId, read as a float.
    The group numbers are shifted by their mean and divided by their standard deviation,
    both computed over the rows of df.
    Writes the result into df['PassengerId'] and returns df
    '''
    group_numbers = np.array([float(passenger_id[:4]) for passenger_id in df['PassengerId']])

    centered = group_numbers - np.average(group_numbers)
    df['PassengerId'] = centered / np.std(group_numbers)

    return df

def encodeHomePlanet(df):
    '''
    1-hot encoding of HomePlanet.
    Will remove column HomePlanet and add a new column for each planet found in the data
    (e.g. Earth, Europa, Mars), in sorted order, at the position HomePlanet had.
    Entries without a HomePlanet will be assigned value 0 for all columns
    The new columns are inserted into df itself; the returned dataframe is the one without HomePlanet
    '''
    home_planet = df['HomePlanet']
    start_loc = df.columns.get_loc('HomePlanet')

    # Sort the categories so the column order is the same on every run and for every dataframe
    # holding the same planets; iterating a set gives no such guarantee, and the model
    # consumes columns by position.
    planets = sorted(home_planet.dropna().unique())

    for offset, planet in enumerate(planets):
        # isin() yields False for missing entries, so those rows stay 0 in every column
        membership = home_planet.isin([planet]).to_numpy(dtype=float)
        df.insert(loc=start_loc + offset, column=planet, value=membership)

    return df.drop('HomePlanet', axis=1)

def encodeCryoSleep(df):
    '''
    Encodes CryoSleep from True/False to 1/0

    Missing entries take the next known value below them (backfill). Missing entries at the
    end of the column have nothing below them, so they take the last known value above instead.
    Writes the result into df['CryoSleep'] and returns df
    '''
    as_float = df['CryoSleep'].astype(float)

    # Assign the filled column back instead of calling fillna(..., inplace=True) on
    # df['CryoSleep']: that call acts on a temporary selection and is not guaranteed to reach df
    # (it never does under pandas Copy-on-Write), and fillna(method=...) is deprecated.
    df['CryoSleep'] = as_float.bfill().ffill()

    return df

def encodeCabin(df):
    '''
    Encodes Cabin information from deck/num/side to 1-hot of deck, normalized value of num, and 1-hot of side

    Cabin is replaced, at its position, by the columns 'Deck <deck>' (one per deck, sorted),
    'Num', and 'Side <side>' (one per side, sorted).
    Num is shifted by its mean and divided by its standard deviation, both computed over the
    entries of df that have a Cabin.
    Entries without a Cabin are assigned value 0 for all deck and side columns and 0 for Num
    The new columns are inserted into df itself; the returned dataframe is the one without Cabin
    '''
    start_loc = df.columns.get_loc('Cabin')
    nan = float('NaN')

    # Iterate over the values instead of looking up df['Cabin'][i]: that lookup goes by index
    # label and only works when the index is exactly 0..m-1.
    decks, nums, sides = [], [], []
    for cabin in df['Cabin']:
        if isinstance(cabin, str):
            deck, num, side = cabin.split('/')
            decks.append(deck)
            nums.append(float(num))
            sides.append(side)
        else:
            # missing Cabin
            decks.append(None)
            nums.append(nan)
            sides.append(None)

    # encode and normalize num; missing entries become 0 after normalization
    num_values = np.array(nums, dtype=float)
    normalized = (num_values - np.nanmean(num_values)) / np.nanstd(num_values)
    normalized = np.where(np.isnan(normalized), 0.0, normalized)

    # Sort the categories so the column order is the same on every run and for every dataframe
    # holding the same decks and sides; iterating a set gives no such guarantee.
    new_columns = []
    for deck in sorted({d for d in decks if d is not None}):
        new_columns.append(('Deck ' + str(deck), np.array([float(d == deck) for d in decks])))
    new_columns.append(('Num', normalized))
    for side in sorted({s for s in sides if s is not None}):
        new_columns.append(('Side ' + str(side), np.array([float(s == side) for s in sides])))

    # Inserting at increasing offsets pushes Cabin to the right of the new columns
    for offset, (name, values) in enumerate(new_columns):
        df.insert(loc=start_loc + offset, column=name, value=values)

    return df.drop('Cabin', axis=1)

def encodeDestination(df):
    '''
    1-hot encoding of Destination.
    Will remove column Destination and add a new column for each planet found in the data
    (e.g. '55 Cancri e', 'PSO J318.5-22', 'TRAPPIST-1e'), in sorted order, at the position Destination had.
    Entries without a Destination will be assigned value 0 for all columns
    The new columns are inserted into df itself; the returned dataframe is the one without Destination
    '''
    destination = df['Destination']
    start_loc = df.columns.get_loc('Destination')

    # Sort the categories so the column order is the same on every run and for every dataframe
    # holding the same destinations; iterating a set gives no such guarantee, and the model
    # consumes columns by position.
    planets = sorted(destination.dropna().unique())

    for offset, planet in enumerate(planets):
        # isin() yields False for missing entries, so those rows stay 0 in every column
        membership = destination.isin([planet]).to_numpy(dtype=float)
        df.insert(loc=start_loc + offset, column=planet, value=membership)

    return df.drop('Destination', axis=1)

def encodeAge(df):
    '''
    normalizes Age

    Age is shifted by its mean and divided by its standard deviation, both computed over the
    non-missing entries of df. Missing entries are set to 0 after normalization.
    Writes the result into df['Age'] and returns df
    '''
    age = df['Age'].to_numpy(dtype=float)
    normalized = (age - np.nanmean(age)) / np.nanstd(age)

    # Fill before assigning instead of calling fillna(0, inplace=True) on df['Age']: that call
    # acts on a temporary selection and is not guaranteed to reach df (it never does under
    # pandas Copy-on-Write).
    df['Age'] = np.where(np.isnan(normalized), 0.0, normalized)
    return df

def encodeVIP(df):
    '''
    Encodes VIP from True/False to 1/0

    Missing entries take the next known value below them (backfill). Missing entries at the
    end of the column have nothing below them, so they take the last known value above instead.
    Writes the result into df['VIP'] and returns df
    '''
    as_float = df['VIP'].astype(float)

    # Assign the filled column back instead of calling fillna(..., inplace=True) on df['VIP']:
    # that call acts on a temporary selection and is not guaranteed to reach df (it never does
    # under pandas Copy-on-Write), and fillna(method=...) is deprecated.
    df['VIP'] = as_float.bfill().ffill()

    return df

def encodeRoomService(df):
    '''
    normalizes RoomService

    RoomService is shifted by its mean and divided by its standard deviation, both computed
    over the non-missing entries of df. Missing entries are set to 0 after normalization.
    Writes the result into df['RoomService'] and returns df
    '''
    data = df['RoomService'].to_numpy(dtype=float)
    normalized = (data - np.nanmean(data)) / np.nanstd(data)

    # Fill before assigning instead of calling fillna(0, inplace=True) on df['RoomService']:
    # that call acts on a temporary selection and is not guaranteed to reach df (it never does
    # under pandas Copy-on-Write).
    df['RoomService'] = np.where(np.isnan(normalized), 0.0, normalized)
    return df

def encodeFoodCourt(df):
    '''
    normalizes FoodCourt

    FoodCourt is shifted by its mean and divided by its standard deviation, both computed
    over the non-missing entries of df. Missing entries are set to 0 after normalization.
    Writes the result into df['FoodCourt'] and returns df
    '''
    data = df['FoodCourt'].to_numpy(dtype=float)
    normalized = (data - np.nanmean(data)) / np.nanstd(data)

    # Fill before assigning instead of calling fillna(0, inplace=True) on df['FoodCourt']:
    # that call acts on a temporary selection and is not guaranteed to reach df (it never does
    # under pandas Copy-on-Write).
    df['FoodCourt'] = np.where(np.isnan(normalized), 0.0, normalized)
    return df

def encodeShoppingMall(df):
    '''
    normalizes ShoppingMall

    ShoppingMall is shifted by its mean and divided by its standard deviation, both computed
    over the non-missing entries of df. Missing entries are set to 0 after normalization.
    Writes the result into df['ShoppingMall'] and returns df
    '''
    data = df['ShoppingMall'].to_numpy(dtype=float)
    normalized = (data - np.nanmean(data)) / np.nanstd(data)

    # Fill before assigning instead of calling fillna(0, inplace=True) on df['ShoppingMall']:
    # that call acts on a temporary selection and is not guaranteed to reach df (it never does
    # under pandas Copy-on-Write).
    df['ShoppingMall'] = np.where(np.isnan(normalized), 0.0, normalized)
    return df

def encodeSpa(df):
    '''
    normalizes Spa

    Spa is shifted by its mean and divided by its standard deviation, both computed over the
    non-missing entries of df. Missing entries are set to 0 after normalization.
    Writes the result into df['Spa'] and returns df
    '''
    data = df['Spa'].to_numpy(dtype=float)
    normalized = (data - np.nanmean(data)) / np.nanstd(data)

    # Fill before assigning instead of calling fillna(0, inplace=True) on df['Spa']: that call
    # acts on a temporary selection and is not guaranteed to reach df (it never does under
    # pandas Copy-on-Write).
    df['Spa'] = np.where(np.isnan(normalized), 0.0, normalized)
    return df

def encodeVRDeck(df):
    '''
    normalizes VRDeck

    VRDeck is shifted by its mean and divided by its standard deviation, both computed over
    the non-missing entries of df. Missing entries are set to 0 after normalization.
    Writes the result into df['VRDeck'] and returns df
    '''
    data = df['VRDeck'].to_numpy(dtype=float)
    normalized = (data - np.nanmean(data)) / np.nanstd(data)

    # Fill before assigning instead of calling fillna(0, inplace=True) on df['VRDeck']: that
    # call acts on a temporary selection and is not guaranteed to reach df (it never does
    # under pandas Copy-on-Write).
    df['VRDeck'] = np.where(np.isnan(normalized), 0.0, normalized)
    return df

def encodeTransported(df):
    '''
    Converts the Transported label from True/False to 1.0/0.0
    Writes the result into df['Transported'] and returns df
    '''
    as_floats = list(map(float, df['Transported']))
    df['Transported'] = as_floats

    return df

def encodeDF(df, test=False):
    '''
    Cleans data to be machine readable
    Uses encodePassengerId, encodeHomePlanet, encodeCryoSleep, encodeCabin, encodeDestination, encodeAge,
        encodeVIP, encodeRoomService, encodeFoodCourt, encodeShoppingMall, encodeSpa, encodeVRDeck, encodeTransported
    Drops Name

    df: dataframe holding the raw columns listed above
    test: when true, the Transported column is not encoded
    Returns the encoded dataframe
    '''
    # Encoders run in this order; each one takes the dataframe and returns the encoded one
    encoders = [
        encodePassengerId,
        encodeHomePlanet,
        encodeCryoSleep,
        encodeCabin,
        encodeDestination,
        encodeAge,
        encodeVIP,
        encodeRoomService,
        encodeFoodCourt,
        encodeShoppingMall,
        encodeSpa,
        encodeVRDeck,
    ]
    if not test:
        encoders.append(encodeTransported)

    for encode in encoders:
        df = encode(df)

    return df.drop('Name', axis=1)

In [9]:
# Run both dataframes through the same encoding pipeline; test=True skips the Transported encoding.
# NOTE(review): each call computes its own mean/std, so train and test are normalized separately.
df_train = encodeDF(df_train)
df_test = encodeDF(df_test, test=True)

In [10]:
# Display every row and column of the encoded training dataframe
df_train.iloc[:,:]

In [12]:
# Correlation of every column with the last column (used as the label below), sorted ascending
df_train.corr().iloc[-1,:].sort_values()

The dataframe now has potentially 26 features to learn from. Some of these will likely need to be knocked out, but for the time being, let's define a model with 26 inputs.

<br>

In [13]:
# Check that no missing values are left after encoding
df_train.isnull().sum()

# Defining a model

Starting with a fully connected feedforward network with 2 hidden layers and 1 output layer.

In [14]:
# Hold out 20% of the training data for evaluation; features are all columns but the last, the label is the last column
x_train, x_test, y_train, y_test = train_test_split(df_train.iloc[:,:-1],df_train.iloc[:,-1],test_size=0.20,random_state=0)

In [15]:
# Number of input features: every column of df_train except the last one (the label)
n = df_train.shape[1] - 1
inputs = tf.keras.Input(shape=(n,))
# Feed the dropout output into the first Dense layer. That layer used to be called on
# `inputs` directly, which discarded the Dropout layer, so it was never part of the model.
x = tf.keras.layers.Dropout(.2)(inputs)
x = tf.keras.layers.Dense(26, activation='relu')(x)
x = tf.keras.layers.Dense(13, activation='relu')(x)
# Optional third hidden layer, left disabled:
# x = tf.keras.layers.Dense(7, activation='relu')(x)
# A single sigmoid unit outputs a value between 0 and 1 for the binary label
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs, name='2_26_13')

In [16]:
# Print the layers of the model and their parameter counts
model.summary()

In [17]:
# Adam optimizer with its hyperparameters spelled out
adam = tf.keras.optimizers.Adam(
    learning_rate=0.001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07,
    amsgrad=False,
    name='Adam',
)

# Binary cross-entropy loss for the True/False label; accuracy is tracked during training
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=adam,
    metrics=["accuracy"],
)

# Train silently; 20% of the training split is set aside for validation
history = model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=250,
    validation_split=0.2,
    verbose=False,
)

# Loss and accuracy on the held-out evaluation split
results = model.evaluate(x_test, y_test, batch_size=128)
print("test loss, test acc:", results)

In [18]:
# Plot the training history: loss on the left, accuracy on the right,
# each with the training and validation curves per epoch
plt.figure(figsize=(12,4))
# Left panel: loss
plt.subplot(1,2,1)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='val')
plt.ylabel('Loss')
plt.xlabel('Epochs')
plt.legend(loc=1)
# Right panel: accuracy
plt.subplot(1,2,2)
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='val')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.legend(loc=4)

Notes from trying a few iterations:

~75% accuracy using CryoSleep, VRDeck, RoomService, Spa, FoodCourt, ShoppingMall\
~72% accuracy using CryoSleep, VRDeck, RoomService, Spa \
~76% accuracy dropping columns with less than 0.05 correlation coeff. Less overfitting\
~76% accuracy dropping columns with less than 0.025 correlation coeff and 0.2 dropout\
~76% accuracy using all columns and 4 layers with 64 nodes each\
~78% accuracy using all columns and 0.2 dropout [26, 13, 7, 1]\
~80% accuracy using all columns and 0.2 dropout [26,13,1]

In [20]:
# Display the encoded test dataframe
df_test

In [19]:
# Check that no missing values are left in the encoded test dataframe
df_test.isnull().sum()

In [28]:
# Predict on the encoded test frame, mark outputs above 0.5 as True, and flatten to 1-D
y_pred = (model.predict(df_test) > 0.5).reshape(-1,)

In [31]:
# Reload the raw test file: encoding overwrote PassengerId with normalized group numbers,
# and the submission below needs the original ids
df_test = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')

In [32]:
# Pair each original PassengerId with its predicted Transported value
df_submit = pd.DataFrame({'PassengerId':df_test['PassengerId'], 'Transported':y_pred})

In [38]:
# Write the submission file without the dataframe index
df_submit.to_csv('titanic_AVong.csv', index=False)

In [39]:
# Read the submission file back in to check what was written
df = pd.read_csv('titanic_AVong.csv')

In [40]:
# Display the submission as read back from disk
df