In [17]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import pandas as pd
import csv
import os
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import recall_score 
from tensorflow import keras

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

np.random.seed(42)
tf.random.set_seed(42)

# Read and prepare file
data = pd.read_table('Titanic.csv', delimiter=',', header=0)
age_mean = data['age'].mean()
sibsp_mean = data['sibsp'].mean()
fare_mean = data['fare'].mean()
body_mean = data['body'].mean()

# Gender / Age / sibsp / pclass
data.age = data.age.fillna(value=age_mean)
data.sibsp = data.sibsp.fillna(value=sibsp_mean)
data.fare = data.fare.fillna(value=fare_mean)
data.body = data.body.fillna(value=body_mean)
data_adj = data.drop(['name', 'parch', 'ticket', 'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'], axis=1)
data_adj = data_adj.drop(data_adj.columns[0], axis=1)

# Convert strings to discrete values
le = preprocessing.LabelEncoder()
le.fit(data_adj['pclass'])
data_adj['pclass'] = le.transform(data['pclass'])
le.fit(data_adj['sex'])
data_adj['sex'] = le.transform(data_adj['sex'])

y = data_adj['survived']
x = data_adj.drop(['survived'], axis=1)
norm = MinMaxScaler().fit(x)
x_norm = norm.transform(x)

# Split Data 80:20 (Train:Test)
x_train, x_test, y_train, y_test = train_test_split(x_norm, y, test_size=0.2, random_state=42)

In [18]:
# Model 1
# units = nodes
# pclass, sex, age, sibsp
pclass_num = len(set(data_adj['pclass']))
sex_num = len(set(data_adj['sex']))
age_num = len(set(data_adj['age']))
sibsp_num = len(set(data_adj['sibsp']))
tot_nodes = pclass_num + sex_num + age_num + sibsp_num

model = tf.keras.Sequential()
model.add(tf.keras.layers.BatchNormalization())
model.add(keras.layers.Dense(tot_nodes,input_shape=x_train.shape[1:]))
model.add(keras.layers.Dense(tot_nodes, activation='sigmoid'))
model.add(keras.layers.Dense(50, activation='sigmoid'))
model.add(keras.layers.Dense(1, activation='sigmoid', input_shape=x_train.shape[1:]))

model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])

history = model.fit(x_train,y_train,batch_size=32,epochs = 50)
#model.summary()
y_pred = model.predict(x_test)
y_pred = (y_pred > 0.5).astype(int).ravel()
# accuracy(% survivors correctly predicted)  TP/(TP + FN)
mod1_surv = recall_score(y_test, y_pred, pos_label=1)

# accuracy(% fatalities correctly predicted)  TN/(TN + FP)
mod1_fat = recall_score(y_test, y_pred, pos_label=0)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [19]:
# Model 2
model1 = tf.keras.Sequential()
model1.add(tf.keras.layers.BatchNormalization())
model1.add(keras.layers.Dense(tot_nodes,input_shape=x_train.shape[1:]))
model1.add(keras.layers.Dense(50, activation='sigmoid'))
model1.add(keras.layers.Dense(4, activation='sigmoid'))
model1.add(keras.layers.Dense(1, activation='sigmoid', input_shape=x_train.shape[1:]))


model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

history = model1.fit(x_train,y_train,batch_size=32,epochs = 50)
#model.summary()
y_pred = model1.predict(x_test)
y_pred = (y_pred > 0.5).astype(int).ravel()
# accuracy(% survivors correctly predicted)  TP/(TP + FN)
mod2_surv = recall_score(y_test, y_pred, pos_label=1)

# accuracy(% fatalities correctly predicted)  TN/(TN + FP)
mod2_fat = recall_score(y_test, y_pred, pos_label=0)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [20]:
from sklearn.ensemble import RandomForestClassifier
from tabulate import tabulate

rnd_clf = RandomForestClassifier(n_estimators=50, max_leaf_nodes=40, n_jobs=-1)
rnd_clf.fit(x_train, y_train)
y_pred_rf = rnd_clf.predict(x_test)

# accuracy(% survivors correctly predicted)   TP/(TP + FN)
random_forest_surv = recall_score(y_test, y_pred_rf, pos_label=1)

# accuracy(% fatalities correctly predicted)   TN/(TN + FP)
random_forest_fat = recall_score(y_test, y_pred_rf, pos_label=0)

table = [['Learning Model', 'Accuracy Survivors Correctly Predicted', 'Accuracy Fatalities Correctly Predicted'],
        ['Model #1', mod1_surv, mod1_fat],
        ['Model #2', mod2_surv, mod2_fat],
        ['Random Forest', random_forest_surv, random_forest_fat]]

print(tabulate(table, headers='firstrow'))

Learning Model      Accuracy Survivors Correctly Predicted    Accuracy Fatalities Correctly Predicted
----------------  ----------------------------------------  -----------------------------------------
Model #1                                          0.627119                                   0.875
Model #2                                          0.550847                                   0.923611
Random Forest                                     0.576271                                   0.895833
