In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from IPython.display import Image  
import pydotplus
from sklearn.model_selection import GridSearchCV
import warnings
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score


In [None]:
#Here we read the Titanic.csv file, keep the pclass, sex, age, sibsp and survived columns,
#encode pclass and sex as numbers, and fill the missing age values with the mean age of the other samples.

In [28]:
# Load the passenger table and keep only the columns used for modelling.
# The explicit .copy() makes df an independent frame, so the column
# assignments below do not write into a selection of the raw table.
df = pd.read_csv('Titanic.csv')
df = df[['pclass', 'sex', 'age', 'sibsp', 'survived']].copy()

# Encode the two categorical columns as integers; any label that is not in
# the mapping becomes NaN.
df['pclass'] = df['pclass'].map({'1st': 1, '2nd': 2, '3rd': 3})
df['sex'] = df['sex'].map({'male': 0, 'female': 1})

# Replace missing ages with the mean of the known ages. The result is assigned
# back to the column: calling fillna(..., inplace=True) on df['age'] is a
# chained in-place update that does not reach df under pandas Copy-on-Write.
df['age'] = df['age'].fillna(df['age'].mean())
df.head()

Unnamed: 0,pclass,sex,age,sibsp,survived
0,1,1,29.0,0,1
1,1,0,0.9167,1,1
2,1,1,2.0,1,0
3,1,0,30.0,1,0
4,1,1,25.0,1,0


In [None]:
#Here we randomly split the dataset into 80% training set and 20% test set

In [29]:
# Features are every column except the target; the target is 'survived'.
X = df.drop(columns='survived')
y = df['survived']

# 80% / 20% random split. random_state pins the shuffle so that the same rows
# land in the test set on every run and the metrics below are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
#Here we use a neural network with one hidden layer of 2 neurons and display its accuracy on the test set.

In [30]:
# One hidden layer with 2 logistic units, trained with the Adam solver.
# random_state pins the weight initialisation and batch shuffling so the
# accuracy shown below is reproducible from run to run.
# momentum and learning_rate are intentionally not passed: scikit-learn only
# reads them when solver='sgd', so they have no effect with solver='adam'.
classifier = MLPClassifier(solver='adam', hidden_layer_sizes=(2,), activation='logistic',
                           learning_rate_init=0.1, alpha=0.00000001, random_state=42)
classifier = classifier.fit(X_train, y_train)

# Accuracy on the held-out test set (displayed as the cell output).
y_pred = classifier.predict(X_test)
accuracy_score(y_test, y_pred)

0.7595419847328244

In [None]:
#Out-of-sample performance (on the test set) of the 1st NN: share of survivors and of fatalities correctly predicted.

In [31]:
# Predictions of the fitted network on the rows it was trained on.
y_pred_new = classifier.predict(X_train)

# Training set: split the confusion matrix into its four cells.
# Assumes a binary target, i.e. a 2x2 matrix whose first row holds the actual
# fatalities (0) and whose second row holds the actual survivors (1).
conf_matrix = confusion_matrix(y_train, y_pred_new)
(TN, FP), (FN, TP) = conf_matrix
sensitivity_train = TP / (TP + FN)
specificity_train = TN / (TN + FP)

# Testing set: same breakdown for the held-out predictions stored in y_pred.
conf_matrix = confusion_matrix(y_test, y_pred)
(TN, FP), (FN, TP) = conf_matrix
sensitivity_test = TP / (TP + FN)
specificity_test = TN / (TN + FP)

# Only the test-set rates are reported: sensitivity is the share of survivors
# predicted as survivors, specificity the share of fatalities predicted as fatalities.
print("out‐of‐sample percent survivors correctly predicted (on testing set) : {0:.2f}%\n".format(100 * sensitivity_test))
print("out-of‐sample percent fatalities correctly predicted (on testing set) : {0:.2f}%\n".format(100 * specificity_test))

out‐of‐sample percent survivors correctly predicted (on testing set) : 42.45%

out-of‐sample percent fatalities correctly predicted (on testing set) : 98.72%



In [None]:
#Here we use a neural network with one hidden layer of 100 neurons and display its accuracy on the test set.

In [32]:
# One hidden layer with 100 logistic units, trained with the Adam solver.
# random_state pins the weight initialisation and batch shuffling so the
# accuracy shown below is reproducible from run to run.
# momentum and learning_rate are intentionally not passed: scikit-learn only
# reads them when solver='sgd', so they have no effect with solver='adam'.
classifier = MLPClassifier(solver='adam', hidden_layer_sizes=(100,), activation='logistic',
                           learning_rate_init=0.1, alpha=0.00000001, random_state=42)
classifier = classifier.fit(X_train, y_train)

# Accuracy on the held-out test set (displayed as the cell output).
y_pred = classifier.predict(X_test)

accuracy_score(y_test, y_pred)

0.8015267175572519

In [None]:
#Out-of-sample performance (on the test set) of the 2nd NN: share of survivors and of fatalities correctly predicted.

In [33]:
# Predictions of the fitted network on the rows it was trained on.
y_pred_new = classifier.predict(X_train)

# Training set: split the confusion matrix into its four cells.
# Assumes a binary target, i.e. a 2x2 matrix whose first row holds the actual
# fatalities (0) and whose second row holds the actual survivors (1).
conf_matrix = confusion_matrix(y_train, y_pred_new)
(TN, FP), (FN, TP) = conf_matrix
sensitivity_train = TP / (TP + FN)
specificity_train = TN / (TN + FP)

# Testing set: same breakdown for the held-out predictions stored in y_pred.
conf_matrix = confusion_matrix(y_test, y_pred)
(TN, FP), (FN, TP) = conf_matrix
sensitivity_test = TP / (TP + FN)
specificity_test = TN / (TN + FP)

# Only the test-set rates are reported: sensitivity is the share of survivors
# predicted as survivors, specificity the share of fatalities predicted as fatalities.
print("out‐of‐sample percent survivors correctly predicted (on testing set) : {0:.2f}%\n".format(100 * sensitivity_test))
print("out-of‐sample percent fatalities correctly predicted (on testing set) : {0:.2f}%\n".format(100 * specificity_test))



out‐of‐sample percent survivors correctly predicted (on testing set) : 62.26%

out-of‐sample percent fatalities correctly predicted (on testing set) : 92.31%



In [34]:
# Side-by-side summary of the test-set results for the three models.
# The percentages are transcribed by hand from earlier runs (the Random Forest
# figures are not computed in the cells shown here); update them whenever the
# models are re-trained.
summary_rows = [
    {'Parameter': 'out‐of‐sample percent survivors correctly predicted (on testing set)',
     'Random Forest': '62.62%', '2 HL NN': '42.45%', '100 HL NN': '62.26%'},
    {'Parameter': 'out-of‐sample percent fatalities correctly predicted (on testing set)',
     'Random Forest': '82.58%', '2 HL NN': '98.72%', '100 HL NN': '92.31%'},
]

# Build the table in a single call: DataFrame.append was removed in pandas 2.0.
# NOTE: this rebinds df, which held the passenger data in the cells above.
df = pd.DataFrame(summary_rows, columns=['Parameter', 'Random Forest', '2 HL NN', '100 HL NN'])

# None (rather than the deprecated -1) disables column-width truncation so the
# long 'Parameter' labels are displayed in full.
pd.set_option('display.max_colwidth', None)
df

Unnamed: 0,Parameter,Random Forest,2 HL NN,100 HL NN
0,out‐of‐sample percent survivors correctly predicted (on testing set),62.62%,42.45%,62.26%
1,out-of‐sample percent fatalities correctly predicted (on testing set),82.58%,98.72%,92.31%


In [None]:
#Having trained Random Forest as well as Neural Network models multiple times, we cannot definitively state whether
#Random Forest or Neural Network will have the better accuracy.
#A Random Forest is an ensemble of Decision Trees.
#A neural network is a network of connected neurons.
#As a general rule of thumb:
#if you are working with tabular data, it is worth trying Random Forests first because they are easier to use.
#Random Forests require less preprocessing and the training process is simpler.
#Therefore, it is simpler to use a Random Forest in a production system.
#If you are not satisfied with the model's performance, you should try to tune and train a Neural Network.
#There are many hyperparameters that can be tuned in a Neural Network,
#and with enough knowledge and experience you can obtain very good results with one.