In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Reading Titanic dataset, slicing into required columns, mapping strings to integer values and replacing NA with mean values in age

data = pd.read_csv("Titanic.csv")
data = data[["pclass", "sex", "age","sibsp","survived"]] 
data["pclass"] = data["pclass"].map({"1st":1,"2nd":2,"3rd":3})
data["sex"] = data["sex"].map({"male":0,"female":1})
data["age"].fillna((data["age"].mean()),inplace=True)
data = data.dropna()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 1308
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1309 non-null   int64  
 1   sex       1309 non-null   int64  
 2   age       1309 non-null   float64
 3   sibsp     1309 non-null   int64  
 4   survived  1309 non-null   int64  
dtypes: float64(1), int64(4)
memory usage: 61.4 KB


In [3]:
# Train test splitting the dataset

x = data.drop("survived", axis = 1)
y = data["survived"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 18)

print("Size of training data is ",len(x_train))
print("Size of test data is ",len(x_test))

Size of training data is  1047
Size of test data is  262


In [4]:
# Here we use a neural network with 2 layers and display its accuracy
# Activation function logistic is not accepted, hence we use Relu

neural = MLPClassifier(solver='adam',hidden_layer_sizes=(2,),activation='logistic',learning_rate='constant',
                           learning_rate_init=0.1,alpha=0.00000001, momentum=0.4)

neural = neural.fit(x_train, y_train)
y_pred = neural.predict(x_test)

print("The accuracy of the model is ",accuracy_score(y_test, y_pred)*100,"%")

conf_matrix = confusion_matrix(y_test, y_pred)

TN = conf_matrix[0][0]
FN = conf_matrix[1][0]
TP = conf_matrix[1][1]
FP = conf_matrix[0][1]

sensitivity1 = TP/(TP+FN)
specificity1 = TN/(FP+TN)

print("out-of-sample percent survivors correctly predicted (on test set) :",sensitivity1*100,"%")
print("out-of‐sample percent fatalities correctly predicted (on test set) :",specificity1*100,"%")

The accuracy of the model is  73.66412213740458 %
out-of-sample percent survivors correctly predicted (on test set) : 50.45871559633027 %
out-of‐sample percent fatalities correctly predicted (on test set) : 90.19607843137256 %


In [5]:
# Now building a neural network with more layers, to check accuracy

neural = MLPClassifier(solver='adam',hidden_layer_sizes=(1000,),activation='logistic',learning_rate='constant',
                           learning_rate_init=0.1,alpha=0.00000001, momentum=0.4)

neural = neural.fit(x_train, y_train)
y_pred = neural.predict(x_test)


print("The accuracy of the model is ",accuracy_score(y_test, y_pred)*100,"%")

conf_matrix = confusion_matrix(y_test, y_pred)

TN = conf_matrix[0][0]
FN = conf_matrix[1][0]
TP = conf_matrix[1][1]
FP = conf_matrix[0][1]

sensitivity2 = TP/(TP+FN)
specificity2 = TN/(FP+TN)

print("out-of-sample percent survivors correctly predicted (on test set) :",sensitivity2*100,"%")
print("out-of‐sample percent fatalities correctly predicted (on test set) :",specificity2*100,"%")

The accuracy of the model is  75.95419847328245 %
out-of-sample percent survivors correctly predicted (on test set) : 54.12844036697248 %
out-of‐sample percent fatalities correctly predicted (on test set) : 91.50326797385621 %


In [6]:
# Now building the same random forest as HW3 to compare

forest = RandomForestClassifier(max_leaf_nodes = 7,n_estimators = 50, random_state = 18)
forest = forest.fit(x_train, y_train)
y_pred = forest.predict(x_test)

print("The accuracy of the model is ",accuracy_score(y_test, y_pred)*100,"%")

conf_matrix = confusion_matrix(y_test, y_pred)

TN = conf_matrix[0][0]
FN = conf_matrix[1][0]
TP = conf_matrix[1][1]
FP = conf_matrix[0][1]

sensitivity3 = TP/(TP+FN)
specificity3 = TN/(FP+TN)

print("out-of-sample percent survivors correctly predicted (on test set) :",sensitivity3*100,"%")
print("out-of‐sample percent fatalities correctly predicted (on test set) :",specificity3*100,"%")

The accuracy of the model is  76.33587786259542 %
out-of-sample percent survivors correctly predicted (on test set) : 67.88990825688074 %
out-of‐sample percent fatalities correctly predicted (on test set) : 82.35294117647058 %


In [7]:
data = [["Neural Network with 2 layers",sensitivity1,specificity1],["Neural Network with 1000 layers",sensitivity2,specificity2],["Random Forest from HW3",sensitivity3,specificity3]]
df = pd.DataFrame(data, columns=['Type of model','Survivor', 'Fatality'])
df

Unnamed: 0,Type of model,Survivor,Fatality
0,Neural Network with 2 layers,0.504587,0.901961
1,Neural Network with 1000 layers,0.541284,0.915033
2,Random Forest from HW3,0.678899,0.823529
