In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.impute import SimpleImputer
import warnings
import numpy as np
from sklearn.neural_network import MLPClassifier



<h2>Checking and preparing our data</h2>

In [2]:
# Load the Titanic dataset from the current working directory; the first CSV
# column is used as the row index.
csv_path = f'{os.getcwd()}/Titanic.csv'
df = pd.read_csv(csv_path, index_col=0)

In [3]:
# Preview the first three rows of the raw data.
df.head(n=3)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
1,1st,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.337494,B5,Southampton,2.0,,"St Louis, MO"
2,1st,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.550003,C22 C26,Southampton,11.0,,"Montreal, PQ / Chesterville, ON"
3,1st,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.550003,C22 C26,Southampton,,,"Montreal, PQ / Chesterville, ON"


In [4]:
# Reducing features to ones we need. An explicit copy is taken so that the
# column assignments in later cells modify an independent frame rather than a
# slice of the loaded data (avoids pandas' SettingWithCopyWarning and
# chained-assignment pitfalls).
df = df[['pclass', 'sex', 'age', 'sibsp', 'survived']].copy()

In [5]:
# Preview the first three rows after reducing the columns.
df.head(n=3)

Unnamed: 0,pclass,sex,age,sibsp,survived
1,1st,female,29.0,0,1
2,1st,male,0.9167,1,1
3,1st,female,2.0,1,0


In [6]:
# Replace the categorical features with numerical values.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['pclass']: an in-place call on a column
# selection is chained assignment, which pandas warns about and which does not
# update df when Copy-on-Write is enabled.
df['pclass'] = df['pclass'].replace({'1st': 1, '2nd': 2, '3rd': 3}).astype(int)
df['sex'] = df['sex'].replace({'female': 0, 'male': 1}).astype(int)


In [7]:
# Show each column's dtype to check that 'pclass' and 'sex' are numeric after the replacement.
df.dtypes

pclass        int64
sex           int64
age         float64
sibsp         int64
survived      int64
dtype: object

In [8]:
# Preview the first three rows with the encoded 'pclass' and 'sex' columns.
df.head(n=3)

Unnamed: 0,pclass,sex,age,sibsp,survived
1,1,0,29.0,0,1
2,1,1,0.9167,1,1
3,1,0,2.0,1,0


In [9]:
# Build the missing-value mask once, then derive:
#   isnull        - the rows that contain at least one missing value
#   nulls_by_feat - the number of missing values in each column
missing_mask = df.isna()
isnull = df[missing_mask.any(axis=1)]
nulls_by_feat = missing_mask.sum()
nulls_by_feat

pclass        0
sex           0
age         263
sibsp         0
survived      0
dtype: int64

In [10]:
# Fill missing ages with the mean age.
# NOTE(review): the mean is fitted on every row of df, i.e. before the
# train/test split performed in a later cell.
numerical_imputer = SimpleImputer(strategy='mean')
# fit_transform returns a single-column 2-D array; flatten it for the column assignment.
df['age'] = numerical_imputer.fit_transform(df[['age']]).ravel()

In [11]:
# Preview the first three rows after imputing 'age'.
df.head(n=3)

Unnamed: 0,pclass,sex,age,sibsp,survived
1,1,0,29.0,0,1
2,1,1,0.9167,1,1
3,1,0,2.0,1,0


In [12]:
# Per-column count of missing values; every entry should be 0 after the imputation.
df.isna().sum()

pclass      0
sex         0
age         0
sibsp       0
survived    0
dtype: int64

<h2>Step 1: Splitting the data</h2>

In [13]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
feature_columns = ['pclass', 'sex', 'age', 'sibsp']
train_X, test_X, train_y, test_y = train_test_split(
    df[feature_columns],
    df['survived'],
    test_size=0.2,
    random_state=42,
)

<h2>Step 2: Fit Neural Network </h2>

In [14]:
# Two different structures: each tuple gives the unit count of each hidden layer.
hidden_layer_sizes = [
    (5, 4),
    (10, 8),
]


In [15]:
##Creating the models and fitting

In [16]:
# Hyperparameters shared by both networks; only the hidden-layer layout differs.
# NOTE(review): scikit-learn documents `learning_rate` as only used when
# solver='sgd' - confirm whether 'adaptive' is intended together with 'adam'.
mlp_shared_params = dict(
    activation='logistic',
    alpha=1e-3,
    learning_rate='adaptive',
    solver='adam',
    max_iter=1000,
    random_state=42,
)
mlp_1 = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes[0], **mlp_shared_params)
mlp_2 = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes[1], **mlp_shared_params)

In [17]:
# Train the first network on the training split.
mlp_1.fit(X=train_X, y=train_y)

In [18]:
# Train the second network on the same training split.
mlp_2.fit(X=train_X, y=train_y)

<h2>Step 3: Check Performance Model 1</h2>

In [19]:
# Predicted survival labels of model 1 for the held-out test rows.
test_pred_1 = mlp_1.predict(X=test_X)


In [21]:
# Fraction of test rows for which model 1 predicted the correct label.
accuracy_1 = accuracy_score(y_true=test_y, y_pred=test_pred_1)
f"Overall Accuracy Model 1: {accuracy_1}"

'Overall Accuracy Model 1: 0.7366412213740458'

In [22]:
# Confusion matrix for model 1: rows are true classes, columns are predicted
# classes. labels=[0, 1] pins the matrix to 2x2 so the four-way unpack below
# still works if one class is absent from both test_y and the predictions.
cm_1 = confusion_matrix(test_y, test_pred_1, labels=[0, 1])
tn1, fp1, fn1, tp1 = cm_1.ravel()  # true negative, false positive, false negative, true positive
# Per-class accuracy (recall): the share of each true class predicted correctly.
survivor_accuracy_1 = tp1 / (tp1 + fn1)  # true survivors predicted as survivors
fatality_accuracy_1 = tn1 / (tn1 + fp1)  # true fatalities predicted as fatalities

In [23]:
# Share of true survivors that model 1 predicted as survivors: tp1 / (tp1 + fn1).
f"Survivor Accuracy Model 1: {survivor_accuracy_1}"

'Survivor Accuracy Model 1: 0.4915254237288136'

In [24]:
# Share of true fatalities that model 1 predicted as fatalities: tn1 / (tn1 + fp1).
f"Fatality Accuracy Model 1: {fatality_accuracy_1}"

'Fatality Accuracy Model 1: 0.9375'

<h2>Check Performance Model 2</h2>

In [25]:
# Predicted survival labels of model 2 for the held-out test rows.
test_pred_2 = mlp_2.predict(X=test_X)


In [26]:
# Fraction of test rows for which model 2 predicted the correct label.
accuracy_2 = accuracy_score(y_true=test_y, y_pred=test_pred_2)
f"Overall Accuracy Model 2: {accuracy_2}"

'Overall Accuracy Model 2: 0.7595419847328244'

In [27]:
# Confusion matrix for model 2: rows are true classes, columns are predicted
# classes. labels=[0, 1] pins the matrix to 2x2 so the four-way unpack below
# still works if one class is absent from both test_y and the predictions.
cm_2 = confusion_matrix(test_y, test_pred_2, labels=[0, 1])
tn2, fp2, fn2, tp2 = cm_2.ravel()  # true negative, false positive, false negative, true positive
# Per-class accuracy (recall): the share of each true class predicted correctly.
survivor_accuracy_2 = tp2 / (tp2 + fn2)  # true survivors predicted as survivors
fatality_accuracy_2 = tn2 / (tn2 + fp2)  # true fatalities predicted as fatalities

In [28]:
# Share of true survivors that model 2 predicted as survivors: tp2 / (tp2 + fn2).
f"Survivor Accuracy Model 2: {survivor_accuracy_2}"

'Survivor Accuracy Model 2: 0.6016949152542372'

In [29]:
# Share of true fatalities that model 2 predicted as fatalities: tn2 / (tn2 + fp2).
f"Fatality Accuracy Model 2: {fatality_accuracy_2}"

'Fatality Accuracy Model 2: 0.8888888888888888'

<h2>Compare Performance with Random Forest</h2>

From HW3:
Random Forest - Percent survivors correctly predicted: 50.00%
Random Forest - Percent deaths correctly predicted: 95.14%

In [31]:
## Creating df with the performance from different models
# pandas is already imported as `pd` in the notebook's first cell, so it is
# not imported again here.

# One (layer sizes, overall, survivor, fatality) tuple per neural network.
nn_metrics = [
    (hidden_layer_sizes[0], accuracy_1, survivor_accuracy_1, fatality_accuracy_1),
    (hidden_layer_sizes[1], accuracy_2, survivor_accuracy_2, fatality_accuracy_2),
]

# Build both rows from the same template; the layer label (e.g. '5, 4') is
# derived from hidden_layer_sizes so it cannot drift from the fitted models.
nn_results = [
    {
        'Model': 'Neural Network',
        'Hidden Layers': ', '.join(str(units) for units in layers),
        'Overall Accuracy': f'{overall * 100:.2f}%',
        'Survivor Accuracy': f'{survivors * 100:.2f}%',
        'Fatality Accuracy': f'{fatalities * 100:.2f}%',
    }
    for layers, overall, survivors, fatalities in nn_metrics
]

# Random forest figures are hard-coded from the HW3 results quoted above this
# cell; no overall accuracy was recorded for that model.
random_forest_results = {
    'Model': 'Random Forest',
    'Hidden Layers': 'N/A',
    'Overall Accuracy': 'N/A',
    'Survivor Accuracy': '50%',
    'Fatality Accuracy': '95.14%',
}

# Combine results and convert to df
all_results = nn_results + [random_forest_results]

results_df = pd.DataFrame(all_results)

results_df


Unnamed: 0,Model,Hidden Layers,Overall Accuracy,Survivor Accuracy,Fatality Accuracy
0,Neural Network,"5, 4",73.66%,49.15%,93.75%
1,Neural Network,"10, 8",75.95%,60.17%,88.89%
2,Random Forest,,,50%,95.14%


As you can tell, the overall accuracy of the two neural networks is comparable (73.66% vs. 75.95%). Both networks have two hidden layers; they differ in the number of units per layer. The larger network (10, 8) achieved a significantly higher survivor accuracy (60.17%) than the smaller network (5, 4) (49.15%), whereas the smaller network had a somewhat higher fatality accuracy (93.75% vs. 88.89%). Despite these performance differences, the pattern is clear: the larger network had the highest survivor accuracy of all three models but the lowest fatality accuracy. The random forest model had the best fatality accuracy (95.14%), but its survivor accuracy (50%) was essentially tied for worst with the first NN model.