In [91]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
import random
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Turn off warning messages
import warnings
warnings.filterwarnings("ignore")

In [92]:
# install potential dependencies
%pip install catboost



In [93]:
# import full dataset
full_set = pd.read_csv(Path("./Resources/Titanic_full_dataset.csv"))
full_set.head(2)

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,num_related,has_related,has_special_ticket
0,0,316,1,3,"Nilsson, Miss. Helmina Josefina",female,26.0,0,0,7.8542,,S,0,False,False
1,1,707,1,2,"Kelly, Mrs. Florence ""Fannie""",female,45.0,0,0,13.5,,S,0,False,False


In [94]:
# generic model evaluation method
def ModelEvaluation(model, X_train, y_train, X_test, y_test):
    # fit the model and predict on test and train datasets
    fitted = model.fit(X_train, y_train)
    train_predict = fitted.predict(X_train)
    test_predict = fitted.predict(X_test)

    # output performance reports
    print("Training Classification Report:")
    print(classification_report(y_train, train_predict))

    print("Testing Classification Report:")
    print(classification_report(y_test, test_predict))

# this will take a single full titanic-related dataset and
# create scaled train and test sets; a specific subset of columns
# can be specified to reduce the features; an alternate target column
# can be specified for further
def FormatTitanicData(data_frame, dummies, target='Survived', columns = []):
    # if not the default, get a subset of all columns
    if columns != []:
        curr_frame = data_frame[columns]
        # if a column is not in the passed set, do not dummy it
        dummies = [c for c in dummies if c in columns]
    else:
        curr_frame = data_frame

    # split target out from data
    X = curr_frame.drop(target, axis=1)
    y = curr_frame[target]

    # split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    # dummy categorical columns
    dummy_train = pd.get_dummies(X_train, columns=dummies)
    dummy_test = pd.get_dummies(X_test, columns=dummies)

    # d_train_cols = dummy_train.columns.to_list()
    # d_test_cols = dummy_test.columns.to_list()
    # if d_train_cols != d_test_cols

    # print(dummy_train.head(), dummy_test.head())

    # scale columns
    scaler = StandardScaler()
    train_scaler = scaler.fit(dummy_train)
    train_scaled = train_scaler.transform(dummy_train)
    test_scaler = scaler.fit(dummy_test)
    test_scaled = test_scaler.transform(dummy_test)

    return train_scaled, y_train, test_scaled, y_test

# general cleanup method to work with new format of full_set
def FinishCleanup(data_frame):
    data_frame.drop('Name', inplace=True, axis=1)
    data_frame.drop('PassengerId', inplace=True, axis=1)
    data_frame.drop('Cabin', inplace=True, axis=1)

In [95]:
# create train/test sets from full dataset
FinishCleanup(full_set)
dummies = ['Pclass', 'Embarked', 'Sex']
full_train_scaled, y_train, full_test_scaled, y_test = FormatTitanicData(full_set, dummies)

In [96]:
# create a Logistic Regression Model
lr_classifier = LogisticRegression(random_state = 1)

# fit the Logistic Regression Model
lr_model = lr_classifier.fit(full_train_scaled,y_train)

In [97]:
# Make predictons using the scaled data
lr_training_prediction = lr_model.predict(full_train_scaled)
lr_testing_prediction = lr_classifier.predict(full_test_scaled)

In [98]:
print("Confusion Matrix for training data")
print(confusion_matrix(y_train,lr_training_prediction))

print("Confusion Matrix for testing data")
print(confusion_matrix(y_test,lr_testing_prediction))

Confusion Matrix for training data
[[541  55]
 [ 81 302]]
Confusion Matrix for testing data
[[193  25]
 [ 29  80]]


In [99]:
print(classification_report(y_train,lr_training_prediction))

              precision    recall  f1-score   support

           0       0.87      0.91      0.89       596
           1       0.85      0.79      0.82       383

    accuracy                           0.86       979
   macro avg       0.86      0.85      0.85       979
weighted avg       0.86      0.86      0.86       979



In [100]:
print(classification_report(y_test,lr_testing_prediction))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88       218
           1       0.76      0.73      0.75       109

    accuracy                           0.83       327
   macro avg       0.82      0.81      0.81       327
weighted avg       0.83      0.83      0.83       327



**K Nearest Neighbors Classifier**

In [101]:
from sklearn.neighbors import KNeighborsClassifier
# create a K Nearest Neighbors Classifier Model
knn = KNeighborsClassifier(n_neighbors=5)
knn_model = knn.fit(full_train_scaled,y_train)

In [102]:
# Make predictions
knn_training_prediction = knn_model.predict(full_train_scaled)
knn_testing_prediction = knn.predict(full_test_scaled)

In [103]:
print("Training Classification Report:")
print(classification_report(y_train,knn_training_prediction))

Training Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.92      0.91       596
           1       0.87      0.83      0.85       383

    accuracy                           0.88       979
   macro avg       0.88      0.87      0.88       979
weighted avg       0.88      0.88      0.88       979



In [104]:
print("Testing Classification Report:")
print(classification_report(y_test,knn_testing_prediction))

Testing Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.90      0.88       218
           1       0.78      0.71      0.74       109

    accuracy                           0.83       327
   macro avg       0.82      0.80      0.81       327
weighted avg       0.83      0.83      0.83       327



In [105]:
# K Nearest Neighbors Classifier Model
knn = KNeighborsClassifier(n_neighbors=5)
ModelEvaluation(knn, full_train_scaled, y_train, full_test_scaled, y_test)

Training Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.92      0.91       596
           1       0.87      0.83      0.85       383

    accuracy                           0.88       979
   macro avg       0.88      0.87      0.88       979
weighted avg       0.88      0.88      0.88       979

Testing Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.90      0.88       218
           1       0.78      0.71      0.74       109

    accuracy                           0.83       327
   macro avg       0.82      0.80      0.81       327
weighted avg       0.83      0.83      0.83       327



In [106]:
# Logistic Regression Model
lor = LogisticRegression(random_state = 1)
ModelEvaluation(lor, full_train_scaled, y_train, full_test_scaled, y_test)

Training Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.91      0.89       596
           1       0.85      0.79      0.82       383

    accuracy                           0.86       979
   macro avg       0.86      0.85      0.85       979
weighted avg       0.86      0.86      0.86       979

Testing Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.89      0.88       218
           1       0.76      0.73      0.75       109

    accuracy                           0.83       327
   macro avg       0.82      0.81      0.81       327
weighted avg       0.83      0.83      0.83       327



In [107]:
# Random Forest Classifier Model
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100)

ModelEvaluation(rf, full_train_scaled, y_train, full_test_scaled, y_test)

Training Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       596
           1       1.00      1.00      1.00       383

    accuracy                           1.00       979
   macro avg       1.00      1.00      1.00       979
weighted avg       1.00      1.00      1.00       979

Testing Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.89      0.88       218
           1       0.77      0.72      0.75       109

    accuracy                           0.83       327
   macro avg       0.82      0.81      0.81       327
weighted avg       0.83      0.83      0.83       327



In [108]:
# Decision Tree Classifier Model
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()

ModelEvaluation(tree, full_train_scaled, y_train, full_test_scaled, y_test)

Training Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       596
           1       1.00      1.00      1.00       383

    accuracy                           1.00       979
   macro avg       1.00      1.00      1.00       979
weighted avg       1.00      1.00      1.00       979

Testing Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.84      0.85       218
           1       0.69      0.71      0.70       109

    accuracy                           0.80       327
   macro avg       0.77      0.77      0.77       327
weighted avg       0.80      0.80      0.80       327



In [109]:
# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
ModelEvaluation(gnb, full_train_scaled, y_train, full_test_scaled, y_test)


Training Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.87      0.87       596
           1       0.80      0.80      0.80       383

    accuracy                           0.85       979
   macro avg       0.84      0.84      0.84       979
weighted avg       0.85      0.85      0.85       979

Testing Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.88      0.88       218
           1       0.75      0.75      0.75       109

    accuracy                           0.83       327
   macro avg       0.81      0.81      0.81       327
weighted avg       0.83      0.83      0.83       327



In [110]:
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
# evaluate numerous models from outside of class scope
gb = GradientBoostingClassifier(n_estimators=200)
ModelEvaluation(gb, full_train_scaled, y_train, full_test_scaled, y_test)

Training Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.95       596
           1       0.96      0.90      0.93       383

    accuracy                           0.94       979
   macro avg       0.95      0.94      0.94       979
weighted avg       0.94      0.94      0.94       979

Testing Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.88      0.88       218
           1       0.76      0.74      0.75       109

    accuracy                           0.83       327
   macro avg       0.81      0.81      0.81       327
weighted avg       0.83      0.83      0.83       327



In [111]:
bag = BaggingClassifier(n_estimators=200)
ModelEvaluation(bag, full_train_scaled, y_train, full_test_scaled, y_test)

Training Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       596
           1       1.00      1.00      1.00       383

    accuracy                           1.00       979
   macro avg       1.00      1.00      1.00       979
weighted avg       1.00      1.00      1.00       979

Testing Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.89      0.87       218
           1       0.77      0.69      0.72       109

    accuracy                           0.83       327
   macro avg       0.81      0.79      0.80       327
weighted avg       0.82      0.83      0.82       327



In [112]:
ada = AdaBoostClassifier(n_estimators=200)
ModelEvaluation(ada, full_train_scaled, y_train, full_test_scaled, y_test)

Training Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       596
           1       0.89      0.85      0.87       383

    accuracy                           0.90       979
   macro avg       0.90      0.89      0.90       979
weighted avg       0.90      0.90      0.90       979

Testing Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.89      0.88       218
           1       0.77      0.74      0.76       109

    accuracy                           0.84       327
   macro avg       0.82      0.82      0.82       327
weighted avg       0.84      0.84      0.84       327



In [113]:
from sklearn import tree
# evaluate decision tree performance
clf = tree.DecisionTreeClassifier()
clf = clf.fit(full_train_scaled, y_train)
prediction = clf.predict(full_test_scaled)

In [114]:
import graphviz
# create tree visualization
dot_data = tree.export_graphviz(clf, out_file=None)
graph = graphviz.Source(dot_data)
graph.render("tree_graph")

'tree_graph.pdf'

In [115]:
# evaluate DNN performance
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
input_features_num = len(full_train_scaled[0])
layer1_nodes = 60
layer2_nodes = 30

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=layer1_nodes, input_dim=input_features_num, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=layer2_nodes, activation="sigmoid"))

nn.add(tf.keras.layers.Dense(units=layer2_nodes, activation="sigmoid"))

# Output layer; sigmoid chosen due to binary nature of target
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_8 (Dense)             (None, 60)                1020      
                                                                 
 dense_9 (Dense)             (None, 30)                1830      
                                                                 
 dense_10 (Dense)            (None, 30)                930       
                                                                 
 dense_11 (Dense)            (None, 1)                 31        
                                                                 
Total params: 3811 (14.89 KB)
Trainable params: 3811 (14.89 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [116]:
# compile, fit, evaluate DNN
nn.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
fit_model = nn.fit(full_train_scaled,y_train,epochs=200)
model_loss, model_accuracy = nn.evaluate(full_test_scaled, y_test)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [117]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# Initialize CatBoost classifier
model_catboost = CatBoostClassifier(iterations=500, learning_rate=0.1, depth=6, loss_function='Logloss')

# Train the model
model_catboost.fit(full_train_scaled, y_train, eval_set=(full_test_scaled, y_test), verbose=False)

# Make predictions
y_pred = model_catboost.predict(full_test_scaled)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8409785932721713


In [118]:
# potential high-performance column sets from data evaluation stage
col_sets = [
    ['Pclass', 'Sex', 'Parch', 'Fare', 'Survived'],
    ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Survived'],
    ['Pclass', 'Sex', 'Age', 'SibSp', 'Fare', 'has_related', 'Survived'],
    ['Pclass', 'Sex', 'Age', 'SibSp', 'num_related', 'Survived'],
    ['Pclass', 'Sex', 'Fare', 'num_related', 'has_related', 'Survived'],
    ['Pclass', 'Sex', 'Age', 'Fare', 'Survived'],
    ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'num_related', 'has_related', 'has_special_ticket', 'Survived'],
    ['Pclass', 'Sex', 'Age', 'Parch', 'Embarked', 'num_related', 'has_special_ticket', 'Survived'],
    ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'num_related', 'has_related', 'Survived'],
    ['Pclass', 'Sex', 'Age', 'Parch', 'Fare', 'Embarked', 'Survived']
]

In [119]:
# try cat boost on these column sets
i = 0 # col set tracker
for curr_set in col_sets:
    # for each set, run the full process run above for the full set of cols
    dummies = ['Pclass', 'Embarked', 'Sex']
    X_tr,y_tr,X_te,y_te = FormatTitanicData(full_set, dummies, columns=curr_set)
    curr_cat = CatBoostClassifier(iterations=500, learning_rate=0.1, depth=6, loss_function='Logloss')
    curr_cat.fit(X_tr, y_tr, eval_set=(X_te, y_te), verbose=False)
    y_predict = curr_cat.predict(X_te)
    curr_acc = accuracy_score(y_te, y_predict)
    print(f"Accuracy {i}: {curr_acc}")
    i = i + 1

Accuracy 0: 0.8501529051987767
Accuracy 1: 0.8501529051987767
Accuracy 2: 0.8409785932721713
Accuracy 3: 0.8440366972477065
Accuracy 4: 0.8501529051987767
Accuracy 5: 0.8379204892966361
Accuracy 6: 0.8440366972477065
Accuracy 7: 0.8409785932721713
Accuracy 8: 0.8532110091743119
Accuracy 9: 0.8440366972477065
