# Imports

In [27]:
import os
from warnings import filterwarnings
filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import torch
import torch.nn as nn

from captum.attr import IntegratedGradients
from captum.attr import LayerConductance
from captum.attr import NeuronConductance

%matplotlib inline

In [28]:
# --- Reproducibility ---------------------------------------------------------
# Single seed shared by the NumPy split and the PyTorch weight initialisation.
RANDOM_SEED = 42

# --- Data location -----------------------------------------------------------
# Remote copy of the Titanic passenger table (CSV).
FILE_URL = "https://biostat.app.vumc.org/wiki/pub/Main/DataSets/titanic3.csv"
# Local directory and file name the CSV is saved under.
DATA_DIR = "data"
FILE_NAME = "titanic.csv"
# Full local path, built with the platform's path separator.
FILE_PATH = os.path.join(DATA_DIR, FILE_NAME)

# Read Data

In [24]:
!wget -O $FILE_PATH $FILE_URL

df = pd.read_csv(FILE_PATH)
print(f"Records: {df.shape[0]}")
df.head()

--2024-02-17 14:46:31--  https://biostat.app.vumc.org/wiki/pub/Main/DataSets/titanic3.csv
Resolving biostat.app.vumc.org (biostat.app.vumc.org)... 160.129.29.79
Connecting to biostat.app.vumc.org (biostat.app.vumc.org)|160.129.29.79|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 116752 (114K) [text/csv]
Saving to: ‘data/titanic.csv’


2024-02-17 14:46:31 (1.27 MB/s) - ‘data/titanic.csv’ saved [116752/116752]

Records: 1309


Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


# Preprocess

In [25]:
# One-hot encode the three categorical columns. "sex" keeps its raw category
# names as column names; the other two get a prefix.
sex_onehot = pd.get_dummies(df["sex"])
embark_onehot = pd.get_dummies(df["embarked"], prefix="embark")
class_onehot = pd.get_dummies(df["pclass"], prefix="class")

# Columns removed from the model input: free-text / identifier fields plus the
# raw categorical columns that the one-hot columns above replace.
unused_columns = [
    "name",
    "ticket",
    "cabin",
    "boat",
    "body",
    "home.dest",
    "sex",
    "embarked",
    "pclass",
]

df_c = (
    pd.concat([df, sex_onehot, embark_onehot, class_onehot], axis=1)
    # Missing ages and fares are replaced by the respective column mean.
    .assign(
        age=lambda frame: frame["age"].fillna(frame["age"].mean()),
        fare=lambda frame: frame["fare"].fillna(frame["fare"].mean()),
    )
    .drop(columns=unused_columns)
)
df_c.head()

Unnamed: 0,survived,age,sibsp,parch,fare,female,male,embark_C,embark_Q,embark_S,class_1,class_2,class_3
0,1,29.0,0,0,211.3375,True,False,False,False,True,True,False,False
1,1,0.92,1,2,151.55,False,True,False,False,True,True,False,False
2,0,2.0,1,2,151.55,True,False,False,False,True,True,False,False
3,0,30.0,1,2,151.55,False,True,False,False,True,True,False,False
4,0,25.0,1,2,151.55,True,False,False,False,True,True,False,False


In [26]:
# Set random seed for reproducibility.
np.random.seed(RANDOM_SEED)

# Fraction of the records assigned to the training set; the rest is the test set.
TRAIN_FRACTION = 0.7

# Convert features and labels to numpy arrays.
# NOTE: df_c is rebound without the label column here, so re-running this cell
# on its own raises KeyError("survived"); re-run the preprocessing cell first.
labels = df_c["survived"].to_numpy()
df_c = df_c.drop(["survived"], axis=1)
feature_names = list(df_c.columns)
# The one-hot columns are boolean. Mixing bool with int/float columns makes a
# plain to_numpy() fall back to dtype=object, which cannot be turned into a
# torch tensor. Request a float array explicitly instead.
data = df_c.to_numpy(dtype=np.float64)

# Separate training and test sets using a random split without replacement:
# TRAIN_FRACTION of the row indices go to training, all remaining rows to test.
train_indices = np.random.choice(
    len(labels), int(TRAIN_FRACTION * len(labels)), replace=False
)
test_indices = list(set(range(len(labels))) - set(train_indices))
train_features = data[train_indices]
train_labels = labels[train_indices]
test_features = data[test_indices]
test_labels = labels[test_indices]

print(f"Train Features: {train_features.shape}, Train Labels: {train_labels.shape}")
print(f"Test Features: {test_features.shape}, Test Labels: {test_labels.shape}")

Train Features: (916, 12), Train Labels: (916,)
Test Features: (393, 12), Test Labels: (393,)


# Define Model

In [42]:
# Seed PyTorch's global RNG so the random weight initialisation performed when
# the model is constructed below is repeatable.
torch.manual_seed(RANDOM_SEED)


class DNN(nn.Module):
    """Fully connected classifier with two hidden ReLU layers and a softmax output.

    With the default arguments the architecture is 12 -> 12 -> 8 -> 2, i.e.
    identical to the original hard-coded network. The sizes are parameters so
    the model does not silently break when the number of input features or
    classes changes.

    Args:
        in_features: Width of each input row.
        hidden1: Number of units in the first hidden layer.
        hidden2: Number of units in the second hidden layer.
        n_classes: Number of output classes (width of the softmax output).
    """

    def __init__(self, in_features=12, hidden1=12, hidden2=8, n_classes=2):
        super().__init__()
        # Layers are created in forward order; each creation draws from the
        # global torch RNG, so the order matters for seeded reproducibility.
        self.linear1 = self._create_linear_layer((in_features, hidden1))
        self.relu1 = nn.ReLU()
        self.linear2 = self._create_linear_layer((hidden1, hidden2))
        self.relu2 = nn.ReLU()
        self.linear3 = self._create_linear_layer((hidden2, n_classes))
        # dim=1 normalises across the class axis of a (batch, classes) tensor.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return class probabilities for a 2-D batch ``x``.

        Args:
            x: Float tensor whose second dimension equals ``in_features``.

        Returns:
            Tensor of shape (batch, n_classes); each row sums to 1 because the
            final layer is a softmax. Note these are probabilities, not logits.
        """
        hidden = self.relu1(self.linear1(x))
        hidden = self.relu2(self.linear2(hidden))
        return self.softmax(self.linear3(hidden))

    def _create_linear_layer(self, shape):
        """Build an ``nn.Linear`` with custom initialisation.

        Args:
            shape: ``(in_features, out_features)`` pair.

        Returns:
            The initialised ``nn.Linear`` layer: Kaiming-uniform weights whose
            rows are then sorted in ascending order, and an all-zero bias.
        """
        layer = nn.Linear(shape[0], shape[1])
        nn.init.kaiming_uniform_(layer.weight)
        # Sort every weight row along its last dimension (ascending).
        # NOTE(review): the motivation for sorting is not stated in the
        # notebook; behaviour is kept as-is. Done under no_grad instead of
        # reassigning ``.data`` so autograd does not track the in-place write.
        with torch.no_grad():
            layer.weight.copy_(layer.weight.sort().values)
        nn.init.zeros_(layer.bias)
        return layer


# Build the network. Its layer weights are drawn during construction, so this
# call consumes the RNG state seeded at the top of this cell.
model = DNN()

# Train

In [None]:
# Training hyperparameters.
N_EPOCHS = 200        # full passes over the training set
LEARNING_RATE = 0.1   # Adam step size
BATCH_SIZE = 16       # samples per optimisation step
LOG_INTERVAL = 20     # presumably epochs between progress logs (confirm in the loop)


class ProbabilityNLLLoss(nn.Module):
    """Negative log-likelihood loss for models that output class probabilities.

    ``nn.CrossEntropyLoss`` applies ``log_softmax`` internally and therefore
    expects raw logits. ``DNN.forward`` already ends in a softmax, so using
    ``CrossEntropyLoss`` on its output would apply softmax twice and flatten
    the gradients. This module instead takes the log of the probabilities and
    applies the plain NLL loss, which is the cross-entropy of the model's
    actual predicted distribution.

    Args:
        eps: Lower clamp applied to the probabilities before the log, so an
            exact zero probability does not produce ``-inf``.
    """

    def __init__(self, eps=1e-12):
        super().__init__()
        self.eps = eps

    def forward(self, probs, target):
        """Return the mean NLL of ``target`` under ``probs``.

        Args:
            probs: (batch, n_classes) tensor of class probabilities.
            target: (batch,) integer tensor of class indices.
                Assumes class-index targets, as with the 0/1 ``survived``
                labels; verify against the training loop.
        """
        return nn.functional.nll_loss(torch.log(probs.clamp_min(self.eps)), target)


# Called exactly like the previous loss: criterion(model_output, target).
criterion = ProbabilityNLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
