# setup

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.optim as optim
from torch import nn
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 

# Load the combined NAFL table; the path is relative to the notebook's working directory.
data = pd.read_csv("../clean_data/nafl/combined.large.nafl.csv")

In [None]:
# Build the feature matrix X and the regression target Y, both indexed by StudyID.
#
# NOTE(review): only 'DaysUntilFirstProgression' and 'Outcome' are removed from
# the features. If the table also carries a 'Censored' column it remains in X;
# confirm that is intended.
X = data.drop(columns=['DaysUntilFirstProgression', 'Outcome']).set_index('StudyID')
Y = data[['StudyID', 'DaysUntilFirstProgression']].set_index('StudyID')

In [None]:
# Choose the compute device: CUDA when a GPU is visible to torch, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Using {device} device")
print(f"Shape of X: {X.shape}. Shape of Y: {Y.shape}.")

In [None]:
# Columns that get standardized later: the BMI/age columns plus every column
# whose name contains the substring 'Lab' (substring match, not a prefix match).
lab_feat = [column for column in X.columns if 'Lab' in column]
numerical_feat = ['mean_BMI', 'last_BMI', 'FirstNAFL.Age.90'] + lab_feat

# establish the model

In [None]:
# curate the dataset
class MAFLDDataset(Dataset): # must contain init, len, and getitem
    """In-memory dataset pairing a feature matrix with its targets.

    Both inputs are copied into ``float32`` tensors once at construction, so
    ``__getitem__`` is plain tensor indexing.

    Args:
        X: features; a tensor, NumPy array or nested sequence whose first
            dimension indexes samples.
        y: targets, indexed along the first dimension like ``X``.
    """

    def __init__(self, X, y):
        self.X = self._as_float_tensor(X)
        self.y = self._as_float_tensor(y)

    @staticmethod
    def _as_float_tensor(values):
        """Return an independent float32 tensor copy of ``values``."""
        if isinstance(values, torch.Tensor):
            # torch.tensor(existing_tensor) emits a UserWarning; detach/clone
            # is the recommended way to copy-construct from a tensor.
            return values.detach().clone().to(torch.float32)
        return torch.tensor(values, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# dataset = MAFLDDataset(X_torch, Y_torch)
# train_loader = DataLoader(dataset, batch_size=64, shuffle=True) # batch size 64

In [None]:
# define by subclassing nn.Module and initialize the neural network layers in __init__.
class NeuralNetwork(nn.Module):
    """Fully connected regressor: five ReLU hidden layers and a linear output.

    Args:
        input_dim: number of input features. When omitted, the width of the
            notebook-level feature matrix ``X`` is used, so the original
            ``NeuralNetwork()`` call keeps working.
    """

    def __init__(self, input_dim=None):
        super().__init__() # inherit init from parent class
        if input_dim is None:
            # Backward-compatible default: size the first layer from global X.
            input_dim = X.shape[1]
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1), # no activation follows this layer
        )

    def forward(self, x):
        """Return one prediction per input row (last dimension of size 1)."""
        pred = self.linear_relu_stack(x)
        return pred

In [None]:
# create an instance of NeuralNetwork and move its parameters to the selected device
model = NeuralNetwork().to(device)

# define loss function and optimizer
# mean squared error: the network is fitted as a regressor
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3) # start with this baseline learning rate

# train the model on the full dataset

In [None]:
num_epochs = 30 # typically between 10-50 for small datasets

# NOTE(review): `train_loader` is only created in a commented-out line of an
# earlier cell; it has to be defined before this cell can run.
for epoch in range(num_epochs):
    for batch_X, batch_y in train_loader:
        # move data to device
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)

        # Reshape labels if needed
        # batch_y = batch_y.unsqueeze(1)  # Make sure batch_y is (batch_size, 1)

        # initialize the gradients to zero
        optimizer.zero_grad()

        # forward pass
        outputs = model(batch_X)

        # compute loss
        loss = loss_fn(outputs, batch_y)

        # gradient descent and update the weights
        loss.backward()
        optimizer.step()

    # Report once per epoch (loss of the epoch's last mini-batch). This print
    # used to sit outside the loop, so only the final epoch was ever shown.
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

## evaluate performance on predicting binary outcome

In [None]:
# evaluate
# NOTE(review): `X_torch` is not defined in any visible cell; confirm where it comes from.
X_input = torch.tensor(X_torch, device=device, dtype=torch.float32)
with torch.no_grad():  # inference only: do not record an autograd graph
    Y_hat = model(X_input)

predictions = (Y_hat >= 0.5).float()  # 0 if <0.5, 1 if >=0.5
print(f'Predicted classes: {predictions}')

In [None]:
# check performance
# NOTE(review): Y is built from 'DaysUntilFirstProgression' earlier in the
# notebook, while these metrics expect class labels; confirm the intended target.
predicted_labels = predictions.cpu().detach().numpy()

print(confusion_matrix(Y, predicted_labels))
print(classification_report(Y, predicted_labels))

# standardize the input data

In [None]:
# split into train/test
# 70/30 split; random_state pins the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [None]:
# write a function to only standardize the numerical columns and reattach to the rest of the dataframe
scaler = StandardScaler()

def standardize_numerical(dataframe, num_feat=numerical_feat, training_set=True):
    """Standardize the numerical columns of ``dataframe``; leave the rest as-is.

    Args:
        dataframe: pandas DataFrame containing every column named in ``num_feat``.
        num_feat: names of the columns to standardize.
        training_set: when True the module-level ``scaler`` is (re)fitted on
            ``dataframe`` before transforming; when False the already fitted
            statistics are reused, so held-out data never influences them.

    Returns:
        A new DataFrame with the same index and the same column order as
        ``dataframe``. Preserving the order matters: later cells pair the
        model inputs / SHAP columns with ``X.columns`` and ``X_test`` by
        position.
    """
    if training_set:
        scaled = scaler.fit_transform(dataframe[num_feat])
    else:
        scaled = scaler.transform(dataframe[num_feat])

    scaled_df = pd.DataFrame(scaled, columns=num_feat, index=dataframe.index)
    cat = dataframe.drop(columns=num_feat)
    processed = pd.concat([scaled_df, cat], axis=1)

    # concat puts the scaled columns first; restore the caller's column order
    return processed[list(dataframe.columns)]

In [None]:
# standardize our features
# fit the scaler on the training split only, then reuse those statistics for the test split
X_train_scaled = standardize_numerical(X_train, training_set=True)
X_test_scaled = standardize_numerical(X_test, training_set=False)

In [None]:
# Standardize the target, then wrap scaled features and scaled target in a DataLoader.

# scale the y vector as well (statistics come from the training targets only)
scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train.to_numpy().reshape(-1, 1))
y_torch_train_scaled = torch.from_numpy(y_train_scaled)

# features: DataFrame -> float32 NumPy array -> tensor
X_numpy_train = X_train_scaled.values.astype('float32')
X_torch_train = torch.from_numpy(X_numpy_train)

train_dataset_scaledy = MAFLDDataset(X_torch_train, y_torch_train_scaled)
train_data_scaledy = DataLoader(
    train_dataset_scaledy,
    batch_size=64,
    shuffle=True,
)

In [None]:
# convert the scaled test features into a float32 tensor (no DataLoader is built here)
X_numpy_test = X_test_scaled.values.astype('float32') # turn into a numpy array
X_torch_test = torch.from_numpy(X_numpy_test)

# train model on train/test split

In [None]:
# train model for 50 epochs on the scaled features / scaled target
model.train()
num_epochs = 50 # typically between 10-50 for small datasets

for epoch in range(num_epochs):
    print(f'Training Epoch [{epoch+1}/{num_epochs}]')
    for batch_X, batch_y in train_data_scaledy:
        # move data to device; the loader already yields tensors, so they are
        # not re-wrapped with torch.tensor (which copies and warns)
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)

        # initialize the gradients to zero
        optimizer.zero_grad()

        # forward pass
        outputs = model(batch_X)

        # compute loss
        loss = loss_fn(outputs, batch_y)

        # gradient descent and update the weights
        loss.backward()
        optimizer.step()

    # Report once per epoch (loss of the epoch's last mini-batch). This print
    # used to sit outside the loop, so only the final epoch was ever shown.
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

In [None]:
# display the shape of the held-out targets
y_test.shape

In [None]:
# display the shape of the test feature tensor
X_torch_test.shape

In [None]:
# run model on test data
model.eval()
with torch.no_grad():  # inference only: do not record an autograd graph
    y_hat_test_scaled = model(X_torch_test.float().to(device)) # run on testing data
y_hat_test_scaled = y_hat_test_scaled.cpu().detach()
# undo the target standardization so the error is on the original target scale
y_pred_rescaled = scaler_y.inverse_transform(y_hat_test_scaled.numpy())

# evaluate via MSE
print(mean_squared_error(y_test, y_pred_rescaled))
# old: 242657.46875

In [None]:
# mean absolute error between the test targets and the rescaled predictions
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(y_test, y_pred_rescaled))
# old: 369.0030822753906

In [None]:
# largest target value in the test split
y_test.max()

# save the model weights

In [None]:
# save only the parameters (state_dict); loading them requires rebuilding the same architecture first
torch.save(model.state_dict(), "numeric_nn_scaled_x_and_y.pth")

# shap scores

In [None]:
import shap
def model_wrapper(array):
    """Run the torch model on tabular input and return rescaled predictions.

    Plain ``f(array) -> array`` wrapper around ``model`` for explainers that
    expect such a callable.

    Args:
        array: pandas DataFrame or NumPy array of model inputs, one row per
            sample (presumably already standardized like the training data;
            verify against the caller).

    Returns:
        NumPy array of predictions after ``scaler_y.inverse_transform``, i.e.
        on the scale ``scaler_y`` was fitted on.
    """
    if isinstance(array, pd.DataFrame):
        array = array.to_numpy()
    array = torch.tensor(array.astype('float32')).to(device)
    model.eval()
    with torch.no_grad():  # inference only: do not record an autograd graph
        y_hat_test_scaled = model(array)
    y_hat_test_scaled = y_hat_test_scaled.cpu().detach()
    y_pred_rescaled = scaler_y.inverse_transform(y_hat_test_scaled.numpy())
    return y_pred_rescaled

# Take the names from the matrix that is actually fed to the model and to SHAP,
# so each name matches its SHAP column even if scaling changed the column order.
feature_names = X_test_scaled.columns

In [None]:
# DeepExplainer: the scaled training rows act as the background sample and the
# scaled test rows are the ones being explained.
background_tensor = torch.tensor(X_train_scaled.to_numpy().astype(np.float32)).to(device)
explained_tensor = torch.tensor(X_test_scaled.to_numpy().astype(np.float32)).to(device)

explainer = shap.DeepExplainer(model, background_tensor)
shap_values = explainer.shap_values(explained_tensor)

In [None]:
# inspect the shape of the SHAP output
shap_values.shape

In [None]:
# drop the last axis of shap_values (presumably the single model-output axis; confirm against the shape shown above)
shap_values_squeezed = shap_values.squeeze(-1)

In [None]:
# list the feature columns whose name contains "Gender_Legal"
print([x for x in X.columns if "Gender_Legal" in x])

In [None]:
# Features to leave out of the SHAP plots. This list was previously commented
# out, which made every line below fail with a NameError.
excluded_features = ['DaysUntilFirstProgression', 'Gender_Legal_Sex_Unknown-U']  # Replace with your actual feature names

# Positions (in SHAP column order) of the features to keep
indices_to_keep = [i for i, name in enumerate(feature_names) if name not in excluded_features]

# Matching feature names, in the same order as indices_to_keep
feature_names_subset = [feature_names[i] for i in indices_to_keep]

# Slice the SHAP values array (assuming it's already squeezed to shape (n_samples, n_features))
shap_values_subset = shap_values_squeezed[:, indices_to_keep]
X_test_subset = X_test[feature_names_subset]

In [None]:
# beeswarm summary of all features; show=False keeps the figure open so a title can be added
# shap.summary_plot(shap_values_subset, X_test_subset, feature_names=feature_names_subset)
import matplotlib.pyplot as plt
shap.summary_plot(shap_values_squeezed, X_test, feature_names=X.columns, show=False)
plt.title('Time-to-event NN Feature Importance')
# plt.savefig('results/shap_beeswarm_plot.png', format='png', dpi=300)

In [None]:
# display the test-split features
X_test

In [None]:
# first ten feature names
feature_names[:10]

In [None]:
# plot with human readable names
# NOTE(review): these labels are hand-written and presumably ordered from most
# to least influential feature (the same list is paired with importance-sorted
# indices further down); re-check them whenever the model is retrained.
features = ['Reticulocyte Blood Test',
'Prealbumin Blood Test',
'Vitamin B12 Blood Test',
'Alkaline Phosphatase Blood Test',
'Encounter for immunization',
'White Blood Cell Blood Test',
'Cholesterol Test',
'Iopamidol 76% Intravenous Solution',  # was '76 \%': invalid escape that left a backslash in the label
'Calculated low-density Lipoprotein Blood Test',
'Triglyceride Lab Test',
'Ondansetron 8 mg Disintegrating Tablet',
'Urine Volume Lab Test',
'Unsaturated Iron Binding Blood Test',
'Mean Corpuscular Hemoglobin Concentration Blood Test',
'Hyperlipidemia, unspecified',
'Very Low-density Lipoprotein Blood Test',
'Iron Saturation Blood Test',
'Flovent Hfa 110 mcg/Actuation Aerosol Inhaler',
'Low-density Lipoprotein Blood Test',
'Red Blood Cell Blood Test']

# Select the columns with the largest mean |SHAP| (descending) so that column k
# lines up with label k. Taking the first 20 columns of the matrix instead
# would attach these labels to unrelated features.
importance_order = np.argsort(np.abs(shap_values_squeezed).mean(axis=0))[::-1]
top_columns = importance_order[:len(features)]

shap_values_top = shap_values_squeezed[:, top_columns]
x_test_top = X_test.iloc[:, top_columns]
features_top = features[:20]
# features_top = X.columns[:20]

plt.figure(figsize=(18, 7))
shap.summary_plot(shap_values_top, x_test_top, feature_names=features_top, show=False, max_display=10)
plt.xlabel('SHAP value (impact on model output)')
plt.gca().tick_params(axis='y', labelsize=8)
plt.title('Time-to-event Neural Network Influential Features')
plt.tight_layout()
# plt.savefig('results/quant_nn_shap.png', dpi=300)

In [None]:
# Rank the features by mean |SHAP| and list the 20 highest-ranked names.
shap_mean_abs = np.mean(np.abs(shap_values_squeezed), axis=0)

# Label each score with its column name so sorting keeps names attached
shap_series = pd.Series(data=shap_mean_abs, index=X.columns)

ranked_series = shap_series.sort_values(ascending=False)
top20_features = list(ranked_series.index[:20])
top20_features

In [None]:
# bar chart of mean |SHAP| per feature, written to results/shap_bar_plot.png
# plt.figure(figsize=(16, 6))
shap.summary_plot(shap_values_squeezed, X_test, plot_type="bar", feature_names=feature_names, show=False)
plt.title('Time-to-event NN Feature Importance')
plt.xlabel('mean(|SHAP value|) (average impact on output)')
plt.savefig('results/shap_bar_plot.png', format='png', dpi=300, bbox_inches='tight')

In [None]:
# layered violin plot limited to 20 displayed features, written to results/shap_violin_plot.png
shap.plots.violin(shap_values_squeezed, X_test, plot_type="layered_violin", feature_names=feature_names, max_display=20, show=False)
plt.title('Time-to-event NN Feature Importance')
plt.savefig('results/shap_violin_plot.png', format='png', dpi=300, bbox_inches='tight')

In [None]:
# shape of the squeezed SHAP array (compare with X_test.shape in the next cell)
shap_values_squeezed.shape

In [None]:
# shape of the test-split features
X_test.shape

In [None]:
# Rank every feature by mean |SHAP|, most influential first.
mean_abs_shap = np.mean(np.abs(shap_values_squeezed), axis=0)

# argsort is ascending, so flip it to get descending importance
top_indices = np.argsort(mean_abs_shap)[::-1]

# (feature name, mean |SHAP|) pairs in ranked order
top_features = []
for column_position in top_indices:
    top_features.append((feature_names[column_position], mean_abs_shap[column_position]))

In [None]:
# display the ranked (name, mean |SHAP|) pairs
top_features

In [None]:
# names of the first ten entries of top_features
top_features_names = [x[0] for x in top_features[:10]]

In [None]:
# display those ten names
top_features_names

In [None]:
# Signed ranking: average each feature's SHAP values (no absolute value) and
# pick the ten most positive and the ten most negative means.
mean_shap = shap_values_squeezed.mean(axis=0)

ascending_order = np.argsort(mean_shap)
top_positive_indices = ascending_order[-10:]  # most positive
top_negative_indices = ascending_order[:10]   # most negative

# (name, mean SHAP) pairs; positives are listed from largest to smallest
top_positive_features = [(feature_names[i], mean_shap[i]) for i in top_positive_indices[::-1]]
top_negative_features = [(feature_names[i], mean_shap[i]) for i in top_negative_indices]

In [None]:
# display the ten most positive mean-SHAP features
top_positive_features

In [None]:
# display the ten most negative mean-SHAP features
top_negative_features

In [None]:
# names only, for the most positive features
pos_names = [x[0] for x in top_positive_features]
pos_names

In [None]:
# names only, for the most negative features
neg_names = [x[0] for x in top_negative_features]
neg_names

In [None]:
# cache the squeezed SHAP array on disk so it can be reloaded by the next cell
import pickle
filename = 'results/quant_nn_shap_values.pkl'
with open(filename, 'wb') as file:
    # Use pickle.dump to serialize and write the data
    pickle.dump(shap_values_squeezed, file)

In [None]:
# reload the cached SHAP array
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote itself
file_path = 'results/quant_nn_shap_values.pkl'
with open(file_path, 'rb') as file:
    shap_values_squeezed = pickle.load(file)

In [None]:
# rank features by mean absolute SHAP value (repeats the earlier ranking cell, now on the reloaded array)
# Calculate mean absolute SHAP value for each feature
mean_abs_shap = np.abs(shap_values_squeezed).mean(axis=0)

# Get indices of top features
top_indices = np.argsort(mean_abs_shap)[::-1]  # descending order

# Get corresponding feature names and importance values
top_features = [(feature_names[i], mean_abs_shap[i]) for i in top_indices]

In [None]:
# display the ranked (name, mean |SHAP|) pairs
top_features

In [None]:
# display all feature column names
X.columns

In [None]:
# plot with human readable names
# NOTE(review): hand-written display labels; they are later paired by position
# with importance-sorted columns, so presumably label k names the k-th most
# influential feature. Verify after retraining.
top_labels = ['Reticulocyte Blood Test', 
'Prealbumin Blood Test', 
'Vitamin B12 Blood Test', 
'Alkaline Phosphatase Blood Test', 
'Encounter for immunization', 
'White Blood Cell Blood Test', 
'Cholesterol Test', 
'Iopamidol 76% Intravenous Solution', 
'Calculated low-density Lipoprotein Blood Test', 
'Triglyceride Lab Test',
'Ondansetron 8 mg Disintegrating Tablet',
'Urine Volume Lab Test',
'Unsaturated Iron Binding Blood Test',
'Mean Corpuscular Hemoglobin Concentration Blood Test',
'Hyperlipidemia, unspecified',
'Very Low-density Lipoprotein Blood Test',
'Iron Saturation Blood Test',
'Flovent Hfa 110 mcg/Actuation Aerosol Inhaler',
'Low-density Lipoprotein Blood Test',
'Red Blood Cell Blood Test']
# shap_values_top = shap_values_squeezed[:, :20]
# x_test_top = X_test.iloc[:, :20]
# # features_top = features[:20]
# features_top = X.columns[:20]

# plt.figure(figsize=(18, 7))
# shap.summary_plot(shap_values_top, x_test_top, feature_names=features_top, show=False, max_display=10)
# plt.xlabel('SHAP value (impact on model output)')
# plt.gca().tick_params(axis='y', labelsize=8)
# plt.title('Time-to-event Neural Network Influential Features')
# plt.tight_layout()
# plt.savefig('results/quant_nn_shap.png', dpi=300)

In [None]:
# Column positions of the max_display most influential features (by mean |SHAP|).
shap_importance = np.mean(np.abs(shap_values_squeezed), axis=0)

max_display = 10  # SHAP default

# argsort is ascending; reverse it for descending importance, then keep the head
feature_order = np.argsort(shap_importance)[::-1]
top_indices = feature_order[:max_display]
top_indices

In [None]:
# inspect the feature column at position 29
X.iloc[:, 29]

In [None]:
# beeswarm plot restricted to the top_indices columns, saved to results/quant_nn_shap.png
# NOTE(review): top_labels holds 20 names while top_indices holds max_display
# columns; confirm the first labels line up with the selected columns.
shap.summary_plot(
    shap_values_squeezed[:, top_indices],
    X_test.iloc[:, top_indices],
    feature_names=top_labels,
    show=False
)
plt.xlabel('SHAP value (impact on model output)')
plt.gca().tick_params(axis='y', labelsize=11)
plt.title('Time-to-event Neural Network Influential Features')
plt.tight_layout()
plt.savefig('results/quant_nn_shap.png', dpi=300)

## sanity check: training and testing on unscaled data

In [None]:
# NOTE(review): X_torch / Y_torch are not defined in any visible cell; confirm where they come from.
# This rebinds X_train / X_test / y_train / y_test, which earlier cells created from the DataFrames.
X_train, X_test, y_train, y_test = train_test_split(X_torch, Y_torch, test_size=0.3, random_state=42)

In [None]:
# wrap the unscaled training split in a shuffled DataLoader (batches of 64)
train_dataset = MAFLDDataset(X_train, y_train)
train_data = DataLoader(train_dataset, shuffle=True, batch_size=64)

In [None]:
num_epochs = 30 # typically between 10-50 for small datasets

# NOTE(review): this reuses the existing `model` and `optimizer` objects rather
# than creating fresh ones; confirm the sanity check should start from the
# current weights.
for epoch in range(num_epochs):
    for batch_X, batch_y in train_data:
        # move data to device; the loader already yields tensors, so they are
        # not re-wrapped with torch.tensor (which copies and warns)
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)

        # initialize the gradients to zero
        optimizer.zero_grad()

        # forward pass
        outputs = model(batch_X)

        # compute loss
        loss = loss_fn(outputs, batch_y)

        # gradient descent and update the weights
        loss.backward()
        optimizer.step()

    # Report once per epoch (loss of the epoch's last mini-batch). This print
    # used to sit outside the loop, so only the final epoch was ever shown.
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

In [None]:
# run model on test data
with torch.no_grad():  # inference only: do not record an autograd graph
    Y_hat_test = model(X_test.float().to(device)) # run on testing data

# evaluate via MSE
print(mean_squared_error(y_test, Y_hat_test.cpu().detach().numpy()))

In [None]:
# display the raw test predictions
Y_hat_test

# tweaking model design

In [None]:
# original model
class NeuralNetwork(nn.Module):
    """Baseline fully connected regressor (X.shape[1] -> 1024 -> 512 -> 128 -> 64 -> 32 -> 1)."""

    def __init__(self):
        super().__init__()
        # Layer widths from the input (width of the notebook-level X) to the last hidden layer.
        widths = [X.shape[1], 1024, 512, 128, 64, 32]

        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        # Output layer: a single unit with no activation after it.
        stack.append(nn.Linear(widths[-1], 1))

        self.linear_relu_stack = nn.Sequential(*stack)

    def forward(self, x):
        return self.linear_relu_stack(x)

In [None]:
# adding dropout, switching to LeakyReLU, adding batchnorm layers
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__() # inherit init from parent class
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(X.shape[1], 512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(),
            nn.Dropout(0.2),

            nn.Linear(512, 256),
            nn.LeakyReLU(),

            nn.Linear(256, 128),
            nn.Dropout(0.2),
            nn.LeakyReLU(),

            nn.Linear(128, 64),
            nn.LeakyReLU(),

            nn.Linear(64, 1)
        )

    def forward(self, x):
        pred = self.linear_relu_stack(x)
        return pred

In [None]:
# attempting skip connections
class ResidualBlock(nn.Module):
    """Two Linear+BatchNorm stages whose output is added back onto the input.

    The width is the same on both sides (``dim`` in, ``dim`` out), which is
    what makes the element-wise skip addition possible.
    """

    def __init__(self, dim):
        super().__init__()
        stages = [
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim),
            nn.ReLU(),
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim),
        ]
        self.block = nn.Sequential(*stages)
        self.relu = nn.ReLU()

    def forward(self, x):
        transformed = self.block(x)
        # skip connection: add the untouched input before the final activation
        return self.relu(x + transformed)

class NeuralNetwork(nn.Module):
    """Residual regressor: linear projection to 256 units, three residual blocks, linear output.

    Args:
        input_dim: number of input features.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_width = 256

        self.input_layer = nn.Linear(input_dim, hidden_width)

        self.resblock1 = ResidualBlock(hidden_width)
        self.resblock2 = ResidualBlock(hidden_width)
        self.resblock3 = ResidualBlock(hidden_width)

        self.output_layer = nn.Linear(hidden_width, 1)

    def forward(self, x):
        hidden = self.input_layer(x)
        for res_block in (self.resblock1, self.resblock2, self.resblock3):
            hidden = res_block(hidden)
        return self.output_layer(hidden)

In [None]:
# Experiment grid: every combination of the options below becomes one config dict.
from itertools import product

search_space = dict(
    hidden_sizes=[[512, 128], [1024, 512, 128]],
    activation=["relu", "leaky_relu"],
    dropout=[0.0, 0.2],
    use_batchnorm=[True, False],
    learning_rate=[1e-3, 1e-4],
)

# Cartesian product of all option lists; the last key varies fastest
all_configs = [
    {name: choice for name, choice in zip(search_space, combination)}
    for combination in product(*search_space.values())
]

In [None]:
import torch.nn as nn

def get_activation(name):
    """Return a new activation module for ``name`` ("relu" or "leaky_relu").

    Raises:
        KeyError: if ``name`` is not one of the supported activations.
    """
    factories = {
        "relu": nn.ReLU,
        "leaky_relu": lambda: nn.LeakyReLU(0.01),
    }
    build = factories[name]
    return build()

class FlexibleNetwork(nn.Module):
    """Configurable MLP regressor assembled from a list of hidden widths.

    Args:
        input_dim: number of input features.
        hidden_sizes: width of each hidden layer, in order.
        activation: activation name understood by ``get_activation``.
        dropout: dropout probability; no dropout layer is added unless it is > 0.
        use_batchnorm: insert ``BatchNorm1d`` after each hidden linear layer.
    """

    def __init__(self, input_dim, hidden_sizes, activation, dropout, use_batchnorm):
        super().__init__()
        widths = [input_dim, *hidden_sizes]

        modules = []
        # per hidden layer: linear -> (batchnorm) -> activation -> (dropout)
        for fan_in, fan_out in zip(widths, widths[1:]):
            modules.append(nn.Linear(fan_in, fan_out))
            if use_batchnorm:
                modules.append(nn.BatchNorm1d(fan_out))
            modules.append(get_activation(activation))
            if dropout > 0.0:
                modules.append(nn.Dropout(dropout))

        # single-unit output layer
        modules.append(nn.Linear(widths[-1], 1))
        self.model = nn.Sequential(*modules)

    def forward(self, x):
        return self.model(x)


In [None]:
# training loop and evaluator
def train_model(model, train_loader, val_loader, lr, device="cpu", epochs=10):
    """Train ``model`` with Adam on an MSE objective and return its final validation loss.

    Args:
        model: ``nn.Module`` to train; it is moved to ``device`` in place.
        train_loader: iterable of ``(x, y)`` tensor batches used for the updates.
        val_loader: iterable of ``(x, y)`` tensor batches used for evaluation.
        lr: Adam learning rate.
        device: device the model and batches are moved to.
        epochs: number of passes over ``train_loader``.

    Returns:
        The validation loss after the last epoch, computed as the mean of the
        per-batch MSE values.
    """
    model.to(device)
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    val_losses = []

    def batch_loss(x, y):
        pred = model(x)
        # Give the target the prediction's shape. Squeezing the prediction
        # instead makes a (B,) prediction meet a (B, 1) target, which MSELoss
        # silently broadcasts to (B, B) and yields a wrong loss.
        return loss_fn(pred, y.reshape(pred.shape))

    for epoch in range(epochs):
        model.train()
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            loss = batch_loss(x, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Evaluate
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for x, y in val_loader:
                x, y = x.to(device), y.to(device)
                val_loss += batch_loss(x, y).item()
        val_losses.append(val_loss / len(val_loader))
    return val_losses[-1]  # return final validation loss


In [None]:
# run experiments
def run_experiments(X_train, y_train, X_val, y_val, *, device="cpu", epochs=10):
    """Train one ``FlexibleNetwork`` per entry of ``all_configs`` and rank them.

    Args:
        X_train, y_train: training features and targets.
        X_val, y_val: validation features and targets.
        device: forwarded to ``train_model`` (defaults to its own default, "cpu").
        epochs: forwarded to ``train_model``.

    Returns:
        List of ``(config, final_validation_loss)`` tuples sorted by ascending loss.
    """
    from torch.utils.data import DataLoader

    # The data is the same for every configuration, so convert it to tensors once.
    train_set = MAFLDDataset(X_train, y_train)
    val_set = MAFLDDataset(X_val, y_val)

    results = []
    for config in all_configs:
        print(f"Running config: {config}")
        model = FlexibleNetwork(
            input_dim=X_train.shape[1],
            hidden_sizes=config["hidden_sizes"],
            activation=config["activation"],
            dropout=config["dropout"],
            use_batchnorm=config["use_batchnorm"]
        )

        train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
        val_loader = DataLoader(val_set, batch_size=64)

        val_loss = train_model(
            model,
            train_loader,
            val_loader,
            lr=config["learning_rate"],
            device=device,
            epochs=epochs,
        )
        results.append((config, val_loss))
        print(f"Validation loss: {val_loss:.4f}")

    return sorted(results, key=lambda x: x[1])  # sorted by val loss


In [None]:
# run the whole grid; the test split is passed in as the validation set
run_experiments(X_train, y_train, X_test, y_test)

In [None]:
# split into train/test
# NOTE(review): X_torch / Y_torch are not defined in any visible cell; confirm where they come from.
X_train, X_test, y_train, y_test = train_test_split(X_torch, Y_torch, test_size=0.3, random_state=42)

train_dataset = MAFLDDataset(X_train, y_train)
train_data = DataLoader(train_dataset, shuffle=True, batch_size=64)

In [None]:
# number of feature columns in the training split
X_train.shape[1]

In [None]:
# shape of the training targets
y_train.shape