In [1]:
import numpy as np
import pandas as pd
import argparse
import os.path as osp

import torch
import torch.nn.functional as func
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score


from models.mlp import MLP
from models.bin_mlp import binMLP
# from dataloaders.batch_dataloader import FCMatrixDataset
from dataloaders.dataloader import FCMatrixDataset

from torch.utils.data import Dataset, DataLoader, Subset


from utils import balanced_random_split_v2
from copy import deepcopy
from functools import partial


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def GCN_train(model, loader, train_dataset, batch_size, optimizer):
    model.train()

    loss_all = 0
    for data in loader:
        data = data.to(DEVICE)
        optimizer.zero_grad()
        output = model(data)
        loss = func.cross_entropy(output, data.y)
        loss.backward()
        loss_all += data.num_graphs * loss.item()
        optimizer.step()
    return loss_all / len(train_dataset)

def MLP_train(model, loader, train_dataset, batch_size, optimizer):
    model.train()

    loss_all = 0
    for data in loader:
        inputs = data[0].to(DEVICE)
        labels = data[1].to(DEVICE)

        optimizer.zero_grad()
        output = model(inputs)
       
        loss = func.cross_entropy(output, labels)
        loss.backward()
        loss_all += batch_size * loss.item()
        optimizer.step()
    return loss_all / len(train_dataset)

def MLP_test(model, loader, val_dataset, batch_size):
    model.eval()

    pred = []
    label = []
    loss_all = 0
    for data in loader:
        inputs = data[0].to(DEVICE)
        labels = data[1].to(DEVICE)

        output = model(inputs)
        loss = func.cross_entropy(output, labels)
        loss_all += batch_size * loss.item()
        # pred.append(func.softmax(output, dim=1).max(dim=1)[1])
        _, predicted = torch.max(output, 1)
        pred.append(predicted)
        label.append(labels)

    y_pred = torch.cat(pred, dim=0).cpu().detach().numpy()
    y_true = torch.cat(label, dim=0).cpu().detach().numpy()
    tn, fp, fn, tp = confusion_matrix(y_pred, y_true).ravel()
    auc_score = roc_auc_score(y_true, y_pred)

    epoch_acc = (tn + tp) / (tn + tp + fn + fp)
    return auc_score, epoch_acc, loss_all / len(val_dataset)

In [51]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

batch_size = 208
dropout = 0.5140820967430961
epochs = 25
hidden_dim_ratio = 0.5
layer_size = 256
learning_rate = 0.0006834901970109743
n_layers = 1
weight_decay = 0.001
input_features = 1485
seed = 42

np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():  # GPU seed
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.determinstic = True
    torch.backends.cudnn.benchmark = False

ds = "data/csv/severe_rds.csv"
data_dir = "data/fetched/25751"
mrmr_features = None

hidden_dims  = [int(layer_size * hidden_dim_ratio*(2**i)) for i in range(1, n_layers+1)]

udi = data_dir.split("/")[-1]
data_dir = data_dir + "/raw"
labels = np.genfromtxt(ds)
labels = labels[1:, 1]
dataset = FCMatrixDataset(ds, data_dir, udi, None, mrmr=mrmr_features)


model = MLP(input_features, hidden_dims, 2, dropout).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=99)

train_val, test = next(skf.split(labels, labels))


train_val_dataset, test_dataset = Subset(dataset, train_val), Subset(dataset, test)
train_val_labels = labels[train_val]
train_val_index = np.arange(len(train_val_dataset))

train, val, _, _ = train_test_split(train_val_index, train_val_labels, test_size=0.11, shuffle=True, stratify=train_val_labels)
train_dataset, val_dataset = Subset(train_val_dataset, train), Subset(train_val_dataset, val)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

min_v_loss = np.inf
best_val_acc = 0
for epoch in range(epochs):
    t_loss = MLP_train(model, train_loader, train_dataset, batch_size, optimizer)
    _, val_acc, v_loss = MLP_test(model, val_loader, val_dataset, batch_size)

    if min_v_loss > v_loss:
    # if best_val_acc < val_acc:
        min_v_loss = v_loss
        best_val_acc = val_acc
        best_model = deepcopy(model)
test_auc, test_acc, _ = MLP_test(best_model, test_loader, test_dataset, batch_size)
print('CV: {:03d}, Epoch: {:03d}, Val Loss: {:.5f}, Val BAC: {:.5f}, Test BAC: {:.5f}, TEST AUC: {:.5f}'.format(0 + 1, epoch + 1, min_v_loss, best_val_acc, test_auc,
                                        test_acc))
        



CV: 001, Epoch: 025, Val Loss: 9.48188, Val BAC: 0.69231, Test BAC: 0.53046, TEST AUC: 0.53390


In [52]:
import shap
import sklearn


X = np.array([np.array(i[0]) for i in dataset])
print(X.T.shape)
X = torch.tensor(np.array([np.array(i[0]) for i in dataset])).to(DEVICE)
# X = DataLoader(dataset, batch_size=48, shuffle=True)


explainer = shap.DeepExplainer(best_model, X)
shap_values = explainer.shap_values(X)

print(shap_values[0].shape)
print(shap_values[1].shape)

# visualize the first prediction's explanation
# shap.plots.waterfall(shap_values[0])
# shap.summary_plot(shap_values, X)

# shap.plots.bar(shap_values, max_display=10)

score = np.mean(np.abs(shap_values[0]), axis=0)
print(score)
idx = np.argmax(shap_values[0], axis=0)
print(idx)
score2 = shap_values[0][idx]
print(score2)

(1485, 236)
(1485, 2)
(1485, 2)
[0.00983604 0.00961599]
[1062  228]
[[ 0.06873799  0.01801712]
 [-0.01835313  0.08292013]]


In [71]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

batch_size = 208
dropout = 0.5140820967430961
epochs = 25
hidden_dim_ratio = 0.5
layer_size = 256
learning_rate = 0.0006834901970109743
n_layers = 1
weight_decay = 0.001
seed = 42

input_features = 55
mrmr_features = np.array([1140, 536, 223, 907, 1449, 499, 1293, 45, 135, 1440, 879, 1384, 1210, 1316, 122, 22, 492, 638, 765, 1027, 1464, 501, 1462, 395, 26, 1079, 70, 425, 1403, 1409, 1318, 886, 1459, 1448, 939, 1163, 547, 10, 413, 676, 131, 216, 942, 1136, 1386, 232, 1455, 1337, 814, 139, 392, 1376, 1382, 471, 656]
        )

np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():  # GPU seed
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.determinstic = True
    torch.backends.cudnn.benchmark = False

ds = "data/csv/severe_rds.csv"
data_dir = "data/fetched/25751"

hidden_dims  = [int(layer_size * hidden_dim_ratio*(2**i)) for i in range(1, n_layers+1)]

udi = data_dir.split("/")[-1]
data_dir = data_dir + "/raw"
labels = np.genfromtxt(ds)
labels = labels[1:, 1]
dataset = FCMatrixDataset(ds, data_dir, udi, mapping=None, mrmr=mrmr_features)


model = MLP(input_features, hidden_dims, 2, dropout).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=99)

train_val, test = next(skf.split(labels, labels))


train_val_dataset, test_dataset = Subset(dataset, train_val), Subset(dataset, test)
train_val_labels = labels[train_val]
train_val_index = np.arange(len(train_val_dataset))

train, val, _, _ = train_test_split(train_val_index, train_val_labels, test_size=0.11, shuffle=True, stratify=train_val_labels)
train_dataset, val_dataset = Subset(train_val_dataset, train), Subset(train_val_dataset, val)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

min_v_loss = np.inf
best_val_acc = 0
for epoch in range(epochs):
    t_loss = MLP_train(model, train_loader, train_dataset, batch_size, optimizer)
    _, val_acc, v_loss = MLP_test(model, val_loader, val_dataset, batch_size)

    if min_v_loss > v_loss:
    # if best_val_acc < val_acc:
        min_v_loss = v_loss
        best_val_acc = val_acc
        best_model = deepcopy(model)
test_auc, test_acc, _ = MLP_test(best_model, test_loader, test_dataset, batch_size)
print('CV: {:03d}, Epoch: {:03d}, Val Loss: {:.5f}, Val BAC: {:.5f}, Test BAC: {:.5f}, TEST AUC: {:.5f}'.format(0 + 1, epoch + 1, min_v_loss, best_val_acc, test_auc,
                                        test_acc))
        



CV: 001, Epoch: 025, Val Loss: 8.33144, Val BAC: 0.61538, Test BAC: 0.83937, TEST AUC: 0.83898


In [72]:
import shap
import sklearn


X = torch.tensor(np.array([np.array(i[0]) for i in dataset])).to(DEVICE)
print(X.shape)
print(X[0])
# X = DataLoader(dataset, batch_size=48, shuffle=True)


explainer = shap.DeepExplainer(best_model, X)
print(best_model)
shap_values = explainer.shap_values(X)

# get index of most important feature from shap_values
score = np.mean(np.abs(shap_values[0]), axis=0)
print(score)
idx = np.argsort(shap_values[0], axis=0)[::-1][:5]
idx = idx[:,0]
score2 = shap_values[0][idx]
print(score2)

print(score2[:,0])

torch.Size([236, 55])
tensor([ 9.6208, -8.7138, -2.2577, -7.6152,  1.8930,  1.8710,  0.1609, -2.0138,
        -0.1155, -2.0444,  3.1087, -1.8756,  0.9936,  1.7838, -2.7766,  2.9736,
         2.6712, -7.5969,  0.1521, -2.2570, -1.7796,  1.7654,  0.9981,  3.2211,
        -1.9279,  4.5055, 15.2677,  2.9986, -3.1585,  4.1001,  3.7219,  0.8900,
        -3.2268, -3.5454,  7.5293,  4.6354,  6.9587, -2.0405, -7.3531, -0.7177,
        -3.7782, -4.8360, -9.2442,  7.7194,  2.5564,  5.3103,  0.5323, -1.9091,
         7.4302,  0.1335,  5.1687, -0.6371, -3.1070,  2.1771, -6.7912],
       device='cuda:0')
MLP(
  (nn): ModuleList(
    (0): Linear(in_features=55, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5140820967430961, inplace=False)
    (3): Linear(in_features=256, out_features=2, bias=True)
  )
)


In [69]:
idx[:,0]

array([ 3, 25, 15, 10, 52])