In [2]:
import numpy as np
import json
import matplotlib.pyplot as plt

In [4]:
def load_json_lines(path):
    """Parse a JSON Lines file into a list, one object per non-blank line.

    Args:
        path: Location of a text file holding one JSON document per line.

    Returns:
        A list with the parsed objects, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank or trailing empty line is not a record; json.loads would
            # raise on it, so skip it instead of aborting the whole load.
            if line.strip():
                records.append(json.loads(line))
    return records


# Embedding records and their targets are stored as two parallel JSON Lines files.
data_path = '/content/drive/MyDrive/mimic-iii-clinical-database-1.4/embeddings/data.json'
data = load_json_lines(data_path)

label_path = '/content/drive/MyDrive/mimic-iii-clinical-database-1.4/embeddings/target_jm.json'
labels = load_json_lines(label_path)

In [5]:
# Walk the records in file order: every entry of `data` is a dict whose first key
# is used both to fetch its payload and to look up the target stored at the same
# position in `labels`.
label = []
corr_data = []
for idx, record in enumerate(data):
    record_key = list(record)[0]
    label.append(labels[idx][record_key])
    corr_data.append(record[record_key])

In [6]:
# Stack the per-record payloads into one NumPy array, then drop the first two
# slots along axis 1 (the recorded shape below shows 10 slots of 768 values left).
# NOTE(review): presumably the two dropped slots are identifier columns — confirm.
corr_data = np.array(corr_data)
new_data = corr_data[:,2:,:]
new_data.shape

(10004, 10, 768)

In [7]:
# Rebind `corr_data` to the trimmed array. Caution: because the name is reused,
# re-running the slicing cell after this one slices `[:, 2:, :]` a second time.
corr_data = new_data

In [8]:
# Read the three leading dimensions of the embedding array in one go and report
# them one per line: sample count, slots per sample, values per slot.
num_patients, num_features, embedding_size = corr_data.shape[:3]
print(num_patients, num_features, embedding_size, sep="\n")

10004
10
768


In [9]:
# Flatten every sample's (num_features, embedding_size) block into a single row.
# With NumPy's default row-major reshape the columns are laid out feature by
# feature: the first embedding_size columns belong to slot 0, the next to slot 1, ...
tabular_data = np.array(corr_data).reshape(num_patients, -1)
tabular_data.shape

(10004, 7680)

In [10]:
# Class balance of the target: non-zero labels are reported as "Died", the rest
# as "Survived", followed by the ratio between the two counts.
label = np.array(label)
label_counts = np.unique(label, return_counts=True)
num_ones = np.count_nonzero(label)
num_zeroes = len(label) - num_ones
for outcome_name, outcome_count in (("Survived:", num_zeroes), ("Died:", num_ones)):
    print(outcome_name, outcome_count)
print(f'Ratio(Died/Survived) = {num_ones/num_zeroes}')

Survived: 8727
Died: 1277
Ratio(Died/Survived) = 0.14632748940071044


In [11]:
!pip install pytorch-tabnet -q

In [12]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
class LDAMLoss(nn.Module):
    """Label-distribution-aware margin (LDAM) cross-entropy loss.

    Each class gets a margin proportional to ``n_c ** -0.25`` (``n_c`` being its
    sample count), rescaled so the largest margin equals ``max_m``. The margin is
    subtracted from the logit of the true class only, the logits are multiplied
    by ``s`` and ordinary (optionally weighted) cross-entropy is applied.

    The loss works on whatever device the logits live on (CPU or GPU).

    Args:
        cls_num_list: Per-class sample counts, indexed by class id.
        max_m: Margin given to the rarest class.
        weight: Optional per-class weight tensor forwarded to ``F.cross_entropy``;
            it must live on the same device as the logits.
        s: Positive scale applied to the margin-adjusted logits.
    """

    def __init__(self, cls_num_list, max_m=0.5, weight=None, s=30):
        super(LDAMLoss, self).__init__()
        assert s > 0
        counts = np.asarray(cls_num_list, dtype=np.float64)
        m_list = 1.0 / np.sqrt(np.sqrt(counts))
        m_list = m_list * (max_m / np.max(m_list))
        # A non-persistent buffer follows module.to(device) without adding a key
        # to state_dict (the margins are fully determined by cls_num_list).
        self.register_buffer(
            "m_list", torch.tensor(m_list, dtype=torch.float32), persistent=False
        )
        self.s = s
        self.weight = weight

    def forward(self, x, target):
        """Return the LDAM loss for logits ``x`` (batch, classes) and int64 ``target`` (batch,)."""
        # Boolean one-hot mask of the true class; torch.where needs a bool condition.
        class_ids = torch.arange(x.size(1), device=x.device)
        index = class_ids[None, :] == target.reshape(-1, 1)

        # Margin of each sample's true class, as a column vector on x's device.
        batch_m = self.m_list.to(x.device)[target].view(-1, 1)
        x_m = x - batch_m

        # Only the true-class logit is reduced by its margin.
        output = torch.where(index, x_m, x)
        return F.cross_entropy(self.s*output, target, weight=self.weight)


In [13]:
# Preview of the class-weight vector used for the weighted cross-entropy:
# index 0 holds num_ones/1e4 and index 1 holds num_zeroes/1e4, so each class is
# weighted by the other class's count. NOTE(review): 1e4 looks like a rounded
# stand-in for the total sample count — confirm.
torch.tensor((num_ones, num_zeroes))/1e4

tensor([0.1277, 0.8727])

In [14]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import train_test_split

max_epochs = 1000

# Hold out 15% of all samples as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    tabular_data, label, test_size=0.15, random_state=42
)

# Take 18% of the remaining 85% as validation (about 15% of all samples),
# which leaves roughly 70% of the data for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.18, random_state=42
)

# Per-class counts of the training split. They are not consumed by the loss
# below; presumably kept around for LDAMLoss experiments.
y_train_int = y_train.astype(int)
cls_num_list = np.bincount(y_train_int)

# Weighted cross-entropy: class 0 is weighted by num_ones/1e4 and class 1 by
# num_zeroes/1e4, i.e. each class by the other class's count.
class_weights = torch.tensor((num_ones, num_zeroes)).to('cuda') / 1e4
loss_fn = nn.CrossEntropyLoss(weight=class_weights)

clf = TabNetClassifier(
    n_d=50,
    n_a=50,
    n_steps=10,
    n_independent=5,
    n_shared=5,
    device_name='cuda',
)
save_history = []

# Train while tracking both metrics on the train and validation splits.
# patience=0: in pytorch-tabnet this turns early stopping off, so every one of
# the max_epochs epochs is run.
clf.fit(
    X_train=X_train,
    y_train=y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    eval_name=['train', 'valid'],
    max_epochs=max_epochs,
    eval_metric=['balanced_accuracy', 'auc'],
    loss_fn=loss_fn,
    patience=0,
)
save_history.append(clf.history["valid_balanced_accuracy"])



epoch 0  | loss: 1.72685 | train_balanced_accuracy: 0.50181 | train_auc: 0.50278 | valid_balanced_accuracy: 0.49585 | valid_auc: 0.50047 |  0:00:08s
epoch 1  | loss: 1.69148 | train_balanced_accuracy: 0.5     | train_auc: 0.57713 | valid_balanced_accuracy: 0.5     | valid_auc: 0.54518 |  0:00:15s
epoch 2  | loss: 1.54697 | train_balanced_accuracy: 0.49975 | train_auc: 0.51319 | valid_balanced_accuracy: 0.49925 | valid_auc: 0.50324 |  0:00:22s
epoch 3  | loss: 1.43353 | train_balanced_accuracy: 0.49873 | train_auc: 0.44768 | valid_balanced_accuracy: 0.49641 | valid_auc: 0.44116 |  0:00:29s
epoch 4  | loss: 1.31639 | train_balanced_accuracy: 0.51107 | train_auc: 0.53752 | valid_balanced_accuracy: 0.49294 | valid_auc: 0.53332 |  0:00:36s
epoch 5  | loss: 1.17386 | train_balanced_accuracy: 0.50028 | train_auc: 0.4536  | valid_balanced_accuracy: 0.49791 | valid_auc: 0.48292 |  0:00:43s
epoch 6  | loss: 0.98071 | train_balanced_accuracy: 0.52896 | train_auc: 0.53003 | valid_balanced_accuracy

KeyboardInterrupt: 

In [None]:
import gc
import torch

# Assuming `obj` is the object consuming GPU memory
# NOTE(review): `obj` is a placeholder name that nothing else in the notebook
# binds; rebinding it to None frees memory only if it is replaced by the name of
# the object that actually holds the GPU tensors.
obj = None

# Collect garbage
gc.collect()

# Empty PyTorch cache
# (must come after the collection so blocks released by it can be returned)
torch.cuda.empty_cache()

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import train_test_split

max_epochs = 400

# Hold out 15% of all samples as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    tabular_data, label, test_size=0.15, random_state=42
)

# Take 18% of the remaining 85% as validation (about 15% of all samples),
# which leaves roughly 70% of the data for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.18, random_state=42
)

# Per-class counts of the training split. They are not consumed by the loss
# below; presumably kept around for LDAMLoss experiments.
y_train_int = y_train.astype(int)
cls_num_list = np.bincount(y_train_int)

# Weighted cross-entropy: class 0 is weighted by num_ones/1e4 and class 1 by
# num_zeroes/1e4, i.e. each class by the other class's count.
class_weights = torch.tensor((num_ones, num_zeroes)).to('cuda') / 1e4
loss_fn = nn.CrossEntropyLoss(weight=class_weights)

clf = TabNetClassifier(
    n_d=50,
    n_a=50,
    n_steps=10,
    n_independent=5,
    n_shared=5,
    device_name='cuda',
)
save_history = []

# Train while tracking both metrics on the train and validation splits.
# patience=0: in pytorch-tabnet this turns early stopping off, so every one of
# the max_epochs epochs is run.
clf.fit(
    X_train=X_train,
    y_train=y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    eval_name=['train', 'valid'],
    max_epochs=max_epochs,
    eval_metric=['balanced_accuracy', 'auc'],
    loss_fn=loss_fn,
    patience=0,
)
save_history.append(clf.history["valid_balanced_accuracy"])



epoch 0  | loss: 1.72685 | train_balanced_accuracy: 0.50181 | train_auc: 0.50278 | valid_balanced_accuracy: 0.49585 | valid_auc: 0.50047 |  0:00:06s
epoch 1  | loss: 1.69148 | train_balanced_accuracy: 0.5     | train_auc: 0.57713 | valid_balanced_accuracy: 0.5     | valid_auc: 0.54518 |  0:00:13s
epoch 2  | loss: 1.54697 | train_balanced_accuracy: 0.49975 | train_auc: 0.51319 | valid_balanced_accuracy: 0.49925 | valid_auc: 0.50324 |  0:00:20s
epoch 3  | loss: 1.43353 | train_balanced_accuracy: 0.49873 | train_auc: 0.44768 | valid_balanced_accuracy: 0.49641 | valid_auc: 0.44116 |  0:00:27s
epoch 4  | loss: 1.31639 | train_balanced_accuracy: 0.51107 | train_auc: 0.53752 | valid_balanced_accuracy: 0.49294 | valid_auc: 0.53332 |  0:00:34s
epoch 5  | loss: 1.17386 | train_balanced_accuracy: 0.50028 | train_auc: 0.4536  | valid_balanced_accuracy: 0.49791 | valid_auc: 0.48292 |  0:00:41s
epoch 6  | loss: 0.98071 | train_balanced_accuracy: 0.52896 | train_auc: 0.53003 | valid_balanced_accuracy

In [None]:
import plotly.express as px


# Per-sample explanation of the validation split: one importance value per input
# column, plus the attention masks returned alongside it.
explainability_matrix , masks = clf.explain(X_val)

# Collapse the embedding_size columns of every feature slot into one value, giving
# a (samples, num_features) matrix.
exp_mat = np.sum(explainability_matrix.reshape((X_val.shape[0], num_features, embedding_size)), axis=2)

# Normalize the importance by sample (the epsilon guards rows that sum to zero)
normalized_explain_mat = np.divide(exp_mat, exp_mat.sum(axis=1).reshape(-1, 1)+1e-8)

# Add prediction to better understand correlation between features and predictions
val_preds = clf.predict(X_val)

# explain_and_preds = np.hstack([normalized_explain_mat, val_preds.reshape(-1, 1)])

# NOTE(review): this list has 11 names while the matrix above has `num_features`
# columns (10 in the recorded run). px.imshow requires len(x) to equal the number
# of columns, so confirm which names really correspond to the embedded slots.
features = ['GENDER', 'AGE', 'RELIGION', 'ETHNICITY',
       'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'DIAGNOSIS', 'PROCEDURE',
       'CATEGORY', 'DESCRIPTION', 'TEXT']

# The call was previously left unclosed, which made the whole cell a syntax error.
px.imshow(normalized_explain_mat,
          labels=dict(x="Features", y="Samples", color="Importance"),
#           x=features+["prediction"],
          x=features,
          title="Sample wise feature importance",
          color_continuous_scale='Jet',
          height=1000)

In [None]:
# Hard class predictions for the held-out test split; later cells build the
# confusion matrix and test metrics from `p`.
p = clf.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Confusion matrix of the test predictions, drawn on an explicit 8x6 figure and
# written to disk instead of being shown.
cm = confusion_matrix(y_test, p)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Class 0', 'Class 1'])

fig, ax = plt.subplots(figsize=(8, 6))
disp.plot(ax=ax, cmap='Blues', xticks_rotation='horizontal')
# Override the axis labels set by the display and add a title.
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
fig.savefig('cm_test.png')

In [None]:
# Per-class accuracy: correctly predicted samples of each class (diagonal of the
# confusion matrix) divided by that class's row total.
print("Class-wise accuracy on test:\nSurvival\t\tMorbidity")
per_class_acc = np.array((cm[0][0], cm[1][1])) / np.sum(cm, axis=1)
print('\t'.join(str(acc) for acc in per_class_acc))

In [None]:
# Confusion matrix of fresh predictions on the validation split, drawn on an
# explicit 8x6 figure and written to disk instead of being shown.
val_predictions = clf.predict(X_val)
cm = confusion_matrix(y_val, val_predictions)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Class 0', 'Class 1'])

fig, ax = plt.subplots(figsize=(8, 6))
disp.plot(ax=ax, cmap='Blues', xticks_rotation='horizontal')
# Override the axis labels set by the display and add a title.
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
fig.savefig('cm_val.png')

In [None]:
# Per-class accuracy: correctly predicted samples of each class (diagonal of the
# confusion matrix) divided by that class's row total.
print("Class-wise accuracy on valid:\nSurvival\t\tMorbidity")
per_class_acc = np.array((cm[0][0], cm[1][1])) / np.sum(cm, axis=1)
print('\t'.join(str(acc) for acc in per_class_acc))

In [None]:
# Balanced accuracy per epoch for both tracked splits, saved to bacc.png.
for history_key, curve_label in (
    ('train_balanced_accuracy', "Training"),
    ('valid_balanced_accuracy', "Validation"),
):
    plt.plot(clf.history[history_key], label=curve_label)
plt.grid()
plt.xlabel("Epochs")
plt.ylabel("Balanced Acc")
plt.legend()
plt.title("Balanced accuracy vs epochs")
plt.savefig('bacc.png')

In [None]:
# AUROC per epoch for both tracked splits, saved to auroc.png.
for history_key, curve_label in (
    ('train_auc', "Training"),
    ('valid_auc', "Validation"),
):
    plt.plot(clf.history[history_key], label=curve_label)
plt.grid()
plt.xlabel("Epochs")
plt.ylabel("AUROC")
plt.legend()
plt.title("AUROC vs epochs")
plt.savefig('auroc.png')

In [None]:
from sklearn.metrics import balanced_accuracy_score, roc_auc_score

# Balanced accuracy is a threshold metric, so it is computed from the hard
# predictions `p`.
balanced_acc = balanced_accuracy_score(y_test, p)

# AUROC is a ranking metric and needs continuous scores. Feeding it hard 0/1
# predictions collapses the ROC curve to a single operating point (for a binary
# task the value then just equals the balanced accuracy). Use the predicted
# probability of the positive class (second column) instead.
test_scores = clf.predict_proba(X_test)[:, 1]
auc_roc = roc_auc_score(y_test, test_scores)

print(f"BACC: {balanced_acc}\nAUROC: {auc_roc}")

In [None]:
# Best validation value of each metric over all epochs. The two maxima are taken
# independently, so they may come from different epochs.
print(f"BACC: {max(clf.history['valid_balanced_accuracy'])}\nAUROC: {max(clf.history['valid_auc'])}")

In [None]:
# Training loss per epoch, saved to loss.png.
loss_ax = plt.gca()
loss_ax.plot(clf.history['loss'])
loss_ax.grid()
loss_ax.set_xlabel("Epochs")
loss_ax.set_ylabel("Weighted Cross Entropy")
loss_ax.set_title("Loss vs epochs")
plt.savefig('loss.png')

In [None]:
# Sum the per-column importances of each feature slot (embedding_size columns
# per slot) into one value per slot.
feat_imp = np.sum(clf.feature_importances_.reshape(num_features, embedding_size), axis=1)
# NOTE(review): 11 names are listed here while `feat_imp` has `num_features`
# entries (10 in the recorded run); confirm which names match the embedded slots.
columns = ['GENDER', 'AGE', 'RELIGION', 'ETHNICITY',
       'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'DIAGNOSIS', 'PROCEDURE',
       'CATEGORY', 'DESCRIPTION', 'TEXT']

In [None]:
# Rank the feature slots from most to least important.
sorted_indices = np.argsort(feat_imp)[::-1]
sorted_feat_imp = feat_imp[sorted_indices]
sorted_feature_names = [columns[idx] for idx in sorted_indices]

# Bar chart of the ranked importances, saved to feat_imp.png.
bar_positions = range(len(sorted_feat_imp))
plt.figure(figsize=(10, 6))
plt.bar(bar_positions, sorted_feat_imp, align='center', color='skyblue')
plt.xticks(bar_positions, sorted_feature_names, rotation=45, ha='right', fontsize=12)
plt.xlabel('Feature', fontsize=14)
plt.ylabel('Importance', fontsize=14)
plt.title('Feature Importance', fontsize=16)
plt.grid()
plt.savefig('feat_imp.png')

In [None]:
import numpy as np
import torch
from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.model_selection import train_test_split

# Regression run: fit a TabNetRegressor on the flattened embeddings with `los`
# as the target.
# NOTE(review): `los` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel — presumably a length-of-stay target
# loaded in a cell that was removed; restore that cell before relying on this.
# This cell also rebinds X_train/X_val/X_test, y_*, save_history and `clf`, so
# the classifier trained above is no longer reachable afterwards.



los = np.array(los)
# Number of training epochs for the regressor
max_epochs = 500

# Hold out 15% of all samples as the test set (85% remain for train + validation)
X_train_val, X_test, y_train_val, y_test = train_test_split(tabular_data, los, test_size=0.15, random_state=42)

# Take 18% of the remaining 85% as validation (about 15% of all samples),
# leaving roughly 70% for training
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.18, random_state=42)
# Targets are reshaped to column vectors (n, 1) before fitting
y_train = y_train.reshape(-1, 1)
y_val = y_val.reshape(-1, 1)


save_history = []

# Initialize TabNetRegressor
clf = TabNetRegressor(n_d=30, n_a=30, n_steps=7, n_independent=3,device_name = 'cuda')


# Fit the model, tracking RMSE and MAE on the train and validation splits
clf.fit(
    X_train=X_train,
    y_train=y_train,
    eval_set=[(X_train,y_train),(X_val,y_val)],
    eval_name=['train','valid'],
    max_epochs=max_epochs,
    eval_metric=['rmse','mae'],
    patience = 0
)
save_history.append(clf.history["valid_rmse"])

In [None]:
# Plot the training loss recorded in clf.history for every epoch
plt.grid(True)
plt.plot(clf.history['loss'])

In [None]:
# Plot RMSE per epoch: training curve first, validation curve second
plt.grid(True)
plt.plot(clf.history['train_rmse'])
plt.plot(clf.history['valid_rmse'])

In [None]:
# Plot MAE per epoch: training curve first, validation curve second
plt.grid(True)
plt.plot(clf.history['train_mae'])
plt.plot(clf.history['valid_mae'])

In [None]:
# No explicit clf.load_best_model() call: it is deprecated, the best model is
# loaded automatically at the end of fit.

from sklearn.metrics import mean_squared_error,mean_absolute_error

# Predict the held-out test split and keep predictions and targets as arrays.
preds = clf.predict(X_test)
y_preds = np.array(preds)
y_true = np.array(y_test)

# Root mean squared error and mean absolute error on the test split.
rmse = np.sqrt(mean_squared_error(y_true, y_preds))
mae = mean_absolute_error(y_true, y_preds)

for metric_name, metric_value in (("RMSE:", rmse), ("MAE:", mae)):
    print(metric_name, metric_value)


In [None]:
# Scatter the targets against themselves (blue, forms the diagonal) and the
# predictions against the targets (green) on one 8x6 figure.
plt.figure(figsize=(8, 6))

for series, colour, legend_label in (
    (y_true, 'blue', 'True Values'),
    (y_preds, 'green', 'Predicted Values'),
):
    plt.scatter(y_true, series, color=colour, label=legend_label, alpha=0.5)

# Optional y = x reference line, left disabled:
#plt.plot([min(y_true), max(y_true)], [min(y_true), max(y_true)], color='red', label='Perfect Prediction Line')

# Axis labels, title, grid and legend.
plt.xlabel('True Values')
plt.ylabel('Values')
plt.title('True vs. Predicted Values')
plt.grid(True)
plt.legend()

plt.show()

In [None]:
# Per-input-column importances of the fitted model (one value per flattened column)
clf_feat = clf.feature_importances_

In [None]:
# The flattened inputs are laid out feature by feature (embedding_size consecutive
# columns per slot), so the importances must be regrouped as
# (num_features, embedding_size). Reshaping straight to (embedding_size, -1)
# interleaves values from different slots. Transposing keeps the original
# (embedding_size, num_features) shape, so column j holds exactly the
# importances of slot j and a sum over axis 0 gives one total per slot.
x = clf_feat.reshape((num_features, embedding_size)).T

In [None]:
# Column names used to label the feature importances (14 entries, beginning with
# the two identifier columns).
# NOTE(review): presumably the column order of the source table — confirm it
# matches the order in which the embeddings were stored.
columns = ['HADM_ID', 'SUBJECT_ID', 'GENDER', 'AGE', 'RELIGION', 'ETHNICITY',
       'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'DIAGNOSIS', 'PROCEDURE',
       'CATEGORY', 'DESCRIPTION', 'TEXT', 'DISCHARGE_LOCATION']

In [None]:
# One importance total per feature slot (sum over the rows of `x`).
feat_imp = np.sum(x, axis=0)
# `columns` is a plain Python list, so it has no `.values` attribute (that only
# exists on pandas objects); list() accepts both a list and a pandas Index.
# The first two feature slots were removed when the embeddings were trimmed with
# corr_data[:, 2:, :], so the names are offset by two and cut to the number of
# importances, matching the GENDER-first labelling used for the classifier.
# NOTE(review): assumes the embedded slots follow the order of `columns` — confirm.
feature_names = list(columns)[2:2 + len(feat_imp)]

In [None]:
import numpy as np
import matplotlib.pyplot as plt



# Rank the features from most to least important.
sorted_indices = np.argsort(feat_imp)[::-1]
sorted_feat_imp = feat_imp[sorted_indices]
sorted_feature_names = [feature_names[idx] for idx in sorted_indices]

# Bar chart of the ranked importances with rotated, right-aligned tick labels.
bar_positions = range(len(sorted_feat_imp))
plt.figure(figsize=(10, 6))
plt.bar(bar_positions, sorted_feat_imp, align='center', color='skyblue')
plt.xticks(bar_positions, sorted_feature_names, rotation=45, ha='right', fontsize=12)
plt.xlabel('Feature', fontsize=14)
plt.ylabel('Importance', fontsize=14)
plt.title('Feature Importance', fontsize=16)
plt.tight_layout()

# Render the figure.
plt.show()


In [None]:
from pytorch_tabnet.tab_model import TabNetRegressor
from matplotlib import pyplot as plt
import numpy as np

# Requires the fitted `clf` and the test split `X_test` from the cells above.

# Explanation matrix and the masks returned alongside it for the test split.
explain_matrix, masks = clf.explain(X_test)

# One wide figure holding three side-by-side panels.
plt.figure(figsize=(20, 6))

# Draw masks 0, 1 and 2, one panel each.
for mask_idx in range(3):
    plt.subplot(1, 3, mask_idx + 1)
    plt.imshow(masks[mask_idx][:4], aspect='auto')  # only the first 4 rows (samples)
    plt.title(f"Mask {mask_idx}")
    plt.colorbar()  # colour scale for this panel

plt.tight_layout()  # keep the panels and colour bars from overlapping
plt.show()
