This kernel differs from "Reproduce Preprocessing and Modelling": it takes the same input as the original Theano model, but feeds it to a new Keras architecture.

In [1]:
import os
import random
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, GRU, Embedding, Dropout, Masking, GlobalMaxPooling1D, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import Callback
import tensorflow.keras.backend as K
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
# Select the GPU for this kernel through CUDA environment variables.
# They are set before the availability query below is executed.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "9"

# NOTE(review): tf.test.is_gpu_available() is marked deprecated in newer
# TensorFlow releases in favour of tf.config.list_physical_devices('GPU');
# confirm against the installed version.
tf.test.is_gpu_available()

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=9


True

In [3]:
def load_file(file):
    """Return the object unpickled from the file at path ``file``.

    The file is opened in binary mode and unpickled with
    ``encoding='latin1'``. Unpickling can execute arbitrary code, so only
    use this on trusted files.
    """
    with open(file, mode='rb') as handle:
        return pickle.load(handle, encoding='latin1')

In [4]:
# Load all the files
# "full_icd9.*" and "3digit_icd9.*" are two parallel sets of pickles
# (presumably full vs. 3-digit ICD-9 codes; verify against the preprocessing).
full_icd9_seqs = load_file('../doctorai/full_icd9.seqs')
full_icd9_dates = load_file('../doctorai/full_icd9.dates')
full_icd9_pids = load_file('../doctorai/full_icd9.pids')
full_icd9_types = load_file('../doctorai/full_icd9.types')

small_icd9_seqs = load_file('../doctorai/3digit_icd9.seqs')
small_icd9_dates = load_file('../doctorai/3digit_icd9.dates')
small_icd9_pids = load_file('../doctorai/3digit_icd9.pids')
small_icd9_types = load_file('../doctorai/3digit_icd9.types')


# Generating train-val-test indices
train_ratio = 0.8
val_ratio = 0.1

# Fixed seed so the shuffled patient order (and hence the split) is reproducible
random.seed(2019)
indices = list(range(len(full_icd9_seqs)))
random.shuffle(indices)

# Split sizes; whatever is left after train + val becomes the test set
train_split = int(train_ratio * len(indices))
val_split = int(val_ratio * len(indices))

train_idx = indices[:train_split]
val_idx = indices[train_split:][:val_split]
test_idx = indices[train_split:][val_split:]

## Creating lag

In [5]:
# Inputs: every visit of a patient except the last one (taken from the full codes)
visits = [patient_visits[:-1] for patient_visits in full_icd9_seqs]
# Targets: every visit except the first, so target t is the visit following input t
labels = [patient_visits[1:] for patient_visits in small_icd9_seqs]

## Padding

In [6]:
# Longest visit history over all patients
n_visits_max = max(len(patient_visits) for patient_visits in visits)
# Largest number of codes recorded in any single visit
n_diagnosis_max = max(
    max(len(codes) for codes in patient_visits) for patient_visits in visits
)

print("Max number of visits per patient:", n_visits_max)
print("Max number of diagnosis in a visit:", n_diagnosis_max)

Max number of visits per patient: 41
Max number of diagnosis in a visit: 39


In [7]:
# Two-level padding: the inner call pads every visit of a patient to
# n_diagnosis_max codes, the outer call pads every patient to the same
# number of visits, giving one (patients, visits, codes) array.
visits_padded = pad_sequences(
    [
        pad_sequences(patient_visits, maxlen=n_diagnosis_max)
        for patient_visits in visits
    ]
)

visits_padded.shape

(7537, 41, 39)

## Label encoding and padding

In [8]:
# Flatten the per-patient label lists into a single list of visits so the
# binarizer is fitted on every label visit of every patient.
labels_flat = []
for label in labels:
    labels_flat.extend(label)

# The fitted encoder is reused below to turn each visit into a multi-hot row.
label_enc = MultiLabelBinarizer()
label_enc.fit(labels_flat)

MultiLabelBinarizer(classes=None, sparse_output=False)

In [9]:
# Multi-hot encode each patient's label visits, then pad all patients to the
# same number of visits.
labels_padded = pad_sequences(
    [label_enc.transform(patient_labels) for patient_labels in labels]
)
labels_padded.shape

(7537, 41, 846)

## Train test split

In [10]:
# Slice inputs and labels with the same shuffled patient indices so that
# X_* and y_* stay aligned row by row.
X_train, X_val, X_test = (
    visits_padded[idx] for idx in (train_idx, val_idx, test_idx)
)
y_train, y_val, y_test = (
    labels_padded[idx] for idx in (train_idx, val_idx, test_idx)
)

print("Number of training patients:", len(X_train), len(y_train))
print("Number of validation patients:", len(X_val), len(y_val))
print("Number of test patients:", len(X_test), len(y_test))

Number of training patients: 6029 6029
Number of validation patients: 753 753
Number of test patients: 755 755


# Modelling

## Evaluation metric

In [22]:
def top_k_recall(y_true, y_pred, use_tqdm=True, k=30):
    """Compute top-k recall for every admission that has at least one label.

    Parameters
    ----------
    y_true : np.ndarray
        Multi-hot labels whose last axis is the class axis. Leading axes
        (e.g. patient and visit) are flattened into a list of admissions.
    y_pred : np.ndarray
        Predicted scores, reshaped with the same class width as ``y_true``.
    use_tqdm : bool, default True
        Show a tqdm progress bar while iterating over admissions.
    k : int, default 30
        Number of highest-scored classes counted as predictions.

    Returns
    -------
    np.ndarray
        1-D array with one recall value per admission that has at least one
        true label. Admissions without any label are skipped.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        # argsort()[-0:] would silently select *all* classes
        raise ValueError("k must be a positive integer")

    # Take the class width from the data instead of hard-coding it
    n_classes = y_true.shape[-1]
    pred_flat = y_pred.reshape(-1, n_classes)
    true_flat = y_true.reshape(-1, n_classes)

    admissions = range(true_flat.shape[0])
    if use_tqdm:
        admissions = tqdm(admissions)

    all_patients_recall = []
    for adm_idx in admissions:
        true_indices = np.flatnonzero(true_flat[adm_idx] == 1)

        # If this admission does not have any diagnosis, then it
        # is a dummy admission created by padding from keras
        if true_indices.size == 0:
            continue

        # Indices of the k largest scores (order within the top-k is irrelevant)
        pred_indices = pred_flat[adm_idx].argsort()[-k:]
        hits = np.intersect1d(pred_indices, true_indices).size
        all_patients_recall.append(hits / true_indices.size)

    return np.array(all_patients_recall)

## Custom callback to monitor top-k recall

In [12]:
class TopKRecallCallback(Callback):
    """Keras callback that reports mean top-10/20/30 recall after each epoch.

    Parameters
    ----------
    X_test : array-like
        Inputs passed to ``model.predict`` at the end of every epoch.
    y_test : array-like
        Labels compared against the predictions with ``top_k_recall``.
    batch_size : int, default 512
        Batch size used for the per-epoch prediction.

    Attributes
    ----------
    test_recalls_at_10, test_recalls_at_20, test_recalls_at_30 : list of float
        One mean recall per finished epoch; reset when training begins.
    """

    def __init__(self, X_test, y_test, batch_size=512):
        super().__init__()
        self.X_test = X_test
        self.y_test = y_test
        self.batch_size = batch_size

    def on_train_begin(self, logs=None):
        # Start every fit() call with empty histories
        self.test_recalls_at_10 = []
        self.test_recalls_at_20 = []
        self.test_recalls_at_30 = []

    def on_epoch_end(self, epoch, logs=None):
        # Predict once per epoch and reuse the scores for all three cut-offs
        y_pred = self.model.predict(self.X_test, batch_size=self.batch_size)
        y_test = self.y_test

        _test_recall_at_10 = top_k_recall(y_test, y_pred, k=10, use_tqdm=False).mean()
        _test_recall_at_20 = top_k_recall(y_test, y_pred, k=20, use_tqdm=False).mean()
        _test_recall_at_30 = top_k_recall(y_test, y_pred, k=30, use_tqdm=False).mean()

        self.test_recalls_at_10.append(_test_recall_at_10)
        self.test_recalls_at_20.append(_test_recall_at_20)
        self.test_recalls_at_30.append(_test_recall_at_30)

        print(f"\ntest_top_k_recall: {_test_recall_at_10:.4f}@10; {_test_recall_at_20:.4f}@20; {_test_recall_at_30:.4f}@30")

## Building Model

In [13]:
def build_model(num_words, embedding_dim=200, num_classes=846,
                gru_units=200, dense_units=300, dropout_rate=0.5,
                output_activation='softmax'):
    """Build and compile the visit-level GRU model.

    Each visit is represented by the sum of the embeddings of its codes; the
    sequence of visit vectors goes through two GRU layers and a dense head
    that emits ``num_classes`` scores for every visit.

    Parameters
    ----------
    num_words : int
        Size of the embedding vocabulary (largest code index + 1).
    embedding_dim : int, default 200
        Width of each code embedding.
    num_classes : int, default 846
        Number of output classes per visit.
    gru_units : int, default 200
        Units in each of the two GRU layers.
    dense_units : int, default 300
        Units in the hidden dense layer.
    dropout_rate : float, default 0.5
        Dropout rate applied after the hidden dense layer.
    output_activation : str, default 'softmax'
        Activation of the output layer.

    Returns
    -------
    Model
        Keras model compiled with the Adam optimizer and binary cross-entropy.
    """
    # Input is (batch, visits, codes per visit); both inner sizes are variable
    input1 = Input(shape=(None, None))
    x = Embedding(num_words, embedding_dim, mask_zero=False)(input1)
    # Collapse the code axis: one summed embedding vector per visit
    x = K.sum(x, axis=2)
    # NOTE(review): with mask_zero=False the padding index 0 also contributes
    # its embedding row to the sum, so an all-padding visit is only exactly 0
    # if that row is 0. Masking(0) may therefore not mask padded visits —
    # confirm whether padded steps are meant to be excluded from GRU and loss.
    x = Masking(0)(x)

    x = GRU(gru_units, return_sequences=True)(x)
    x = GRU(gru_units, return_sequences=True)(x)

    x = Dense(dense_units, activation='relu')(x)
    x = Dropout(dropout_rate)(x)
    # NOTE(review): the default 'softmax' output is trained with
    # binary_crossentropy on multi-hot targets. Pass
    # output_activation='sigmoid' for independent per-class probabilities if
    # that pairing is not intentional.
    x = Dense(num_classes, activation=output_activation)(x)

    model = Model(inputs=input1, outputs=x)
    model.compile('adam', loss='binary_crossentropy')

    return model

In [23]:
# Embedding vocabulary size: largest code index found in any split, plus one
num_words = max(split.max() for split in (X_train, X_val, X_test)) + 1
num_words

4892

In [24]:
# Build the model with the default hyper-parameters and print its layer summary
model = build_model(num_words)
model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, None, None)]      0         
_________________________________________________________________
embedding_2 (Embedding)      (None, None, None, 200)   978400    
_________________________________________________________________
tf_op_layer_Sum_2 (TensorFlo [(None, None, 200)]       0         
_________________________________________________________________
masking_2 (Masking)          (None, None, 200)         0         
_________________________________________________________________
gru_4 (GRU)                  (None, None, 200)         241200    
_________________________________________________________________
gru_5 (GRU)                  (None, None, 200)         241200    
_________________________________________________________________
dense_4 (Dense)              (None, None, 300)         6030

In [26]:
# NOTE(review): top-k recall is monitored on the *test* split during training,
# while the X_val / y_val split created earlier is not used in this cell.
# Consider monitoring on the validation split to keep the test set untouched.
callbacks = [
    TopKRecallCallback(X_test=X_test, y_test=y_test)
]

# verbose=2 prints one line per epoch; the callback prints the recall line
model.fit(
    X_train, 
    y_train, 
    batch_size=64, 
    epochs=10, 
    verbose=2,
    callbacks=callbacks
)

Train on 6029 samples
Epoch 1/10
 - test_top_k_recall: 0.2773@10; 0.4125@20; 0.4966@30
6029/6029 - 7s - loss: 0.0043
Epoch 2/10
 - test_top_k_recall: 0.2873@10; 0.4203@20; 0.5111@30
6029/6029 - 7s - loss: 0.0043
Epoch 3/10
 - test_top_k_recall: 0.2958@10; 0.4313@20; 0.5261@30
6029/6029 - 7s - loss: 0.0042
Epoch 4/10
 - test_top_k_recall: 0.3015@10; 0.4419@20; 0.5320@30
6029/6029 - 7s - loss: 0.0042
Epoch 5/10
 - test_top_k_recall: 0.3134@10; 0.4489@20; 0.5375@30
6029/6029 - 7s - loss: 0.0042
Epoch 6/10
 - test_top_k_recall: 0.3090@10; 0.4466@20; 0.5400@30
6029/6029 - 7s - loss: 0.0042
Epoch 7/10
 - test_top_k_recall: 0.3233@10; 0.4629@20; 0.5569@30
6029/6029 - 7s - loss: 0.0042
Epoch 8/10
 - test_top_k_recall: 0.3270@10; 0.4728@20; 0.5640@30
6029/6029 - 7s - loss: 0.0042
Epoch 9/10
 - test_top_k_recall: 0.3296@10; 0.4757@20; 0.5667@30
6029/6029 - 7s - loss: 0.0041
Epoch 10/10
 - test_top_k_recall: 0.3393@10; 0.4841@20; 0.5738@30
6029/6029 - 7s - loss: 0.0041


<tensorflow.python.keras.callbacks.History at 0x7fcd6a5ec4d0>

# Evaluation

## Eval on train

In [29]:
%%time
# Predict on the training inputs, pinned to the first visible GPU
with tf.device('/gpu:0'):
    y_train_pred = model.predict(X_train, batch_size=512)

CPU times: user 19.1 s, sys: 59.4 s, total: 1min 18s
Wall time: 2.43 s


In [30]:
# Per-admission recall with the default k of top_k_recall (k=30) on the
# training predictions, then averaged over admissions
all_patients_recall = top_k_recall(y_true=y_train, y_pred=y_train_pred)
all_patients_recall.mean()

100%|██████████| 247189/247189 [00:03<00:00, 74653.92it/s]


0.5995579711625877

## Eval on test

In [31]:
%%time
# Predict on the test inputs, pinned to the first visible GPU
with tf.device('/gpu:0'):
    y_pred = model.predict(X_test, batch_size=512)

CPU times: user 1.61 s, sys: 2.68 s, total: 4.29 s
Wall time: 285 ms


In [32]:
# Same metric as for the training split (default k=30), on the test predictions.
# Note that this reuses the name all_patients_recall from the training evaluation.
all_patients_recall = top_k_recall(y_true=y_test, y_pred=y_pred)
all_patients_recall.mean()

100%|██████████| 30955/30955 [00:00<00:00, 64654.71it/s]


0.5737620405124925

## Eval using DoctorAI metric

In [46]:
def recallTop(y_true, y_pred, rank=[10, 20, 30]):
    """Mean recall@rank over admissions, in the style of the DoctorAI metric.

    The previous implementation intersected ``set(multi-hot row)`` (always a
    subset of {0, 1}) with the first ``rk`` raw scores, which does not compare
    class indices at all. Here both sides are converted to class indices
    before they are intersected.

    Parameters
    ----------
    y_true : np.ndarray
        Multi-hot labels whose last axis is the class axis.
    y_pred : np.ndarray
        Predicted scores, reshaped with the same class width as ``y_true``.
    rank : sequence of int, default [10, 20, 30]
        Cut-offs at which recall is computed. The default list is only read,
        never mutated.

    Returns
    -------
    list of float
        One mean recall per entry of ``rank``, averaged over all admissions
        that have at least one true label. All values are NaN when no such
        admission exists.
    """
    # Take the class width from the data instead of hard-coding it
    n_classes = y_true.shape[-1]
    y_pred = y_pred.reshape(-1, n_classes)
    y_true = y_true.reshape(-1, n_classes)

    recall = []
    for truth_row, score_row in zip(y_true, y_pred):
        codes = set(np.flatnonzero(truth_row == 1).tolist())
        if not codes:
            # Padded admission without any label: nothing to recall
            continue

        # Class indices ordered from highest to lowest score
        tops = np.argsort(score_row)[::-1].tolist()
        recall.append(
            [len(codes.intersection(tops[:rk])) * 1.0 / len(codes) for rk in rank]
        )

    if not recall:
        return [float('nan')] * len(rank)
    return np.array(recall).mean(axis=0).tolist()