In [None]:
from __future__ import print_function
import numpy as np
np.random.seed(1337) # for reproducibility

import warnings
with warnings.catch_warnings():
    warnings.filterwarnings("ignore",category=DeprecationWarning)
    warnings.filterwarnings("ignore",category=FutureWarning)
    warnings.filterwarnings("ignore",category=UserWarning)
    import sklearn
    import h5py     
    import tensorflow.keras

import tensorflow as tf
    
import nltk
nltk.download('punkt')
import pickle
import os
import codecs
import theano
import jellyfish
import gc
import itertools
import pandas as pd
import collections as col
from collections import Counter 
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.layers import Masking
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.initializers import RandomNormal
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.layers import GlobalMaxPooling2D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.python.keras.layers import Input
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Convolution1D, MaxPooling1D
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.models import load_model
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Layer, InputSpec
from sklearn import model_selection
from nltk import tokenize
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from collections import Counter
from attention import AttLayer

# Early-stopping callback handed to model.fit: it monitors the validation loss
# with a patience of 2 epochs and is configured to restore the best weights.
earlyStopping = EarlyStopping(patience=2, verbose=1, monitor='val_loss', restore_best_weights=True)

# Set parameters:
max_features = 150000           # Maximum number of tokens in vocabulary
maxlen = 27                     # Maximum number of tokens kept per sentence slot
maxsents = 211                  # Number of sentence slots per summary; also the index of the label field after splitting a line on '<>'
batch_size = 32                 # Batch size given to the model while training
embedding_dims = 175            # Embedding dimensions
nb_epoch = 50                   # Upper bound on training epochs (passed as `epochs` to model.fit)
gru_output_size = 175           # GRU output dimension

In [None]:
# Get unique values from a list
def unique(list1):
    """Return the distinct elements of ``list1`` in first-occurrence order.

    Hashable elements are tracked in a set, so de-duplicating a list of
    strings or numbers is linear instead of quadratic. Unhashable elements
    (e.g. nested lists) are still accepted and compared by equality, exactly
    as a plain ``x not in result`` scan would.

    Args:
        list1: any iterable of values.

    Returns:
        A new list holding each distinct value once, in the order in which it
        first appeared.
    """
    unique_list = []
    seen_hashable = set()
    seen_unhashable = []

    for x in list1:
        try:
            hash(x)
        except TypeError:
            # Unhashable: only an equality scan over everything kept so far works.
            if x in unique_list:
                continue
            seen_unhashable.append(x)
        else:
            # A hashable value may still equal an unhashable one kept earlier
            # (e.g. frozenset vs. set); that list is normally empty.
            if x in seen_hashable or x in seen_unhashable:
                continue
            seen_hashable.add(x)
        unique_list.append(x)

    return unique_list

In [None]:
print('Loading data...')


def _read_lines(path):
    """Return the lines of a UTF-8 text file with the trailing newline removed.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the lines have been read.
    """
    with codecs.open(path, encoding="utf-8") as handle:
        return [line.rstrip('\n') for line in handle]


texts_train = _read_lines('example_train.txt')
texts_val = _read_lines('example_val.txt')

# Training documents come first; every per-document structure below relies on
# that order when it is split back into train/validation parts.
texts = texts_train + texts_val

# List of ICD full-codes for each discharge summary:
# the label field sits at index `maxsents` of the '<>'-separated line.
labels = [ line.split('<>')[maxsents].replace("'","") for line in texts ]
# Drop the two leading and two trailing characters of the field
# (presumably its list delimiters -- confirm against the data files).
labels = [x[2:-2] for x in labels]
labels = [x.split(', ') for x in labels]

# Remove repeated labels
labels = [unique(codes) for codes in labels]

# Use scikit-learn's LabelEncoder to assign an integer to every code that occurs:
le = preprocessing.LabelEncoder()

char = le.fit([item for sublist in labels for item in sublist])

# One integer array per document. A plain list is used because the documents
# have different numbers of codes: np.copy() on such a ragged list raises on
# recent NumPy releases and builds a string array when all lengths coincide.
labels_int = [le.transform(codes) for codes in labels]

# Conversion of the ICD codes into a multi-hot vector
# e.g. diagnosis code D250.00 (in labels) -> 2 (in labels_int) -> [0, 0, 1, 0, ..., 0] (in labels_1hot)

# Every fitted class occurs in at least one document, so this equals
# 1 + (largest encoded label) without failing on a document with no codes.
num_classes = len(le.classes_)

labels_1hot = np.zeros((len(labels_int), num_classes), dtype=np.float64)

for i, codes in enumerate(labels_int):
    # Codes are unique per document, so setting the positions to 1 matches
    # summing one one-hot row per code. The cast keeps empty arrays usable
    # as an index.
    labels_1hot[i, np.asarray(codes, dtype=int)] = 1.0

# NOTE: these are views into labels_1hot, not copies.
y_train = labels_1hot[:len(texts_train)]
y_val = labels_1hot[len(texts_train):]

In [None]:
# Persist the fitted LabelEncoder so predictions can be mapped back to ICD codes.
# `le` is an arbitrary Python object rather than a numeric array, so NumPy stores
# it pickled inside the .npy file; reading it back therefore needs
# np.load('LABEL_ENCODER.npy', allow_pickle=True).item().
np.save('LABEL_ENCODER.npy', le)

In [None]:
print('Multi-label smothing regularization...')

# This strategy assigns the value y=0.05 to codes that belong to the same block
# as one of the codes identified for a document.
# A block corresponds to the 3-digit version (diagnosis code) or 2-digit version (procedure code)


def _icd_block(code):
    """Return the block prefix of an ICD code string.

    * codes starting with 'DE' keep their first 5 characters,
    * other codes starting with 'D' keep their first 4 characters,
    * every other code keeps its first 3 characters.

    Codes shorter than two characters are handled by ``startswith`` instead
    of failing on ``code[1]``.
    """
    if code.startswith('DE'):
        return code[:5]
    if code.startswith('D'):
        return code[:4]
    return code[:3]


# List of ICD blocks for each discharge summary (training documents only):
labels_block = [[_icd_block(code) for code in labels[i]]
                for i in range(len(texts_train))]

# List of the corresponding block of each identified ICD code,
# indexed by the integer the label encoder assigned to that code:
classes_block = [_icd_block(code)
                 for code in le.inverse_transform(np.arange(num_classes))]

# Array form of classes_block so each document is handled with one vectorised
# membership test instead of a Python loop over every class.
_classes_block_arr = np.asarray(classes_block)

for i in range(y_train.shape[0]):
    blocks = labels_block[i]
    if not blocks:
        continue
    same_block = np.isin(_classes_block_arr, blocks)
    # Only entries that are still exactly 0 are raised; true labels keep 1.
    y_train[i, same_block & (y_train[i] == 0)] = 0.05

In [None]:
# Report the dimensions of the training and validation label matrices.
print('y_train shape: {}'.format(y_train.shape))
print('y_val shape: {}'.format(y_val.shape))

In [None]:
# Discharge summaries
# Split every line on '<>' exactly once, then read the sentence slots out of
# the pre-split fields (the original re-split each line once per slot).
_fields = [line.split('<>') for line in texts]

# ds[i][j] is the text of sentence slot i of document j.
ds = [[doc_fields[i] for doc_fields in _fields] for i in range(maxsents)]
del _fields

# Splitting the discharge summaries into a training set and a validation set.
# `texts` is the training documents followed by the validation documents, so
# each slot's column can simply be sliced at the number of training documents.
_n_train = len(texts_train)

# Training set
X_train_ds = [slot[:_n_train] for slot in ds]

# Validation set
X_val_ds = [slot[_n_train:] for slot in ds]

In [None]:
# Fit the vocabulary on the training texts only (all sentence slots flattened).
tokenizer = Tokenizer(num_words = max_features)
tokenizer.fit_on_texts([item for sublist in X_train_ds for item in sublist])

# token -> integer index, as learned by the tokenizer
word_index = tokenizer.word_index

# Assign an integer to each token that occurs in the texts.
# Each document becomes a (maxsents, maxlen) integer matrix; unused positions stay 0.

print('Computing training set...')

X_train = np.zeros((len(X_train_ds[0]), maxsents, maxlen), dtype = 'int32')

print('Loading discharge summaries ...')

for m in range(maxsents):
    for i, slot_text in enumerate(X_train_ds[m]):
        k = 0   # next free token position in X_train[i, m, :]
        for j, sent in enumerate(tokenize.sent_tokenize(slot_text)):
            # Nothing more can be written once the slot is full, and sentences
            # past the maxsents cap were always ignored.
            if j >= maxsents or k >= maxlen:
                break
            for word in text_to_word_sequence(sent):
                if k >= maxlen:
                    break
                token_id = word_index[word]
                # Keep only tokens inside the max_features vocabulary budget.
                if token_id < max_features:
                    X_train[i,m,k] = token_id
                    k = k + 1

print('Found %s unique tokens.' % len(word_index))

# The dict is stored as a pickled object array; reading it back needs
# np.load('DICT.npy', allow_pickle=True).item().
np.save('DICT.npy', word_index)

In [None]:
print('Computing validation set...')

X_val = np.zeros((len(X_val_ds[0]), maxsents, maxlen), dtype = 'int32')

print('Loading discharge summaries...')

# Newer jellyfish releases expose the metric as `jaro_winkler_similarity`;
# older ones only have `jaro_winkler`. Use whichever is available.
_jaro_winkler = getattr(jellyfish, 'jaro_winkler_similarity', None) or jellyfish.jaro_winkler

# out-of-vocabulary word -> index of its most similar known token
_oov_index_cache = {}


def _nearest_known_index(word):
    """Return the index of the vocabulary token most similar to ``word``.

    Similarity is the Jaro-Winkler score; ties are broken in favour of the
    larger index, as ``max`` over ``(score, index)`` pairs does. The result is
    memoised because scoring a word against the whole vocabulary is expensive
    and the vocabulary does not change while this cell runs.
    """
    if word not in _oov_index_cache:
        _oov_index_cache[word] = max(
            (_jaro_winkler(known, word), index) for known, index in word_index.items()
        )[1]
    return _oov_index_cache[word]


for m in range(maxsents):
    for i, slot_text in enumerate(X_val_ds[m]):
        k = 0   # next free token position in X_val[i, m, :]
        for sent in tokenize.sent_tokenize(slot_text):
            if k >= maxlen:
                break
            for word in text_to_word_sequence(sent):
                # Stop before any (possibly costly) lookup once the slot is full.
                if k >= maxlen:
                    break
                token_id = word_index.get(word)
                if token_id is None:
                    # Unknown at training time: substitute the closest known token.
                    token_id = _nearest_known_index(word)
                if token_id < max_features:
                    X_val[i,m,k] = token_id
                    k = k + 1

In [None]:
# Report the dimensions of the training and validation input tensors.
print('X_train shape: {}'.format(X_train.shape))
print('X_val shape: {}'.format(X_val.shape))

In [None]:
print('Loading pretrained word embeddings ...')

# A real run would load the pretrained matrix from disk instead:
#embedding_matrix = np.load('embeddins_matrix.npy')

# For demonstration purposes, use an embedding matrix filled with zeros.
# It has one row per entry of word_index plus one extra row, and embedding_dims
# columns. (The extra row presumably serves index 0, the value left in
# X_train/X_val wherever no token was written -- confirm that the tokenizer
# never assigns index 0 to a word.)
embedding_matrix = np.zeros((len(word_index)+1, embedding_dims))

In [None]:
# Defining the loss function as the combination of the binary cross-entropy (BCE) with the log-cosh Tversky loss (LCTL):
def LCTL(y_true, y_pred, beta):
    """Log-cosh Tversky loss, computed per sample.

    Args:
        y_true: target tensor; its first axis is the batch axis. Values are
            rounded to the nearest integer first, so soft targets such as
            0.05 count as negatives here.
        y_pred: predicted scores with the same shape as ``y_true``.
        beta: weight of the false-positive term; false negatives are weighted
            by ``1 - beta``.

    Returns:
        A tensor with one loss value per sample of the batch.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    # Align dtypes explicitly so float64 targets can be combined with float32 predictions.
    y_true = tf.cast(y_true, y_pred.dtype)

    # Collapse everything except the batch axis. tf.reshape replaces the legacy
    # tf.compat.v1.layers.flatten helper and yields the same (batch, -1) layout.
    y_true = tf.reshape(y_true, [tf.shape(y_true)[0], -1])
    y_true = tf.math.round(y_true)
    y_pred = tf.reshape(y_pred, [tf.shape(y_pred)[0], -1])

    # Soft true positives, and TP + beta * FP + (1 - beta) * FN.
    numerator = tf.reduce_sum(y_true * y_pred, axis=-1)
    denominator = y_true * y_pred + beta * (1 - y_true) * y_pred + (1 - beta) * y_true * (1 - y_pred)

    # The +1 terms smooth the ratio and avoid a division by zero.
    loss =  1 - (numerator + 1) / (tf.reduce_sum(denominator, axis=-1) + 1)
    # log(cosh(loss))
    loss = K.log((K.exp(loss) + K.exp(-loss)) / 2.0)
    return loss

# Binary cross-entropy loss object shared by every loss built below.
BCE = tf.keras.losses.BinaryCrossentropy()

def loss_function(alpha, beta):
    """Build the combined training loss.

    Args:
        alpha: weight of the binary cross-entropy term; the LCTL term is
            weighted by ``1 - alpha``.
        beta: forwarded unchanged to ``LCTL``.

    Returns:
        A ``loss(y_true, y_pred)`` callable suitable for ``model.compile``.
    """
    def loss(y_true, y_pred):
        bce_term = alpha * BCE(y_true, y_pred)
        lctl_term = (1 - alpha) * LCTL(y_true, y_pred, beta)
        return bce_term + lctl_term

    return loss

In [None]:
print('Build model...')

# Inputs
# Use the public tf.keras Input here. The bare `Input` name in this notebook is
# bound to the private `tensorflow.python.keras` package, and mixing that with
# the public tensorflow.keras layers used everywhere else breaks on TensorFlow
# releases where the two are separate implementations.
# One document = maxsents sentence slots of maxlen token ids each.
review_input = tf.keras.layers.Input(shape=(maxsents,maxlen), dtype='int32')

# Embedding Layer (shared by the sentence encoder and the embedding-average branch)
embedding_layer = Embedding(len(word_index)+1, embedding_dims,
                            input_length=maxlen,
                            weights=[embedding_matrix])

# WORD-LEVEL
sentence_input = tf.keras.layers.Input(shape=(maxlen,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)

# Bidirectional GRU
l_gru = Bidirectional(GRU(gru_output_size, return_sequences=True))(embedded_sequences)
l_dense = TimeDistributed(Dense(units=gru_output_size))(l_gru)

# Word-Level Attention Layer
# (AttLayer comes from the project-local `attention` module; it presumably
# reduces the token axis to one vector per sentence -- confirm there.)
l_att = AttLayer()(l_dense)
sentEncoder = Model(sentence_input, l_att)
# Apply the sentence encoder to every sentence slot of the document.
review_encoder = TimeDistributed(sentEncoder)(review_input)

# SENTENCE_LEVEL
# Bidirectional GRU
l_gru_sent = Bidirectional(GRU(gru_output_size, return_sequences=True))(review_encoder)
l_dense_sent = TimeDistributed(Dense(units=gru_output_size))(l_gru_sent)

# Sentence-Level Attention Layer
postp = AttLayer()(l_dense_sent)

# Embedding Average: pool the raw token embeddings over both the sentence and
# the token axes of the document.
sentEmbed = Model(sentence_input, embedded_sequences)
review_fasttext = TimeDistributed(sentEmbed)(review_input)
fasttext = GlobalAveragePooling2D()(review_fasttext)

# Join the attention-based document vector with the averaged embeddings.
postp_aux = tensorflow.keras.layers.Concatenate( axis = 1 )( [ postp , fasttext ] )

postp_aux_drop = Dropout(0.05)(postp_aux)

# NOTE: `postp` is re-bound here from the attention output to this dense projection.
postp = Dense(units=(gru_output_size+embedding_dims))(postp_aux_drop)

# Sigmoid Layer: one independent score per ICD code (multi-label output).
preds_aux = Dense(units=num_classes, activation='sigmoid')(postp)

model = Model(inputs = review_input, outputs = preds_aux)

model.compile(loss=[loss_function(alpha=0.9, beta=0.5)], optimizer='adam',
              metrics=['accuracy'])

# Train on the training tensors (y_train carries the 0.05 smoothed entries set
# earlier) and evaluate on the validation tensors after every epoch. The
# earlyStopping callback monitors val_loss, so training may end before
# nb_epoch epochs have run.
model.fit(X_train, y_train,
          validation_data=(X_val, y_val),
          epochs=nb_epoch,
          batch_size=batch_size,
          callbacks=[earlyStopping])

# Save the trained model to a single HDF5 file.
# NOTE(review): the model contains the project-local AttLayer and was compiled
# with a custom loss closure, so reloading it presumably needs
# load_model(..., custom_objects=...) or compile=False -- confirm.
model.save('model.h5')