## Preparing the Data

Adapted, with modifications, from *AI for Healthcare with Keras and TensorFlow 2.0* by Anshik Bansal (Apress, 2021).

https://github.com/Apress/ai-for-healthcare-keras-tensorflow-2.0

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Directory that holds the MIMIC-III CSV files read by this notebook.
BASEDIR = '/Volumes/ExternalData/Data/mimiciii/1.4'

# ICD-9 codes are identifiers, not numbers: they are later joined with
# "|".join(...), which requires strings. Forcing the dtype keeps type
# inference from turning all-digit codes into integers.
icd9_code = pd.read_csv(
    f"{BASEDIR}/DIAGNOSES_ICD.csv",
    index_col=None,
    dtype={"ICD9_CODE": str},
)

In [None]:
n_rows = 100000


def _discharge_reports(chunk):
    """Return the rows of *chunk* with CATEGORY "Discharge summary" and DESCRIPTION "Report"."""
    is_discharge_summary = chunk.CATEGORY.isin(["Discharge summary"])
    is_report = chunk.DESCRIPTION.isin(["Report"])
    return chunk[is_discharge_summary & is_report]


# Read NOTEEVENTS.csv lazily, n_rows rows at a time.
noteevents_iterator = pd.read_csv(
    f"{BASEDIR}/NOTEEVENTS.csv",
    iterator=True,
    chunksize=n_rows)

# Filter each chunk as it is read and stitch the surviving rows together.
noteevents = pd.concat(
    [_discharge_reports(noteevents_chunk) for noteevents_chunk in noteevents_iterator])
noteevents["HADM_ID"] = noteevents["HADM_ID"].astype(int)

In [None]:
noteevents.columns

In [None]:
# thetaphipsi

columns = ['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'CHARTTIME', 'STORETIME', 'CATEGORY', 'DESCRIPTION', 'CGID', 'ISERROR'] #, 'TEXT']

noteevents[columns].sort_values(by='SUBJECT_ID').groupby(by='SUBJECT_ID').count()

In [None]:
noteevents[noteevents.groupby(['SUBJECT_ID'])['ROW_ID'].transform('count') > 33]

In [None]:
# Report whether (SUBJECT_ID, HADM_ID) fails to identify a row uniquely.
# A plain conditional is used instead of assert/except: assertions are
# stripped when Python runs with -O, which would silently disable the check.
if noteevents.duplicated(subset=["SUBJECT_ID", "HADM_ID"]).any():
    print("There are duplicates on Primary Key Set")

In [None]:
# Seeing if discharge summaries are different for repeating (SUBJECT_ID, HADM_ID) pair.
pd.set_option('display.max_colwidth',1)
noteevents[noteevents.duplicated(subset = ["SUBJECT_ID","HADM_ID"], keep = False)].sort_values(["SUBJECT_ID"])[['SUBJECT_ID', 'HADM_ID', 'TEXT']].head(2)

In [None]:
# Parse CHARTDATE into datetimes; values that do not match the format become NaT.
# NOTE(review): the format expects a time component. If CHARTDATE holds
# date-only strings, strict format matching would coerce every value to NaT.
# Confirm against the CSV.
noteevents["CHARTDATE"] = pd.to_datetime(
    noteevents["CHARTDATE"],
    format='%Y-%m-%d %H:%M:%S',
    errors='coerce',
)

In [None]:
pd.set_option('display.max_colwidth', 50)

# Order the notes by admission and chart date, then keep only the first row
# of every (SUBJECT_ID, HADM_ID) pair and renumber the index from 0.
admission_key = ["SUBJECT_ID", "HADM_ID"]
noteevents = (
    noteevents
    .sort_values(admission_key + ["CHARTDATE"])
    .drop_duplicates(admission_key)
    .reset_index(drop=True)
)

In [None]:
# Number of distinct subjects carrying each ICD-9 code.
subjects_per_code = (
    icd9_code.groupby('ICD9_CODE')
    .agg({"SUBJECT_ID": "nunique"})
    .reset_index()
)

# Rank the codes by that count and keep the 15 most widespread ones.
ranked_codes = subjects_per_code.sort_values(['SUBJECT_ID'], ascending=False)
top_values = ranked_codes["ICD9_CODE"].tolist()[:15]

In [None]:
icd9_code = icd9_code[icd9_code.ICD9_CODE.isin(top_values)]

In [None]:
import re
import itertools

def clean_text(text):
    """Split *text* into its non-empty fragments.

    Newlines are first replaced by spaces; the text is then cut on "|" and
    each piece is cut again on "<>". Empty fragments are discarded and the
    remaining ones are returned in their original order.
    """
    fragments = []
    for piece in text.replace("\n", " ").split("|"):
        for fragment in piece.split("<>"):
            if fragment:
                fragments.append(fragment)
    return fragments


# Leading "<header>:" of a paragraph, compiled once instead of being
# re-evaluated twice for every paragraph.
_SECTION_TAG = re.compile("^(.*?):")


def _section_tags(texts):
    """Yield the leading "<header>:" tag of every blank-line-separated paragraph that has one."""
    for text in texts:
        for paragraph in text.split("\n\n"):
            tag = _SECTION_TAG.match(paragraph)
            if tag is not None:
                yield tag.group()


most_frequent_tags = list(_section_tags(noteevents.TEXT))
pd.Series(most_frequent_tags).value_counts().head(10)

In [None]:
irrelevant_tags = ["Admission Date:", "Date of Birth:", "Service:", "Attending:", "Facility:", "Medications on Admission:", "Discharge Medications:", "Completed by:",
"Dictated By:" , "Department:" , "Provider:"]

# Raw-string patterns, compiled once. The previous non-raw literals relied on
# invalid escape sequences (\d, \s, \[) that newer Python versions warn about.
_SECTION_HEADER = re.compile(r"^(.*?):")
_LINE_BREAK = re.compile(r"\n\d|\n\s+")
_BRACKETED = re.compile(r"(\[.*?\])")
_irrelevant = frozenset(irrelevant_tags)


def _section_bodies(text):
    """Return the bodies of the relevant sections of *text*, joined by "<>".

    A section is a blank-line-separated paragraph that starts with a
    "<header>:" tag. Sections whose tag is in irrelevant_tags are dropped.
    For the rest, the tag is removed and the body is re-joined with "|"
    wherever a newline is followed by a digit or by whitespace.
    """
    bodies = []
    for section in text.split("\n\n"):
        header = _SECTION_HEADER.match(section)
        if header is None or header.group() in _irrelevant:
            continue
        body = _SECTION_HEADER.sub("", section).strip()
        bodies.append("|".join(_LINE_BREAK.split(body)))
    return "<>".join(bodies)


# Drop the [...] bracketed spans, then normalise the separators with clean_text.
updated_text = [_BRACKETED.sub("", _section_bodies(text)) for text in noteevents.TEXT]

updated_text = ["|".join(clean_text(x)) for x in updated_text]

In [None]:
noteevents["CLEAN_TEXT"] = updated_text

In [None]:
print(noteevents[:1].TEXT[0])

In [None]:
noteevents[:1].CLEAN_TEXT[0]

In [None]:
# Cleaned notes that belong to subject 10.
subject_mask = noteevents['SUBJECT_ID'] == 10
df = noteevents.loc[subject_mask, ['CLEAN_TEXT']]

# Only the multi-row case is turned into a list; otherwise the squeezed
# result is shown as-is.
squeezed = df.squeeze()
texts = squeezed.to_list() if len(df) > 1 else squeezed

texts

# Training Multi-Label Model

In [None]:
!pip install scispacy

In [None]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz

In [None]:
# thetaphipsi

import scispacy
import spacy

nlp = spacy.load("en_core_sci_sm")
text = """
DISCHARGE DIAGNOSES:
 1.  Cardiorespiratory arrest.
"""
doc = nlp(text)

In [None]:
print(list(doc.sents))

In [None]:
print(doc.ents)

In [None]:
from spacy import displacy
displacy.render(next(doc.sents), style='dep', jupyter=True)

In [None]:
!pip install transformers

In [None]:
!pip install tensorflow

In [None]:
# Load Huggingface transformers
from transformers import TFBertModel,  BertConfig, BertTokenizerFast
import tensorflow as tf

# For data processing
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizerFast.from_pretrained('dmis-lab/biobert-large-cased-v1.1')

In [None]:
vocab = tokenizer.vocab.keys()

In [None]:
# Total Length
print("Total Length of Vocabulary words are : ", len(vocab))

In [None]:
import spacy
import scispacy

from scispacy.linking import EntityLinker
#nlp = spacy.load('./en_core_sci_lg')
nlp = spacy.load('./en_core_sci_lg-0.4.0/en_core_sci_lg/en_core_sci_lg-0.4.0')

In [None]:
# Register the UMLS linker in the pipeline with the intended settings
# (default match thresholds, no abbreviation resolution), then keep a handle
# to that same component. Building a separate EntityLinker and adding a
# second one by name would leave the pipeline's linker on default settings
# and create two linker instances.
nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": False, "linker_name": "umls"})
linker = nlp.get_pipe("scispacy_linker")

In [None]:
# Vocabulary entries longer than three characters: "##" continuation pieces
# with the marker stripped, followed by whole words kept intact. Slicing the
# whole words with [2:] as well would cut off their first two characters.
subword_pieces = [word[2:] for word in vocab if word.startswith("##") and len(word[2:]) > 3]
whole_words = [word for word in vocab if not word.startswith("##") and len(word) > 3]
target_vocab = subword_pieces + whole_words

In [None]:
# Run every vocabulary entry through the pipeline and gather, per entry, the
# knowledge-base candidates attached to its entities. Each candidate's first
# element is used downstream as the CUI.
# NOTE(review): EntityLinker is expected to expose candidates as `_.kb_ents`;
# `_.umls_ents` is presumed to belong to the older UMLS-only linker. Confirm
# against the installed scispacy version.
umls_concept_extracted = [
    [kb_ent for entity in doc.ents for kb_ent in entity._.kb_ents]
    for doc in nlp.pipe(target_vocab)
]

In [None]:
umls_concept_cui = [linker.kb.cui_to_entity[concepts[0][0]] for concepts in umls_concept_extracted if len(concepts) > 0]
# Capturing all the information shared from the UMLS DB in a dataframe
umls_concept_df = pd.DataFrame(umls_concept_cui)

In [None]:
umls_concept_df.to_csv("umls_concepts.csv", index = None)

In [None]:
# UMLS assigns a class name to each TXXX type identifier; a TXXX code is the
# parent type of the CUIs (the unique concept identifiers used by the UMLS KB).
#
# The SRDEF file can be obtained after logging in at
# https://www.nlm.nih.gov/research/umls/index.html
# It is also shared in the GitHub repo of the book :)
type2namemap = pd.read_csv("SRDEF", sep="|", header=None).iloc[:, :3]
type2namemap.columns = ["ClassType", "TypeID", "TypeName"]

# TypeID -> TypeName lookup, built from the two columns directly.
typenamemap = dict(zip(type2namemap["TypeID"], type2namemap["TypeName"]))

In [None]:
concept_df = pd.Series([typenamemap[typeid] for types in umls_concept_df.types for typeid in types]).value_counts().reset_index()
concept_df.columns = ["concept","count"]

In [None]:
import plotly.express as px
fig = px.pie(concept_df.head(20), values='count', names='concept', title='Count of Biomedical Concepts in BERT Pre-trained Model')
fig.show()

In [None]:
# Double counting is rare, because most concepts map to a single TXXX type id.
pd.Series([len(types) for types in umls_concept_df.types]).value_counts()

In [None]:
import pickle
with open('linker_umls.pickle', 'wb') as handle:
    pickle.dump(umls_concept_extracted, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
subword_len = [len(x.replace("##","")) for x in vocab]
token_len = [len(x) for x in vocab]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Draw the subword-length and token-length distributions side by side.
with sns.plotting_context(font_scale=2):
    fig, axes = plt.subplots(1, 2, figsize=(10, 6))
    panels = (
        (subword_len, "Subword length distribution"),
        (token_len, "Token length distribution"),
    )
    for ax, (lengths, title) in zip(axes, panels):
        # Pass the values through x= explicitly: a bare positional argument
        # is deprecated and is interpreted as `data` by newer seaborn releases.
        sns.countplot(x=lengths, palette="Set2", ax=ax)
        ax.set_title(title)
        ax.set_xlabel("Length in characters")
        ax.set_ylabel("Frequency")
    # Called once both panels are drawn.
    sns.despine()

## Preparing the Data

In [None]:
admission_keys = ["SUBJECT_ID", "HADM_ID"]

# One row per admission: collect that admission's ICD9 codes into a list.
icd9_code = icd9_code.groupby(admission_keys)["ICD9_CODE"].apply(list).reset_index()

# Attach the code lists to the notes, then drop the notes that received none
# (i.e. admissions without any of the retained top ICD9 codes).
full_data = noteevents.merge(icd9_code, how="left", on=admission_keys)
full_data = full_data.dropna(subset=["ICD9_CODE"]).reset_index(drop=True)


def _keep_long_segments(clean_text_value):
    """Join with spaces the "|"-separated segments that contain more than three words."""
    return " ".join(
        segment for segment in clean_text_value.split("|") if len(segment.split()) > 3
    )


# Keep only text segments of considerable length.
full_data["CLEAN_TEXT"] = [_keep_long_segments(x) for x in full_data["CLEAN_TEXT"]]

# Store each admission's codes as a single "|"-joined string.
full_data["ICD9_CODE"] = full_data["ICD9_CODE"].apply(lambda codes: "|".join(codes))

In [None]:
full_data.ICD9_CODE

In [None]:
full_data.to_csv("./data.csv", index = None)

In [None]:
# Flatten the "|"-joined code strings into one list holding every code occurrence.
diseases = [code for joined_codes in full_data.ICD9_CODE for code in joined_codes.split('|')]

In [None]:
len(set(diseases))

In [None]:
# Binarizing the multi- labels
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

# ICD9_CODE stores each admission's codes as one "|"-joined string. The
# binarizer needs one iterable of codes per row, so split the strings back
# into lists. Fitting on the joined strings would learn whole "a|b|c" strings
# as classes, and transforming a raw string would iterate its characters.
label_lists = full_data.ICD9_CODE.str.split("|")

mlb = MultiLabelBinarizer()
mlb_fit = mlb.fit(label_lists)

# The labels are split as per-row code lists, so that slices of train_y / val_y
# can be handed directly to mlb_fit.transform.
train_X,val_X,train_y,val_y = train_test_split(full_data[["SUBJECT_ID","HADM_ID","CLEAN_TEXT"]],label_lists.values,
                                              test_size=0.2, random_state=42)

In [None]:
mlb_fit.classes_

In [None]:
full_data

In [None]:
# Import BERT Model
from transformers import BertModel, BertConfig, TFBertModel
config = BertConfig.from_json_file('./dmis_biobert_large/config.json')
bert = TFBertModel.from_pretrained("./dmis_biobert_large/pytorch_model.bin",
                                   config = config,
                                   from_pt = True)

In [None]:
EPOCHS = 5
BATCH_SIZE = 32
MAX_LEN = 510
LR = 2e-5
NUM_LABELS = 15 # Since we have 15 classes to predict for

def df_to_dataset(dataframe,
                  dataframe_labels,
                  batch_size = BATCH_SIZE,
                  max_length = MAX_LEN,
                  tokenizer  = tokenizer):
    """
    Endlessly yield (features, labels) batches for fine-tuning a model.

    This is a Python generator, not a tf.data.Dataset. It cycles over the
    data forever, yielding only full batches; a trailing partial batch is
    never produced.

    Args:
        dataframe: frame with a CLEAN_TEXT column, sliced positionally.
        dataframe_labels: labels aligned with the rows of ``dataframe``;
            each batch slice is passed to ``mlb_fit.transform``.
        batch_size: number of rows per yielded batch.
        max_length: length every tokenized sequence is truncated/padded to.
        tokenizer: callable tokenizer applied to the list of batch texts.

    Yields:
        Tuple of (dict of numpy arrays built from the tokenizer output,
        binarized labels for the same rows).

    Raises:
        ValueError: if there are fewer rows than ``batch_size``, in which
            case no batch could ever be produced (previously this looped
            forever without yielding).
    """
    n_batches = len(dataframe) // batch_size
    if n_batches == 0:
        raise ValueError(
            "df_to_dataset needs at least batch_size rows to yield a batch "
            f"(got {len(dataframe)} rows, batch_size={batch_size})"
        )

    while True:
        for batch_idx in range(n_batches):
            start = batch_idx * batch_size
            stop = start + batch_size
            _df = dataframe.iloc[start:stop, :]
            # Tokenize the whole batch in one call; every sequence is
            # truncated or right-padded to exactly max_length tokens.
            input_df_dict = tokenizer(
                _df.CLEAN_TEXT.tolist(),
                add_special_tokens=True,
                max_length=max_length,
                truncation=True,
                return_token_type_ids=True,
                return_attention_mask=True,
                padding='max_length',
            )
            input_df_dict = {k: np.array(v) for k, v in input_df_dict.items()}
            yield input_df_dict, mlb_fit.transform(dataframe_labels[start:stop])

# Note: each yielded batch is a (features, labels) tuple; the features dict holds
# the tokenizer outputs requested above (input_ids, token_type_ids, attention_mask).

In [None]:
train_gen = df_to_dataset(train_X.reset_index(drop = True),
                        train_y)
val_gen = df_to_dataset(val_X.reset_index(drop = True),
                       val_y)

In [None]:
from tensorflow.keras import layers
def create_final_model(bert_model = bert):
    """Build the multi-label classifier: BERT encoder followed by a dense head.

    The model takes the three tokenizer outputs (input_ids, attention_mask,
    token_type_ids), each of length MAX_LEN, and outputs NUM_LABELS sigmoid
    activations.
    """
    def _token_input(name):
        # One int32 input of length MAX_LEN, named after the tokenizer key it receives.
        return layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name=name)

    input_ids = _token_input('input_ids')
    token_type_ids = _token_input('token_type_ids')
    attention_mask = _token_input('attention_mask')
    model_inputs = {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'token_type_ids': token_type_ids,
    }

    # Element 1 of the BERT output is used as the sentence-level embedding
    # (the pooled [CLS] output).
    hidden = bert_model(dict(model_inputs))[1]

    # Classification head: two SELU layers, light dropout, sigmoid per label.
    for units in (512, 256):
        hidden = layers.Dense(units, activation='selu')(hidden)
    hidden = layers.Dropout(rate=0.1)(hidden)
    label_probabilities = layers.Dense(NUM_LABELS, activation='sigmoid')(hidden)

    return tf.keras.models.Model(inputs=model_inputs, outputs=label_probabilities)

In [None]:
model = create_final_model(bert_model = bert)

In [None]:
# Freeze every BERT layer so that only the custom head is trained, at least
# for the first few epochs; the whole network can be unfrozen afterwards.
# The loop variable is called `layer` so that it does not overwrite the
# imported `layers` module (tensorflow.keras.layers) used by create_final_model.
for layer in bert.layers:
    print(layer.name)
    layer.trainable = False

In [None]:
model.summary()

In [None]:
model.compile(optimizer= tf.keras.optimizers.Adam(learning_rate=LR),
              loss='binary_crossentropy',
              metrics=['AUC'])

In [None]:
# The following can also be run on a GPU device.

In [None]:
history = model.fit(train_gen,
                    steps_per_epoch=len(train_X)//BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_data=val_gen,
                    validation_steps=len(val_X)//BATCH_SIZE)

In [None]:
tf.__version__

In [None]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

# Loading PyTorch Model from TF Checkpoint

In [None]:
import os

import numpy as np
import tensorflow as tf
import torch

from transformers import BertModel

In [None]:
def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str):

    """
    Write the weights of a PyTorch BertModel as a TensorFlow checkpoint.

    Every entry of ``model.state_dict()`` is renamed to a "bert/..." variable
    name, copied into a TensorFlow variable (transposed where required) and
    saved under ``ckpt_dir``.

    Args:
        model: BertModel Pytorch model instance to be converted
        ckpt_dir: Tensorflow model directory (created if it does not exist)
        model_name: model name; "-" is replaced by "_" and ".ckpt" is
            appended to form the checkpoint file name
    Currently supported HF models:
        - Y BertModel
        - N BertForMaskedLM
        - N BertForPreTraining
        - N BertForMultipleChoice
        - N BertForNextSentencePrediction
        - N BertForSequenceClassification
        - N BertForQuestionAnswering
    """

    # Tensors whose PyTorch name contains one of these substrings are transposed before being stored.
    tensors_to_transpose = ("dense.weight", "attention.self.query", "attention.self.key", "attention.self.value")

    # Ordered (pattern, replacement) pairs applied one after another to each PyTorch name.
    # Order matters: "." becomes "/" before the "LayerNorm/..." patterns are applied.
    var_map = (
        ("layer.", "layer_"),
        ("word_embeddings.weight", "word_embeddings"),
        ("position_embeddings.weight", "position_embeddings"),
        ("token_type_embeddings.weight", "token_type_embeddings"),
        (".", "/"),
        ("LayerNorm/weight", "LayerNorm/gamma"),
        ("LayerNorm/bias", "LayerNorm/beta"),
        ("weight", "kernel"),
    )

    if not os.path.isdir(ckpt_dir):
        os.makedirs(ckpt_dir)

    state_dict = model.state_dict()

    def to_tf_var_name(name: str):
        """Translate a PyTorch parameter name into its "bert/..." TensorFlow variable name."""
        for patt, repl in iter(var_map):
            name = name.replace(patt, repl)
        return "bert/{}".format(name)

    def create_tf_var(tensor: np.ndarray, name: str, session: tf.compat.v1.Session):
        """Create and initialize a zero-filled variable with the dtype and shape of *tensor*."""
        tf_dtype = tf.dtypes.as_dtype(tensor.dtype)
        tf_var = tf.compat.v1.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer())
        session.run(tf.compat.v1.variables_initializer([tf_var]))
        session.run(tf_var)
        return tf_var

    # NOTE(review): this relies on the TF1-style graph/session API through tf.compat.v1;
    # confirm it behaves as intended when eager execution is enabled.
    tf.compat.v1.reset_default_graph()
    with tf.compat.v1.Session() as session:
        for var_name in state_dict:
            tf_name = to_tf_var_name(var_name)
            torch_tensor = state_dict[var_name].numpy()
            if any([x in var_name for x in tensors_to_transpose]):
                torch_tensor = torch_tensor.T
            tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session)
            tf.keras.backend.set_value(tf_var, torch_tensor)
            # Read the value back and report whether it matches the source tensor.
            tf_weight = session.run(tf_var)
            print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor)))

        # Persist all trainable variables of the graph as <ckpt_dir>/<model_name with "-" -> "_">.ckpt
        saver = tf.compat.v1.train.Saver(tf.compat.v1.trainable_variables())
        saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt"))
   

In [None]:
model = BertModel.from_pretrained(
        pretrained_model_name_or_path="dmis-lab/biobert-large-cased-v1.1",
#         state_dict=torch.load("./dmis-lab_biobert-large-cased-v1.1/pytorch_model.bin"),
        cache_dir="./dmis-lab_biobert-large-cased-v1.1")

convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir="./tf_dmis-lab_biobert-large-cased-v1.1", model_name="tf_biobert-large-cased-v1.1")


In [None]:
# Import BERT Model from TF Checkpoint
from transformers import BertModel, BertConfig
config = BertConfig.from_json_file('./config.json')
bert = BertModel.from_pretrained("./tf_dmis-lab_biobert-large-cased-v1.1/tf_biobert_large_cased_v1.1.ckpt.index",
                                   from_tf=True,
                                config = config)

In [None]:
#from transformers import AutoTokenizer, AutoModelForMaskedLM

#tokenizer = AutoTokenizer.from_pretrained("jamesmullenbach/CLIP_TTP_BERT_Context_250k")
#model = AutoModelForMaskedLM.from_pretrained("jamesmullenbach/CLIP_TTP_BERT_Context_250k")

from transformers import AutoTokenizer, AutoModel
  
tokenizer = AutoTokenizer.from_pretrained("jamesmullenbach/CLIP_DNote_BERT_Context")
model = AutoModel.from_pretrained("jamesmullenbach/CLIP_DNote_BERT_Context")

inputs = tokenizer("Hello world!", return_tensors="pt")
outputs = model(**inputs)

In [None]:
inputs

In [None]:
outputs