In [0]:
!pip install transformers
!pip install seqeval
!pip install pytorch-crf
import sys
from google.colab import drive
drive.mount('/content/drive')
sys.path.append('./drive/My Drive/MLHC Final Project/final_experiments/')
from bert_ner_utils import *

In [0]:
# ---------------------------------------------------------------------------
# Modeling parameters
# ---------------------------------------------------------------------------
BATCH_SIZE = 64
# Sentence length budget, not counting special tokens (per the name).
MAX_LEN_SENT_WITHOUT_SPECIAL_TOKENS = 126
LEARNING_RATE = 3e-5
EPOCHS = 4
# True  -> all layers are fine-tuned.
# False -> only the classifier layers are fine-tuned.
FULL_FINETUNING = True
# Gradient clipping is applied only when this flag is True.
USE_GRAD_CLIPPING = False
# Presumably the clipping threshold used when USE_GRAD_CLIPPING is on --
# TODO confirm in ModelingUMLS.
MAX_GRAD_NORM = 2.0
USE_BI_LSTM_CRF = False
HIDDEN_SIZE_LSTM = 512

# ---------------------------------------------------------------------------
# General parameters: dataset size and locations on the mounted Drive
# ---------------------------------------------------------------------------
DRIVE_BASE_PATH = "./drive/My Drive/MLHC Final Project"
N_MIMIC_NOTES = 2000

# Folder for the tokenized UMLS data; its name encodes the number of notes and
# the maximum sentence length.
PATH_TO_SAVE_TOKENIZED_DATA_UMLS = (
    f"{DRIVE_BASE_PATH}/final_experiments/"
    f"tokenized_data_umls_{N_MIMIC_NOTES}_notes"
    f"_max_len_{MAX_LEN_SENT_WITHOUT_SPECIAL_TOKENS}/"
)

# CSV with the preprocessed UMLS/MIMIC data for the chosen number of notes and
# maximum sentence length.
PATH_UMLS_MIMIC_DATA = (
    f"{DRIVE_BASE_PATH}/Final Preprocessed Data/"
    f"{N_MIMIC_NOTES}_notes_discharge_umls_clean_length_"
    f"{MAX_LEN_SENT_WITHOUT_SPECIAL_TOKENS}.csv"
)

# Identifier of the pretrained Clinical BERT checkpoint.
PATH_CLINICAL_BERT_MODEL = "emilyalsentzer/Bio_ClinicalBERT"

# Output folder for the UMLS warm-start model. The folder name records the
# number of notes, batch size, epochs, learning rate, gradient-clipping flag,
# BiLSTM-CRF flag and LSTM hidden size. FULL_FINETUNING and MAX_GRAD_NORM are
# not part of the name.
PATH_SAVE_CLINICAL_BERT_UMLS_WARM_START = (
    f"{DRIVE_BASE_PATH}/final_experiments/umls_warm_start_models/"
    f"umls_warm_start_model_{N_MIMIC_NOTES}_notes"
    f"_BS_{BATCH_SIZE}"
    f"_epochs_{EPOCHS}"
    f"_lr_{LEARNING_RATE}"
    f"_gc_{USE_GRAD_CLIPPING}"
    f"_use_bilstmcrf_{USE_BI_LSTM_CRF}"
    f"_hs_{HIDDEN_SIZE_LSTM}/"
)

# 1. Processing, Saving and Loading Data

In [0]:
# First-time path: run this cell only when the UMLS dataset has not yet been
# preprocessed and saved as tokenized tensors. If it has, skip to the next
# cell, which loads the saved data instead.
data_processor_umls = DataProcessorUMLS(
    batch_size=BATCH_SIZE,
    max_len_sent_without_special_tokens=MAX_LEN_SENT_WITHOUT_SPECIAL_TOKENS,
    path_clinical_bert_model=PATH_CLINICAL_BERT_MODEL,
    path_umls_mimic_data=PATH_UMLS_MIMIC_DATA,
)

# 90% of the data is used for training; the remainder is for validation.
train_dataloader, valid_dataloader = data_processor_umls.get_train_and_valid_dataloader(
    train_size=0.9
)

# Store the processor's token ids, attention masks and tokenized labels under
# PATH_TO_SAVE_TOKENIZED_DATA_UMLS so later sessions can load them directly.
data_processor_umls.save_tokenized_sentences_labels_attention_masks(
    path_to_save_tokenized_data_umls=PATH_TO_SAVE_TOKENIZED_DATA_UMLS,
    input_ids=data_processor_umls.input_ids,
    attention_masks=data_processor_umls.attention_masks,
    labels_tokenized=data_processor_umls.labels_tokenized,
)

In [0]:
# Cached path: load the already-preprocessed data directly as tensors from
# PATH_TO_SAVE_TOKENIZED_DATA_UMLS and build the dataloaders from them.
data_processor_umls = DataProcessorUMLS(
    batch_size=BATCH_SIZE,
    max_len_sent_without_special_tokens=MAX_LEN_SENT_WITHOUT_SPECIAL_TOKENS,
    path_clinical_bert_model=PATH_CLINICAL_BERT_MODEL,
    path_umls_mimic_data=PATH_UMLS_MIMIC_DATA,
)

(
    input_ids,
    attention_masks,
    labels_tokenized,
) = data_processor_umls.load_tokenized_sentences_labels_attention_masks(
    path_to_save_tokenized_data_umls=PATH_TO_SAVE_TOKENIZED_DATA_UMLS
)

# Same 90/10 train/validation proportion as the first-time preprocessing cell.
(
    train_dataloader,
    valid_dataloader,
) = data_processor_umls.get_train_and_valid_dataloader_from_loaded_tokenized_data(
    input_ids=input_ids,
    attention_masks=attention_masks,
    labels_tokenized=labels_tokenized,
    train_size=0.9,
)

# 2. Modeling & Experiments UMLS/MIMIC III

In [0]:
# Set up the UMLS training run from the dataloaders built above and the
# settings defined in the configuration cell.
modeling_umls = ModelingUMLS(
    # Data
    train_dataloader=train_dataloader,
    valid_dataloader=valid_dataloader,
    batch_size=BATCH_SIZE,
    max_len_sent_without_special_tokens=MAX_LEN_SENT_WITHOUT_SPECIAL_TOKENS,
    # Optimization
    learning_rate=LEARNING_RATE,
    epochs=EPOCHS,
    use_grad_clipping=USE_GRAD_CLIPPING,
    max_grad_norm=MAX_GRAD_NORM,
    full_finetuning=FULL_FINETUNING,
    # Architecture
    use_bi_lstm_crf=USE_BI_LSTM_CRF,
    hidden_size_lstm=HIDDEN_SIZE_LSTM,
    # Checkpoint to start from and output location
    path_clinical_bert_model=PATH_CLINICAL_BERT_MODEL,
    path_save_clinical_bert_umls=PATH_SAVE_CLINICAL_BERT_UMLS_WARM_START,
)

# Run training and keep the returned model; the next cell saves it.
model_ner_clinbert_umls_warm_start = modeling_umls.train_model_umls()

In [0]:
# Save the trained UMLS warm-start model in a folder, together with the
# tokenizer held by the data processor.
# NOTE(review): the destination is presumably the `path_save_clinical_bert_umls`
# given to ModelingUMLS -- confirm in bert_ner_utils.
modeling_umls.save_model_ner_clinbert_umls_warm_start(
    tokenizer_clinbert_umls_warm_start=data_processor_umls.tokenizer_clinbert,
    model_ner_clinbert_umls_warm_start=model_ner_clinbert_umls_warm_start,
)