In [0]:
# Install the third-party packages this notebook needs. No version is pinned,
# so whatever pip resolves at run time is what gets used.
!pip install transformers
!pip install seqeval
!pip install pytorch-crf
import sys
from google.colab import drive
# Mount Google Drive inside the Colab VM at /content/drive.
drive.mount('/content/drive')
# NOTE: the path below is relative ('./drive/...'), so it only points into the
# mount above when the working directory is /content.
# Presumably this folder holds bert_ner_utils.py; adding it to sys.path is what
# makes the import on the next line resolvable -- TODO confirm.
sys.path.append('./drive/My Drive/MLHC Final Project/final_experiments/')
# Wildcard import: presumably the source of DataProcessorI2B2 and ModelingI2B2
# used in later cells (nothing else in this notebook defines them).
from bert_ner_utils import *

In [0]:
# Hyper-parameters of the UMLS post-pretraining (warm-start) run.
# Nothing is trained with these values here: they only have to mirror the run
# whose saved model should be loaded, so update them whenever a different
# post-pretrained model is wanted.

# --- values that end up in the saved model's folder name -------------------
N_MIMIC_NOTES_UMLS = 1000
BATCH_SIZE_UMLS = 64
EPOCHS_UMLS = 4
LEARNING_RATE_UMLS = 2e-5
USE_GRAD_CLIPPING_UMLS = True  # True: gradient clipping was enabled

# --- remaining settings of that run (not part of the folder name) ----------
MAX_LEN_SENT_WITHOUT_SPECIAL_TOKENS_UMLS = 126
MAX_GRAD_NORM_UMLS = 2.0
# True: all layers were fine-tuned, False: only the classifier layers
FULL_FINETUNING_UMLS = True

In [0]:
# Modeling Parameters for i2b2
BATCH_SIZE = 16
# Sentence length budget before special tokens are added; it is also embedded
# in both data folder names below, so changing it switches data folders too.
MAX_LEN_SENT_WITHOUT_SPECIAL_TOKENS = 126
LEARNING_RATE = 2e-5
EPOCHS = 3
FULL_FINETUNING = True  # True: fine tuning all the layers, False: only fine tuning the classifier layers
USE_GRAD_CLIPPING = False  # If True then we use it
MAX_GRAD_NORM = 2.0
USE_BI_LSTM_CRF = False
HIDDEN_SIZE_LSTM = 768  # presumably only relevant when USE_BI_LSTM_CRF is True -- TODO confirm

# General Parameters:
DRIVE_BASE_PATH = "./drive/My Drive/MLHC Final Project"
# Folder holding the tokenized tensors. The length suffix is derived from
# MAX_LEN_SENT_WITHOUT_SPECIAL_TOKENS (it used to be a hard-coded "126") so a
# change of the max length can never silently reuse tensors that were
# tokenized with another length. With the current value the path is unchanged.
PATH_TO_SAVE_TOKENIZED_DATA_I2B2 = (
    f"{DRIVE_BASE_PATH}/final_experiments/" +
    f"tokenized_data_i2b2_{MAX_LEN_SENT_WITHOUT_SPECIAL_TOKENS}/"
)
# Folder with the formatted i2b2 data for the same sentence length.
PATH_I2B2_FOLDER = (
  f"{DRIVE_BASE_PATH}/Formatted i2b2 Data/" +
  f"processed_data_sent_len_{MAX_LEN_SENT_WITHOUT_SPECIAL_TOKENS}/"
)
# Disabled alternative model location, kept for reference:
#PATH_CLINICAL_BERT_MODEL = "emilyalsentzer/Bio_ClinicalBERT"

# Folder of the UMLS warm-start model to load. Its name encodes the *_UMLS
# hyper-parameters, so those constants must match the run that saved it.
PATH_CLINICAL_BERT_UMLS_WARM_START_MODEL = (
    f"{DRIVE_BASE_PATH}/final_experiments/umls_warm_start_models/"
    f"umls_warm_start_model_{N_MIMIC_NOTES_UMLS}_notes"
    f"_BS_{BATCH_SIZE_UMLS}"
    f"_epochs_{EPOCHS_UMLS}"
    f"_lr_{LEARNING_RATE_UMLS}"
    f"_gc_{USE_GRAD_CLIPPING_UMLS}/"
)

# Folder for the i2b2 model fine-tuned from that warm start. Its name encodes
# the i2b2 hyper-parameters of this notebook.
PATH_SAVE_CLINICAL_BERT_I2B2_WITH_WARM_START = (
    f"{DRIVE_BASE_PATH}/final_experiments/i2b2_models_with_warm_start/"
    f"i2b2_with_warm_start_model"
    f"_BS_{BATCH_SIZE}"
    f"_epochs_{EPOCHS}"
    f"_lr_{LEARNING_RATE}"
    f"_gc_{USE_GRAD_CLIPPING}/"
)

# 1. Processing, Saving and Loading Data

In [0]:
# Run this cell only when the i2b2 dataset has not yet been preprocessed and
# stored as tokenized tensors; if it has, skip ahead to the next cell.
data_processor_i2b2 = DataProcessorI2B2(
    path_i2b2_folder=PATH_I2B2_FOLDER,
    path_clinical_bert_model=PATH_CLINICAL_BERT_UMLS_WARM_START_MODEL,
    max_len_sent_without_special_tokens=MAX_LEN_SENT_WITHOUT_SPECIAL_TOKENS,
    batch_size=BATCH_SIZE,
)
train_dataloader, dev_dataloader, test_dataloader = (
    data_processor_i2b2.get_train_dev_and_test_dataloader_i2b2()
)

# One row per split: (name, input ids, attention masks, tokenized labels),
# all read from the processor after the dataloaders were built above.
tokenized_splits = (
    (
        "train",
        data_processor_i2b2.input_ids_train,
        data_processor_i2b2.attention_masks_train,
        data_processor_i2b2.labels_tokenized_train,
    ),
    (
        "dev",
        data_processor_i2b2.input_ids_dev,
        data_processor_i2b2.attention_masks_dev,
        data_processor_i2b2.labels_tokenized_dev,
    ),
    (
        "test",
        data_processor_i2b2.input_ids_test,
        data_processor_i2b2.attention_masks_test,
        data_processor_i2b2.labels_tokenized_test,
    ),
)

# Store every split under PATH_TO_SAVE_TOKENIZED_DATA_I2B2, in train/dev/test order.
for data_part, input_ids, attention_masks, labels_tokenized in tokenized_splits:
    data_processor_i2b2.save_tokenized_sentences_labels_attention_masks(
        input_ids=input_ids,
        attention_masks=attention_masks,
        labels_tokenized=labels_tokenized,
        path_to_save_tokenized_data_i2b2=PATH_TO_SAVE_TOKENIZED_DATA_I2B2,
        data_part=data_part,
    )

In [28]:
# To load preprocessed data directly as tensors and create dataloaders
data_processor_i2b2 = DataProcessorI2B2(
    path_i2b2_folder=PATH_I2B2_FOLDER,
    path_clinical_bert_model=PATH_CLINICAL_BERT_UMLS_WARM_START_MODEL,
    max_len_sent_without_special_tokens=MAX_LEN_SENT_WITHOUT_SPECIAL_TOKENS,
    batch_size=BATCH_SIZE
)

# Share of the tokenized training split that is used for fine-tuning. The
# leading examples are kept and the remainder is left out. This was an
# unnamed 0.7 before; set it to 1.0 to train on the whole split.
TRAIN_SUBSET_FRACTION = 0.7
if not 0 < TRAIN_SUBSET_FRACTION <= 1:
    raise ValueError(
        f"TRAIN_SUBSET_FRACTION must be in (0, 1], got {TRAIN_SUBSET_FRACTION}"
    )


def _load_tokenized_split(data_processor, data_part):
    """Load one saved split from PATH_TO_SAVE_TOKENIZED_DATA_I2B2.

    Args:
        data_processor: the DataProcessorI2B2 instance doing the loading.
        data_part: split name, one of "train", "dev" or "test".

    Returns:
        Whatever the processor returns for the split; the callers below unpack
        it as (input_ids, attention_masks, labels_tokenized).
    """
    return data_processor.load_tokenized_sentences_labels_attention_masks(
        path_to_save_tokenized_data_i2b2=PATH_TO_SAVE_TOKENIZED_DATA_I2B2,
        data_part=data_part,
    )


def _build_dataloader(data_processor, input_ids, attention_masks,
                      labels_tokenized, data_part):
    """Build the dataloader of one split from already loaded tokenized data."""
    return data_processor.get_train_dev_and_test_dataloader_i2b2_from_loaded_tokenized_data(
        input_ids=input_ids,
        attention_masks=attention_masks,
        labels_tokenized=labels_tokenized,
        data_part=data_part,
    )


# Train Dataloader (restricted to the first TRAIN_SUBSET_FRACTION of the split)
input_ids_train, attention_masks_train, labels_tokenized_train = (
    _load_tokenized_split(data_processor_i2b2, "train")
)
length_cut = int(TRAIN_SUBSET_FRACTION * len(input_ids_train))
train_dataloader_cut = _build_dataloader(
    data_processor_i2b2,
    input_ids_train[:length_cut],
    attention_masks_train[:length_cut],
    labels_tokenized_train[:length_cut],
    "train",
)

# Dev Dataloader
input_ids_dev, attention_masks_dev, labels_tokenized_dev = (
    _load_tokenized_split(data_processor_i2b2, "dev")
)
dev_dataloader = _build_dataloader(
    data_processor_i2b2,
    input_ids_dev,
    attention_masks_dev,
    labels_tokenized_dev,
    "dev",
)

# Test Dataloader
input_ids_test, attention_masks_test, labels_tokenized_test = (
    _load_tokenized_split(data_processor_i2b2, "test")
)
test_dataloader = _build_dataloader(
    data_processor_i2b2,
    input_ids_test,
    attention_masks_test,
    labels_tokenized_test,
    "test",
)

Loaded train tokenized data for i2b2 from folder: ./drive/My Drive/MLHC Final Project/final_experiments/tokenized_data_i2b2_126/
Loaded dev tokenized data for i2b2 from folder: ./drive/My Drive/MLHC Final Project/final_experiments/tokenized_data_i2b2_126/
Loaded test tokenized data for i2b2 from folder: ./drive/My Drive/MLHC Final Project/final_experiments/tokenized_data_i2b2_126/


# 2. Modeling & Experiments i2b2 2010

In [0]:
# Collect the constructor arguments first, grouped by purpose, then hand them
# to ModelingI2B2 in a single unpacking call.
modeling_kwargs_i2b2 = dict(
    # data (note: the reduced training dataloader is the one used here)
    train_dataloader=train_dataloader_cut,
    dev_dataloader=dev_dataloader,
    test_dataloader=test_dataloader,
    batch_size=BATCH_SIZE,
    max_len_sent_without_special_tokens=MAX_LEN_SENT_WITHOUT_SPECIAL_TOKENS,
    # optimisation
    learning_rate=LEARNING_RATE,
    epochs=EPOCHS,
    max_grad_norm=MAX_GRAD_NORM,
    use_grad_clipping=USE_GRAD_CLIPPING,
    full_finetuning=FULL_FINETUNING,
    # architecture
    use_bi_lstm_crf=USE_BI_LSTM_CRF,
    hidden_size_lstm=HIDDEN_SIZE_LSTM,
    # model locations: warm-start input and folder for the i2b2 result
    path_clinical_bert_umls_warm_start_model=PATH_CLINICAL_BERT_UMLS_WARM_START_MODEL,
    path_save_clinical_bert_i2b2=PATH_SAVE_CLINICAL_BERT_I2B2_WITH_WARM_START,
)
modeling_i2b2 = ModelingI2B2(**modeling_kwargs_i2b2)

# train_or_not=True presumably runs the fine-tuning instead of skipping it --
# TODO confirm against bert_ner_utils.
model_ner_clinbert_i2b2 = modeling_i2b2.train_model_i2b2(train_or_not=True)

In [0]:
# Persist the fine-tuned i2b2 model together with the tokenizer held by the
# data processor (target folder is presumably the save path given to
# ModelingI2B2 -- TODO confirm).
tokenizer_to_save = data_processor_i2b2.tokenizer_clinbert
modeling_i2b2.save_model_ner_clinbert_i2b2_with_warm_start(
    tokenizer_clinbert_i2b2_with_warm_start=tokenizer_to_save,
    model_ner_clinbert_i2b2_with_warm_start=model_ner_clinbert_i2b2,
)