In [12]:
!pip install torch fairseq transformers dill fastDamerauLevenshtein tensorboardX accelerate textdistance

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting textdistance
  Downloading textdistance-4.5.0-py3-none-any.whl (31 kB)
Installing collected packages: textdistance
Successfully installed textdistance-4.5.0


In [3]:
from datetime import datetime
import numpy as np

from tqdm import tqdm

from data.dataset import TypoDataset, TypoOnlineDataset
from model.char_lm import CharacterLanguageModel, CharTokenizer, CrossEntropyLoss, Trie
from model.parallel import DataParallelModel, DataParallelCriterion
from utils.checkpoint_manager import CheckPointManager
from transformers import AutoConfig, AutoModelForMaskedLM
from fastDamerauLevenshtein import damerauLevenshtein
from torch.utils.tensorboard import SummaryWriter


In [4]:
import os
from model.word_lm import SpellCorrectionModel
from model.char_lm import CharTokenizer
from data.dataset import TypoDataset
import torch

In [5]:
model = SpellCorrectionModel(config_file="/bert_config.json")
typo_tokenizer = CharTokenizer()
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)
writer = SummaryWriter(log_dir='logs')


Some weights of the model checkpoint at ./bert/ncbi_bert_base/pytorch_model.bin were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
dataset_train = TypoOnlineDataset("data/mimic3/split", "data/lexicon/lexicon_en.json", model.tokenizer, typo_tokenizer,2)
dataset_val = TypoDataset(os.path.join("data/mimic_synthetic", 'val.tsv'), model.tokenizer, typo_tokenizer)

Read file data/mimic_synthetic/val.tsv... 10000 rows
Parsing rows (10 processes)


100%|██████████| 10000/10000 [00:09<00:00, 1101.42it/s]


In [7]:
'''
Loss function with misspelling penalty
'''
def loss_function(probabilities, correct_label, predicted_spellings, correct_spellings):
    # Calculate cross-entropy loss
    loss = torch.nn.functional.cross_entropy(probabilities.view(-1, model.tokenizer.vocab_size), correct_label.view(-1))
    #Compute fastDamerauLevenshtein distance between predicted and correct spellings
    distance = damerauLevenshtein(' '.join(predicted_spellings), ' '.join(correct_spellings))
    #print(distance)
    total_loss = loss + 0.5 * distance
    
    return total_loss

In [8]:
BATCH_SIZE = 100
dataloader_train = torch.utils.data.DataLoader(dataset_train,
                                                batch_size=BATCH_SIZE,
                                                num_workers=0,
                                                collate_fn=dataset_train.get_collate_fn())

dataloader_val = torch.utils.data.DataLoader(dataset_val,
                                                     batch_size=BATCH_SIZE,
                                                     shuffle=False,
                                                     drop_last=True,
                                                     num_workers=0,
                                                     collate_fn=dataset_val.get_collate_fn())

In [9]:
from torch.optim import AdamW
from accelerate import Accelerator

num_epochs = 5
warmup_proportion = 0.1
num_training_steps = 10000

accelerator = Accelerator()
optimizer = AdamW(model.parameters(), lr=5e-5)
total_steps = num_training_steps * num_epochs
warmup_steps = int(warmup_proportion * total_steps)

model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, dataloader_train, dataloader_val
)


In [10]:
from transformers import get_scheduler

output_dir = "bluebert-finetuned-mimic-v1"
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=warmup_steps, 
    num_training_steps=total_steps
   )


In [11]:
from tqdm.auto import tqdm
import torch
import math



model.train()
criteria = CrossEntropyLoss().to(device)

train_iter = iter(dataloader_train)
MODE = "Train"
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    progress_bar = tqdm(range(1000))
    global_step = 0

    while global_step < 1000:
        # Training
        if MODE == "Train":
            batch = next(train_iter)
            #print(batch)
            input_ids = batch["context_tokens"]
            attention_mask = batch['context_attention_mask']
            #print(attention_mask.dtype)
            misspelling = batch['typo']
            correct_spelling = batch['correct']
            
            outputs, prediction = model.forward(input_ids, attention_mask, misspelling, correct_spelling)
            
            # Compute the loss
            loss = model.loss
            train_loss += loss.item()
            accelerator.backward(loss)


            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(BATCH_SIZE)
            # if global_step % 2 == 0:
            #     print(f' Training: Epoch {global_step/BATCH_SIZE}, Batch {global_step}, Loss {loss.item()} Did Reduce: {prev > loss.item()}')
            #     prev = loss.item()
                
        # Save and store
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.bert.save_pretrained(output_dir, save_function=accelerator.save)
        unwrapped_model.config.save_pretrained(output_dir, save_function=accelerator.save)
        unwrapped_model.tokenizer.save_pretrained(output_dir, save_function=accelerator.save)

        if accelerator.is_main_process:
            model.bert.save_pretrained(output_dir)
            model.config.save_pretrained(output_dir)
            model.tokenizer.save_pretrained(output_dir)
            lr_scheduler.step()
        global_step += BATCH_SIZE
    train_loss /= (1000/BATCH_SIZE)
    writer.add_scalar('Loss', train_loss, global_step=epoch)

    print(f"Epoch {epoch+1}, train loss: {train_loss:.4f}")

    

  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 1, train loss: 0.0357


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 2, train loss: 0.0373


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 3, train loss: 0.0350


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 4, train loss: 0.0376


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 5, train loss: 0.0350


In [59]:
output_dir

'bluebert-finetuned-mimic-v1'

In [None]:
output_dir = "bluebert-finetuned-mimic"
model = SpellCorrectionModel(NCBI_BERT = output_dir, config_file= "/config.json")
model.to(device)

SpellCorrectionModel(
  (bert): BertForMaskedLM(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=Tru

In [52]:
target_ids = torch.tensor(model.tokenizer.convert_tokens_to_ids(correct_spelling))
label[1]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='mps:0')

In [51]:
label.shape, target_ids.shape, input_ids.shape

(torch.Size([100, 130]), torch.Size([100]), torch.Size([100, 128]))

In [37]:
# Evaluation
val_iter = iter(dataloader_val)
global_step = 0


model.eval()
progress_bar = tqdm(range(num_training_steps))
global_total = 0
global_correct = 0
while global_step < num_training_steps:
    # Evaluation
    try:
        batch_val = next(val_iter)
    except StopIteration:
        val_iter = iter(dataloader_val)
        batch_val = next(val_iter)
    input_ids = batch_val["context_tokens"]
    attention_mask = batch_val['context_attention_mask']
    misspelling = batch_val['typo']
    correct_spelling = batch_val['correct']
    
    with torch.no_grad():
        outputs,  prediction, label = model.forward(input_ids, attention_mask, misspelling)
    global_total += len(correct_spelling)
    current_correct = 0
    for index in range(len(correct_spelling)):
        if correct_spelling[index] == prediction[index]:
            global_correct +=1
        
    loss = loss_function(outputs, label, misspelling, correct_spelling).to(device)
    loss = loss.requires_grad_(True)
    accelerator.backward(loss)

    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)
    if global_step %50 == 0:
        print(f'Total/Correct = {global_total} / {global_correct}')
    global_step+=1

  0%|          | 0/10000 [00:00<?, ?it/s]

TypeError: SpellCorrectionModel.forward() missing 1 required positional argument: 'correct_spelling'

In [242]:
from transformers import pipeline
model_trained = AutoModelForMaskedLM.from_pretrained("./bluebert-finetuned-mimic")

mask_filler = pipeline(
    "fill-mask", model=output_dir, top_k= 10
)

preds = mask_filler("AORTA: Mildly dilated aortic [MASK]. Focal calcifications in aortic root.")

for pred in preds:
    print(f">>> {pred['sequence']}")
    

>>> aorta : mildly dilated aortic root. focal calcifications in aortic root.
>>> aorta : mildly dilated aortic arch. focal calcifications in aortic root.
>>> aorta : mildly dilated aortic diameter. focal calcifications in aortic root.
>>> aorta : mildly dilated aorticrta. focal calcifications in aortic root.
>>> aorta : mildly dilated aortic ascending. focal calcifications in aortic root.
>>> aorta : mildly dilated aortic cavity. focal calcifications in aortic root.
>>> aorta : mildly dilated aortic ao. focal calcifications in aortic root.
>>> aorta : mildly dilated aortic roots. focal calcifications in aortic root.
>>> aorta : mildly dilated aortic artery. focal calcifications in aortic root.
>>> aorta : mildly dilated aortic valve. focal calcifications in aortic root.


In [80]:
30522/14

2180.1428571428573

In [None]:
damerauLevenshtein("she", "moom")

0.0

In [None]:
model.tokenizer.cls_token_id

101

In [None]:
dataset_iter = iter(train_dataloader)


In [None]:
dataset_train.bert_tokenizer

BertTokenizerFast(name_or_path='/Users/tamuneke/UIUC/DLH/cim-misspelling-dlh-repro/bert/ncbi_bert_base/', vocab_size=30522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)