In [1]:
%%capture
!pip install transformers

In [2]:
%%capture
from google.colab import drive
from pandas import DataFrame
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertForTokenClassification
from torch.utils.data import Dataset, DataLoader
from torch.optim import SGD,Adam
from tqdm import tqdm

import pandas as pd
import numpy as np
import torch, time, os

drive.mount('/content/drive')
folder = "/content/drive/MyDrive/NERforMedicalRecords/"
bert  = "dbmdz/bert-base-italian-xxl-cased"


In [3]:
tokenizer = AutoTokenizer.from_pretrained(bert)

In [4]:
# list of file to take into account
datasets = [folder+"Corpus/anamnesi.a.iob", folder+"Corpus/anamnesi.b.iob", folder+"Corpus/esami.a.iob", folder+"Corpus/esami.b.iob"]

# label to entity tag to recognize
labels_to_ids = {'B-ACTI': 0, 'B-BODY': 1, 'B-DISO': 2, 'B-DRUG': 3, 'B-SIGN': 4, 'B-TREA': 5, 'I-ACTI': 6,
                 'I-BODY': 7, 'I-DISO': 8, 'I-DRUG': 9, 'I-SIGN': 10, 'I-TREA': 11, 'O': 12}

In [5]:
class ConLL_parser:
  def __init__(self, paths: list):
  
    self.dataset = [] # List of phrases and labels

    # For each file in a list, we parse a file formatted in ConLL
    for file_ in paths: self.read_conll(file_)

  def read_conll(self, path:str) -> None:

    nphrases = 0
    print("File: ",os.path.basename(path))

    with open(path) as f: 
      phrase,labels = [], [] # lists that contain the words and labels

      for line in f.readlines(): # reads word by word
        line = line.split() # Trasform a line into array

        if len(line) == 0:
          # if the "phrase" contains at least one word we add to dataset
          if (len(phrase)!= 0) & (len(labels)!= 0):
            self.dataset.append((" ".join(phrase)," ".join(labels)))
            nphrases += 1
          phrase,labels = [], []

        elif line[0] != "-DOCSTART-":
          phrase.append(line[0]) # Not lemmatized word
          labels.append(line[3]) # label that corresponding to the word
      print("\tNumber of phrases made: ",nphrases)

  def holdout(self, size: float, tr: float = 0.8, vl: float = 0.10, ts:float = 0.10) -> DataFrame:  

    """
    In this phase we transfom a list of pairs (phrase,label) into "holdout" dataframe used to model selection
      1) Create a unique dataframe.
      2) Remove the duplicate (useless for our scope, improve the data dimention)
      3) Sampling the dataset in order to work with a subset of all data avaiable
      4) Dividing the final dataset base on holdout technique.
    """

    dt = pd.DataFrame(self.dataset,columns=["tokens","labels"]).drop_duplicates().sample(frac=size, random_state=42)
    length = len(dt)

    return np.split(dt,[int(tr * length), int((tr + vl) * length)])

In [6]:
class MyDataset(Dataset):
  # We try preprocess the data as much as possible.
  def __init__(self, dataset: DataFrame):

    self.input_ids, self.mask, self.labels = [], [], []

    for _,row in dataset.iterrows():
      # Apply the tokenization at each row
      token_text = tokenizer(row[0], padding='max_length', max_length = 512, truncation=True, return_tensors="pt")
      label_ids = self.align_label(token_text.word_ids(),row[1])

      # moving the result on GPU
      self.input_ids.append(token_text['input_ids'].squeeze(0).to("cuda:0"))
      self.mask.append(token_text['attention_mask'].squeeze(0).to("cuda:0"))

      self.labels.append(torch.LongTensor(label_ids).to("cuda:0"))

  def __len__(self):
    return len(self.labels)
    
  def __getitem__(self, idx):
    return self.input_ids[idx], self.mask[idx], self.labels[idx]

  def align_label(self, token: list, labels: str):
    
    labels = labels.split() # Trasforming a string of label into array

    # We take into cosideration the previous word to identify if the id is already seen
    previous_id = None
    label_ids = [] # Aligned labels

    # We can all ids in the token and we try to associate to a label  
    for word_idx in token:
  
      if word_idx is None: # typically when we encounter [CLS]
        label_ids.append(-100)
      else:
        try: # We try to associate a label
          label_ids.append(labels_to_ids[labels[word_idx]])
        except:
          label_ids.append(-100)
  
      previous_id = word_idx 

    return label_ids

In [7]:
class BertModel(torch.nn.Module):

    def __init__(self,frozen=True):
        super(BertModel, self).__init__()
        self.bert = AutoModelForMaskedLM.from_pretrained(bert,num_labels=13)
        if frozen:
          for param in self.bert.bert.parameters():
              param.requires_grad = False

    def forward(self, input_id, mask, label):
        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)
        return output

In [8]:
def train(model, df_train: DataFrame, df_val: DataFrame, batch_size: int, lr: float, epochs: int):
  
  # We create a iterator for training e validation dataset
  tr = DataLoader(MyDataset(df_train), batch_size=batch_size)
  vl = DataLoader(MyDataset(df_val), batch_size=batch_size)

  tr_size, vl_size = len(tr), len(vl)

  optimizer = SGD(model.parameters(), lr=lr)

  for e in range(epochs):  

    acc_train, loss_train = 0, 0
    acc_val, loss_val = 0, 0

    model.train() # Traininig phase
    for input_id, mask, tr_label in tqdm(tr):

      optimizer.zero_grad()
      loss, logits = model(input_id, mask, tr_label)

      for i in range(logits.shape[0]):
        logits_clean = logits[i][tr_label[i] != -100]
        label_clean = tr_label[i][tr_label[i] != -100]

        predictions = logits_clean.argmax(dim=1)
        acc = (predictions == label_clean).float().mean()
        acc_train += acc
        loss_train += loss.item()

      loss.backward()
      optimizer.step()
      
    torch.save(model.state_dict(), "./model.pt")

    model.eval() # Validation phase
    for input_id, mask, val_label in tqdm(vl):
      
      loss, logits = model(input_id, mask, val_label)

      for i in range(logits.shape[0]):
        logits_clean = logits[i][val_label[i] != -100]
        label_clean = val_label[i][val_label[i] != -100]

        predictions = logits_clean.argmax(dim=1)
        acc = (predictions == label_clean).float().mean()
        acc_val += acc
        loss_val += loss.item()
      
    tr_accuracy, tr_loss = (acc_train / tr_size), (loss_train / tr_size)
    val_accuracy, val_loss = (acc_val / vl_size), (loss_val / vl_size)
     
    print(f'Epochs: {e + 1} | Loss: {tr_loss: .3f} | Accuracy: {tr_accuracy: .3f} | Val_Loss: {val_loss: .3f} | Accuracy: {val_accuracy: .3f}')

    

In [None]:
parser = ConLL_parser(datasets)
df_train,df_val, _ = parser.holdout(size=0.5)

model = BertModel().to("cuda:0")
train(model,df_train, df_val,2,5e-3,5)

100%|██████████| 33510/33510 [1:00:48<00:00,  9.19it/s]
100%|██████████| 4189/4189 [06:35<00:00, 10.60it/s]


Epochs: 1 | Loss:  0.325 | Accuracy:  0.916 | Val_Loss:  0.269 | Accuracy:  0.916


 87%|████████▋ | 29206/33510 [52:51<07:48,  9.18it/s]