In [None]:
%%capture
!pip install transformers

In [None]:
%%capture
from google.colab import drive
from pandas import DataFrame
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertForTokenClassification
from torch.utils.data import Dataset, DataLoader
from torch.optim import SGD,AdamW
from tqdm import tqdm

import pandas as pd
import numpy as np
import torch, time, os

# Mount Google Drive so that files stored there are reachable from the Colab runtime.
drive.mount('/content/drive')
# Root folder of the project on Drive; the corpus file paths are built by appending to it.
folder = "/content/drive/MyDrive/NERforMedicalRecords/"
# Hugging Face model identifier, used to load both the tokenizer and the pretrained weights.
bert  = "dbmdz/bert-base-italian-xxl-cased"

In [None]:
# Corpus files to parse, given relative to `folder`.
# The "esami" files (Corpus/esami.a.iob, Corpus/esami.b.iob) are currently left out.
datasets = [folder + relative_path
            for relative_path in ("Corpus/anamnesi.a.iob", "Corpus/anamnesi.b.iob")]

# Entity tags to recognise. Each tag's id is its position in this tuple:
# the B-* tags first, then the I-* tags, then 'O'.
labels_to_ids = {
    tag: idx
    for idx, tag in enumerate((
        'B-ACTI', 'B-BODY', 'B-DISO', 'B-DRUG', 'B-SIGN', 'B-TREA',
        'I-ACTI', 'I-BODY', 'I-DISO', 'I-DRUG', 'I-SIGN', 'I-TREA',
        'O',
    ))
}

# Reverse lookup (id -> tag), derived from the mapping above so the two cannot drift apart.
ids_to_labels = {idx: tag for tag, idx in labels_to_ids.items()}

# Tokenizer for the checkpoint named by `bert`; loaded once here and used by MyDataset.
tokenizer = AutoTokenizer.from_pretrained(bert)

def align_label(token: list, labels: str):
  """Turn the per-token word indices of one phrase into per-token label ids.

  Args:
    token: one entry per token, holding the index of the word the token belongs to,
      or None for tokens that belong to no word (e.g. [CLS]).
    labels: the phrase's label names as one whitespace-separated string, one name per word.

  Returns:
    A list as long as `token`. Every token receives the id of its word's label, so all
    pieces of a split word share that label. Tokens without a word, tokens whose word
    index has no label, and labels missing from `labels_to_ids` all get -100
    (presumably so the loss skips them -- TODO confirm against the loss in use).
  """
  word_labels = labels.split() # Transform the label string into a list

  label_ids = [] # Aligned labels
  for word_idx in token:
    if word_idx is None:
      label_ids.append(-100)
      continue
    try:
      label_ids.append(labels_to_ids[word_labels[word_idx]])
    except (IndexError, KeyError):
      # IndexError: the word index has no matching label; KeyError: unknown label name.
      # Any other exception is a programming error and is no longer silenced.
      label_ids.append(-100)

  return label_ids

In [None]:
class ConLL_parser:
  """Read CoNLL-style IOB files into (phrase, labels) pairs and split them with hold-out.

  Every file in `paths` is parsed on construction; the resulting pairs accumulate in
  `self.dataset` as two space-joined strings (the words and their labels).
  """

  def __init__(self, paths: list):
    ## ========== PARAMETERS ==========
    # Fractions of the sampled data used for training / validation / test.
    self.tr_size:float = 0.8
    self.vl_size:float = 0.1
    self.ts_size:float = 0.1
    ## ========== PARAMETERS ==========
    self.dataset = [] # List of (phrase, labels) pairs

    # Each file is parsed in turn and its phrases are appended to self.dataset
    for file_ in paths: self.read_conll(file_)

  def _store_phrase(self, phrase: list, labels: list) -> bool:
    """Append the phrase to the dataset when it has at least one word; tell whether it was stored."""
    if phrase and labels:
      self.dataset.append((" ".join(phrase), " ".join(labels)))
      return True
    return False

  def read_conll(self, path:str) -> None:
    """Parse one file and append its phrases to `self.dataset`.

    A blank line ends a phrase; lines whose first field is "-DOCSTART-" are skipped.
    For every other line the first field is the word and the fourth field is its label.

    Raises:
      IndexError: if a non-blank line has fewer than four whitespace-separated fields.
    """
    nphrases = 0
    print("File: ",os.path.basename(path))

    # Explicit encoding so the result does not depend on the platform default.
    with open(path, encoding="utf-8") as f:
      phrase, labels = [], [] # words and labels of the phrase being read

      for raw_line in f:
        fields = raw_line.split()

        if not fields:
          if self._store_phrase(phrase, labels):
            nphrases += 1
          phrase, labels = [], []

        elif fields[0] != "-DOCSTART-":
          phrase.append(fields[0]) # Not lemmatized word
          labels.append(fields[3]) # Label of that word

      # A file that does not end with a blank line still has a pending phrase: keep it.
      if self._store_phrase(phrase, labels):
        nphrases += 1

    print("\tNumber of phrases made: ",nphrases)

  def holdout(self, size: float = 0.5) -> list:
    """Build the train / validation / test frames used for model selection.

    Steps:
      1) Put all (phrase, labels) pairs into one dataframe with columns "tokens" and "labels".
      2) Drop exact duplicate rows.
      3) Sample a fraction `size` of the rows (fixed random_state=42, which also shuffles them).
      4) Cut the result at the tr_size and tr_size + vl_size fractions.

    Args:
      size: fraction of the de-duplicated rows to keep.

    Returns:
      A list of three DataFrames: [train, validation, test]. The test frame is whatever
      remains after the first two cuts, so its length can differ by a row from the
      "(ts)" count that is printed.
    """
    dt = (pd.DataFrame(self.dataset, columns=["tokens","labels"])
            .drop_duplicates()
            .sample(frac=size, random_state=42))
    length = len(dt)

    tr = int(self.tr_size * length)
    vl_end = int((self.tr_size + self.vl_size) * length)
    print("\nTotal number of phrases: ", length, " (tr): ", tr, " (vl): ", int(self.vl_size * length), " (ts): ", int(self.ts_size * length))

    # Positional slicing gives the same three pieces as np.split(dt, [tr, vl_end])
    # without passing a DataFrame through numpy.
    return [dt.iloc[:tr], dt.iloc[tr:vl_end], dt.iloc[vl_end:]]

In [None]:
class MyDataset(Dataset):
  """Tokenized phrases with per-token label ids, prepared once at construction time.

  All tensors are built in __init__ and moved to `device`, so __getitem__ is a plain lookup.
  """

  def __init__(self, dataset: DataFrame, device=None):
    """Tokenize every row of `dataset` and align its labels to the tokens.

    Args:
      dataset: frame whose first column holds the space-joined words of a phrase and
        whose second column holds the space-joined labels, one per word.
      device: where the tensors are stored; defaults to "cuda:0" when CUDA is available,
        otherwise "cpu".
    """
    if device is None:
      device = "cuda:0" if torch.cuda.is_available() else "cpu"

    self.input_ids, self.mask, self.labels = [], [], []

    # Plain tuples keep the positional (first column, second column) access without
    # relying on integer keys of a label-indexed Series.
    for row in dataset.itertuples(index=False, name=None):
      phrase, tags = row[0], row[1]

      # Hand the tokenizer the words themselves: word_ids() then indexes exactly the
      # whitespace-separated words the labels were written for. Tokenizing the raw string
      # would let the pre-tokenizer split on punctuation too and shift the labels.
      token_text = tokenizer(phrase.split(), is_split_into_words=True, padding='max_length',
                             max_length = 512, truncation=True, return_tensors="pt")
      label_ids = align_label(token_text.word_ids(), tags)

      # Move the results to the target device
      self.input_ids.append(token_text['input_ids'].squeeze(0).to(device))
      self.mask.append(token_text['attention_mask'].squeeze(0).to(device))
      self.labels.append(torch.LongTensor(label_ids).to(device))

  def __len__(self):
    """Number of phrases in the dataset."""
    return len(self.labels)

  def __getitem__(self, idx):
    """Return (input_ids, attention_mask, labels) for the phrase at position `idx`."""
    return self.input_ids[idx], self.mask[idx], self.labels[idx]

In [None]:
class BertModel(torch.nn.Module):
    """Pretrained BERT with a token-classification head for the NER tags.

    NOTE(review): earlier revisions loaded a masked-LM head here, so state_dicts saved from
    them will presumably not load into this class -- verify before reusing old checkpoints.
    """

    def __init__(self, frozen=True, num_labels: int = 13):
        """Load the checkpoint named by `bert`.

        Args:
            frozen: when True, the parameters of the inner `bert` submodule get
                requires_grad=False, so only the remaining parameters are trained.
            num_labels: number of output classes of the classification head.
        """
        super().__init__()
        # A masked-LM head predicts over the whole vocabulary and ignores num_labels;
        # the token-classification head emits one score per tag for every token.
        self.bert = BertForTokenClassification.from_pretrained(bert, num_labels=num_labels)
        if frozen:
          for param in self.bert.bert.parameters():
              param.requires_grad = False

    def forward(self, input_id, mask, label=None):
        """Run the model on a batch.

        Returns the tuple produced with return_dict=False: the training loop unpacks it as
        (loss, second element) when `label` is given. With label=None no loss is computed,
        so the tuple should start with the logits instead -- TODO confirm.
        """
        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)
        return output

In [None]:
class ModelSelection:
  """Placeholder for the model-selection step; nothing is implemented yet."""

  def __init__(self, grid_list: list) -> None:
    """Accept `grid_list` without storing or using it."""

In [None]:
# Hyper-parameter set.
# NOTE(review): no visible cell reads `par1`; the train(...) call further down repeats the
# batch size, learning rate and epoch count as literals. "weigth_decay" looks like a
# misspelling of "weight_decay", and "max_epoch" differs from train()'s `max_epochs`
# parameter, so the dict cannot be unpacked into train() as it stands.
par1 = {"batch_size": 5, "lr": 5e-3, "max_epoch": 5, "weigth_decay": 1e-2}

In [None]:
def train(model, df_train: DataFrame, df_val: DataFrame, batch_size: int, lr: float,
          max_epochs: int, earlyS: bool = True, cache: bool = True, weight_decay: float = 1e-2):
  """Train `model` with AdamW, printing the mean training and validation loss of every epoch.

  Args:
    model: module called as model(input_id, mask, label) and returning a 2-tuple whose
      first element is the loss.
    df_train: frame wrapped in MyDataset for training.
    df_val: frame wrapped in MyDataset for validation.
    batch_size: batch size of both data loaders.
    lr: AdamW learning rate.
    max_epochs: upper bound on the number of epochs.
    earlyS: when True, count the epochs whose validation loss is higher than the previous
      epoch's; training stops once that has happened twice (the counter is never reset).
    cache: when True, save the model's state_dict to "./model.pt" after the training phase
      of every epoch, overwriting the previous file.
    weight_decay: AdamW weight decay.
  """
  # Iterators over the training and validation datasets
  tr = DataLoader(MyDataset(df_train), batch_size=batch_size)
  vl = DataLoader(MyDataset(df_val), batch_size=batch_size)
  tr_size, vl_size = len(tr), len(vl)

  earlyS_flag:int = 0
  epoch:int = 0
  previous_vl:float = float("inf")

  optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

  while epoch < max_epochs and earlyS_flag <= 1:

    loss_train, loss_val = 0, 0

    # ========== Training Phase ==========
    model.train()
    for input_id, mask, tr_label in tqdm(tr):
      optimizer.zero_grad()
      loss, _ = model(input_id, mask, tr_label)
      loss_train += loss.item()
      loss.backward()
      optimizer.step()
    # ========== Training Phase ==========

    if cache:
      torch.save(model.state_dict(), "./model.pt")

    # ========== Validation Phase ==========
    model.eval()
    # No gradients are needed to measure the loss; skipping them avoids building the
    # autograd graph for every validation batch.
    with torch.no_grad():
      for input_id, mask, val_label in vl:
        loss, _ = model(input_id, mask, val_label)
        loss_val += loss.item()
    # ========== Validation Phase ==========

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError (its loss sum is 0).
    tr_loss, val_loss = (loss_train / max(tr_size, 1)), (loss_val / max(vl_size, 1))

    ## Early stopping
    if earlyS:
      if previous_vl < val_loss: earlyS_flag += 1
      previous_vl = val_loss

    print(f'Epochs: {epoch + 1} | Loss: {tr_loss: .3f} | Val_Loss: {val_loss: .3f}')
    epoch += 1

In [None]:
# Parse the corpus files, drop duplicate phrases and split all of them (size=1 keeps 100%)
# into train / validation / test frames.
df_train, df_val, df_test = ConLL_parser(datasets).holdout(size=1)

File:  anamnesi.a.iob
	Number of phrases made:  72297
File:  anamnesi.b.iob
	Number of phrases made:  72297

Total number of phrases:  48433  (tr):  38746  (vl):  4843  (ts):  4843


In [None]:
# Build the model (inner `bert` parameters frozen by default) on the first GPU and train it;
# cache=True saves the weights to ./model.pt after the training phase of every epoch.
model = BertModel().to("cuda:0")
train(model, df_train, df_val, batch_size=5, lr=5e-3, max_epochs=5, cache=True)

In [None]:
# The model uploaded to Drive was trained for 2 epochs

In [None]:
"""
Email di questa sera:
1) Parlare dei caratteri accentati
2) Parlare del parser costruito
3) Parlare del model selection (come intendo fare la grid e k-fold)
4) Parlare del fatto di mantenere unite tutte le entity
5) Che sto utilizzando solo anamnesi
6) Dei risultati ottenuti
7) Di come effettuare la valutazione 
"""

"""
  Dato un modello allenato valutare le sue prestazioni con 
  1) Matrice di confusione
  2) Precision, Recall, Accuracy, F1
"""
"""
Da fare:

-) Gestire i caratteri accentati 
-) Implementare il sistema per continuare il training caricando il modello da file
-) Implementare sistema per la valutazione di singole frasi
-) Implementare K-Fold and Grid search
-) Ulteriori implementazioni isolando solo dei gruppi di entity invece che usarle tutte insieme
"""
"""
Problemi:
1) Carattere ¿ nei file di esami

"""

In [None]:
# Copy /content/model.pt to the project's tmp folder on Drive.
# NOTE(review): presumably this is the ./model.pt written by train(), assuming the
# notebook's working directory is /content -- confirm.
! cp -r "/content/model.pt" "/content/drive/MyDrive/NERforMedicalRecords/tmp/model.pt"