# Environment set up

In [None]:
!pip install transformers seqeval[gpu]

Collecting seqeval[gpu]
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=35c70fa4b61f4eac97a33f0b9c694a897e111050459f94f7d27cbb8f7a90c94c
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
import numpy as np
import pandas as pd
import torch
from google.colab import drive
from seqeval.metrics import classification_report
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForTokenClassification, AutoTokenizer
from transformers import BertTokenizer, BertConfig, BertForTokenClassification

In [None]:
from torch import cuda

# Default to the CPU and switch to the GPU only when PyTorch can see one.
device = 'cpu'
if cuda.is_available():
    device = 'cuda'
print(device)

cuda


In [None]:
# Mount Google Drive at /content/drive so the project files are reachable from this runtime.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
cd drive/MyDrive/'6.8611 Research Project'/'Colab Notebooks'

/content/drive/.shortcut-targets-by-id/1vdEcgdXIfpnlORVlPsJtHUmKXSAqr69R/6.8611 Research Project/Colab Notebooks


In [None]:
ls

 BC5CDR-D_devel_1.csv            [0m[01;34mllm-annotations[0m/        zero-shot-bc5cdr-chem.pynb
 BC5CDR-D_devel_2.csv           ' NER with BERT.ipynb'  'zero_shot[FASTER].ipynb'
 Data-cleaning.ipynb             openai-test.ipynb       zero-shot.pynb
 [01;34mdevel_gpt_generated_datasets[0m/   retry_prompts.gsheet
 intrinsic_eval.ipynb            tokens_labels.csv


# Downloading and preprocessing the data


In [None]:
def load_tsv_dataset(file_path):
  """
  Loads a headerless, tab-separated token/label file into a dataframe.

  The file is read with `header=None`, so its first line is kept as a data row;
  the two columns are then named 'token' and 'label'.

  Tokens are read verbatim as strings:
    * quoting is disabled, so a bare double-quote token is an ordinary field
      instead of opening a quoted field that swallows the following lines;
    * the default NA strings are disabled, so tokens such as 'NA' or 'null'
      are not converted to NaN.

  Args:
    file_path: path of the .tsv file (one "token<TAB>label" pair per line).

  Returns:
    A pandas DataFrame with the string columns 'token' and 'label'. The first
    rows are also printed as a quick visual check.
  """
  df = pd.read_csv(
      file_path,
      delimiter='\t',
      header=None,
      engine='python',
      quoting=3,  # csv.QUOTE_NONE: treat quote characters as plain text
      keep_default_na=False,
      dtype=str,
  )
  df.columns = ['token', 'label']
  print(df.head())
  return df

def load_csv_dataset(file_path):
  """
  Loads a headerless, comma-separated token/label file into a dataframe.

  The file is read with `header=None`, so its first line is kept as a data row;
  the two columns are then named 'token' and 'label'.

  Values are read as strings and the default NA strings are disabled, so tokens
  such as 'NA' or 'null' are not converted to NaN. Standard CSV quoting stays
  enabled so that a quoted comma token (`","`) is parsed as a single field.

  Args:
    file_path: path of the .csv file (one "token,label" pair per line).

  Returns:
    A pandas DataFrame with the string columns 'token' and 'label'. The first
    rows are also printed as a quick visual check.
  """
  df = pd.read_csv(
      file_path,
      header=None,
      engine='python',
      keep_default_na=False,
      dtype=str,
  )
  df.columns = ['token', 'label']
  print(df.head())
  return df

In [None]:
# Paths of the devel/test splits of the five corpora.
NCBI_devel = 'llm-annotations/datasets/NCBI-disease/devel.tsv'
NCBI_test = 'llm-annotations/datasets/NCBI-disease/test.tsv'
JNLPBA_devel = 'llm-annotations/datasets/JNLPBA/devel.tsv'
JNLPBA_test = 'llm-annotations/datasets/JNLPBA/test.tsv'
BC5CDR_D_devel = 'llm-annotations/datasets/BC5CDR-disease/devel.tsv'
BC5CDR_D_test = 'llm-annotations/datasets/BC5CDR-disease/test.tsv'
BC5CDR_C_devel = 'llm-annotations/datasets/BC5CDR-chem/devel.tsv'
BC5CDR_C_test = 'llm-annotations/datasets/BC5CDR-chem/test.tsv'
BC2GM_devel = 'llm-annotations/datasets/BC2GM/devel.tsv'
BC2GM_test = 'llm-annotations/datasets/BC2GM/test.tsv'

# Load every split (devel first, then test, corpus by corpus); each call also
# prints the head of the loaded frame.
NCBI_devel_df, NCBI_test_df = load_tsv_dataset(NCBI_devel), load_tsv_dataset(NCBI_test)
JNLPBA_devel_df, JNLPBA_test_df = load_tsv_dataset(JNLPBA_devel), load_tsv_dataset(JNLPBA_test)
BC5CDR_D_devel_df, BC5CDR_D_test_df = load_tsv_dataset(BC5CDR_D_devel), load_tsv_dataset(BC5CDR_D_test)
BC5CDR_C_devel_df, BC5CDR_C_test_df = load_tsv_dataset(BC5CDR_C_devel), load_tsv_dataset(BC5CDR_C_test)
BC2GM_devel_df, BC2GM_test_df = load_tsv_dataset(BC2GM_devel), load_tsv_dataset(BC2GM_test)

In [None]:
def tokens_to_sentences(labeled_df):
  """
  Groups a flat token/label table into sentences.

  A sentence ends at every token equal to '.'. Tokens that follow the last '.'
  (a final sentence without a closing period) are emitted as one last sentence
  instead of being dropped.

  Args:
    labeled_df: DataFrame with a 'token' and a 'label' column, one row per token.

  Returns:
    (sentences, sentence_labels):
      sentences       - List[str], tokens of each sentence joined by a space
      sentence_labels - List[str], labels of each sentence joined by a comma
  """
  sentences = []
  sentence_labels = []

  current_tokens = []
  current_labels = []
  for token, label in zip(labeled_df['token'].tolist(), labeled_df['label'].tolist()):
    # str() guards against non-string tokens (e.g. values parsed as numbers/NaN)
    current_tokens.append(str(token))
    current_labels.append(label)

    if token == '.':
      sentences.append(' '.join(current_tokens))
      sentence_labels.append(','.join(current_labels))
      current_tokens = []
      current_labels = []

  # Flush a trailing sentence that did not end with '.'
  if current_tokens:
    sentences.append(' '.join(current_tokens))
    sentence_labels.append(','.join(current_labels))

  return sentences, sentence_labels



Let's have a look at the different NER tags.

We create 2 dictionaries: one that maps individual tags to indices, and one that maps indices to their individual tags. This is necessary in order to create the labels (as computers work with numbers = indices, rather than words = tags) - see further in this notebook.

In [None]:
# Tag -> index mapping used to build the training targets.
label2id = {'B': 0, 'I': 1, 'O': 2}
# Index -> tag mapping, derived from label2id so the two can never disagree
# (the outside tag is the letter 'O', not the digit '0').
id2label = {index: tag for tag, index in label2id.items()}

print(label2id)
print(id2label)

{'B': 0, 'I': 1, 'O': 2}
{0: 'B', 1: 'I', 2: '0'}


Now, for each of the 5 datasets, we load the sentences and their corresponding labels into a train dataframe (built from the devel split) and a test dataframe.

In [None]:
# The NCBI-disease devel split is used as the training set.
sentences, labels = tokens_to_sentences(NCBI_devel_df)
NCBI_train_data = pd.DataFrame({
    'sentence': sentences,
    'word_labels': labels
    })

print('Number of train sentences: ', len(NCBI_train_data))

# The NCBI-disease test split is used for evaluation.
sentences, labels = tokens_to_sentences(NCBI_test_df)
NCBI_test_data = pd.DataFrame({
    'sentence': sentences,
    'word_labels': labels
    })

print('Number of test sentences: ', len(NCBI_test_data))


Number of train sentences:  1027
Number of train sentences:  1042


In [None]:
# The JNLPBA devel split is used as the training set.
sentences, labels = tokens_to_sentences(JNLPBA_devel_df)
JNLPBA_train_data = pd.DataFrame({
    'sentence': sentences,
    'word_labels': labels
    })

print('Number of train sentences: ', len(JNLPBA_train_data))

# The JNLPBA test split is used for evaluation.
sentences, labels = tokens_to_sentences(JNLPBA_test_df)
JNLPBA_test_data = pd.DataFrame({
    'sentence': sentences,
    'word_labels': labels
    })

print('Number of test sentences: ', len(JNLPBA_test_data))

Number of train sentences:  4257
Number of train sentences:  4288


In [None]:
# The BC5CDR-disease devel split is used as the training set.
sentences, labels = tokens_to_sentences(BC5CDR_D_devel_df)
BC5CDR_D_train_data = pd.DataFrame({
    'sentence': sentences,
    'word_labels': labels
    })

print('Number of train sentences: ', len(BC5CDR_D_train_data))

# The BC5CDR-disease test split is used for evaluation.
sentences, labels = tokens_to_sentences(BC5CDR_D_test_df)
BC5CDR_D_test_data = pd.DataFrame({
    'sentence': sentences,
    'word_labels': labels
    })

print('Number of test sentences: ', len(BC5CDR_D_test_data))

Number of train sentences:  5918
Number of train sentences:  6488


In [None]:
# The BC5CDR-chem devel split is used as the training set.
sentences, labels = tokens_to_sentences(BC5CDR_C_devel_df)
BC5CDR_C_train_data = pd.DataFrame({
    'sentence': sentences,
    'word_labels': labels
    })

print('Number of train sentences: ', len(BC5CDR_C_train_data))

# The BC5CDR-chem test split is used for evaluation.
sentences, labels = tokens_to_sentences(BC5CDR_C_test_df)
BC5CDR_C_test_data = pd.DataFrame({
    'sentence': sentences,
    'word_labels': labels
    })

print('Number of test sentences: ', len(BC5CDR_C_test_data))

Number of train sentences:  5918
Number of train sentences:  6488


In [None]:
# The BC2GM devel split is used as the training set.
sentences, labels = tokens_to_sentences(BC2GM_devel_df)
BC2GM_train_data = pd.DataFrame({
    'sentence': sentences,
    'word_labels': labels
    })

print('Number of train sentences: ', len(BC2GM_train_data))

# The BC2GM test split is used for evaluation.
sentences, labels = tokens_to_sentences(BC2GM_test_df)
BC2GM_test_data = pd.DataFrame({
    'sentence': sentences,
    'word_labels': labels
    })

print('Number of test sentences: ', len(BC2GM_test_data))

Number of train sentences:  3123
Number of train sentences:  4741


Let's verify that a random sentence and its corresponding tags are correct:

In [None]:
# Pull row 16 of the NCBI training frame and display its sentence text.
sentence = NCBI_train_data.iloc[16].sentence
sentence

'The risk for ovarian cancer was 2 .'

In [None]:
# Comma-separated word-level labels of the same row (row 16 of the NCBI training frame).
NCBI_train_data.iloc[16].word_labels

'O,O,O,B,I,O,O,O'

#### **Preparing the dataset and dataloader**

Now that our data is preprocessed, we can turn it into PyTorch tensors such that we can provide it to the model. Let's start by defining some key variables that will be used later on in the training/evaluation process:

In [None]:
from transformers import AutoTokenizer

# Sequence length (in wordpieces, special tokens included) and batch sizes.
MAX_LEN = 128
TRAIN_BATCH_SIZE, VALID_BATCH_SIZE = 4, 2

# Optimisation settings.
EPOCHS = 3
LEARNING_RATE = 1e-5
MAX_GRAD_NORM = 10

# Tokenizer of the BioLinkBERT checkpoint.
# (Earlier alternative: BertTokenizer.from_pretrained('bert-base-uncased'))
tokenizer = AutoTokenizer.from_pretrained('michiyasunaga/BioLinkBERT-base')

tokenizer_config.json:   0%|          | 0.00/379 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/447k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

A tricky part of NER with models that have an architecture similar to BERT is that these models rely on **wordpiece tokenization**, rather than word tokenization. This means that we should also define the labels at the wordpiece-level, rather than the word-level!

For example, if you have word like "Washington" which is labeled as "b-gpe", but it gets tokenized to "Wash", "##ing", "##ton", then we will have to propagate the word’s original label to all of its wordpieces: "b-gpe", "b-gpe", "b-gpe". The model should be able to produce the correct labels for each individual wordpiece. The function below (taken from [here](https://github.com/chambliss/Multilingual_NER/blob/master/python/utils/main_utils.py#L118)) implements this.






In [None]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Tokenize a sentence word by word and repeat each word's label for every
    wordpiece that word is split into.

    Args:
        sentence: whitespace-separated words of one sentence.
        text_labels: comma-separated labels, one per word of `sentence`.
        tokenizer: object exposing `tokenize(word)` returning the wordpieces.

    Returns:
        (wordpieces, labels): two lists of equal length. Words and labels are
        paired positionally; extra words or extra labels are ignored.
    """
    words = sentence.strip().split()
    word_labels = text_labels.split(",")

    wordpieces = []
    wordpiece_labels = []
    for current_word, current_label in zip(words, word_labels):
        pieces = tokenizer.tokenize(current_word)
        wordpieces.extend(pieces)
        # every piece of the word inherits the word-level label
        wordpiece_labels.extend(current_label for _ in range(len(pieces)))

    return wordpieces, wordpiece_labels

In [None]:
# Longest wordpiece sequence over all training sets, to check that MAX_LEN is
# large enough. A dedicated name is used so the builtin `max` is not shadowed.
max_wordpieces = 0

named_train_frames = {
    'NCBI': NCBI_train_data,
    'JNLPBA': JNLPBA_train_data,
    'BC5CDR_D': BC5CDR_D_train_data,
    'BC5CDR_C': BC5CDR_C_train_data,
    'BC2GM': BC2GM_train_data,
}
for frame_name, frame in named_train_frames.items():
  # iterate over the frame of the current dataset (not a fixed one)
  for i, row in frame.iterrows():
    tokenized_sentence, labels = tokenize_and_preserve_labels(row['sentence'], row['word_labels'], tokenizer)
    if len(tokenized_sentence) != len(labels):
      print('AYOOO')
      # report which dataset/row is affected instead of dumping the whole frame
      print(frame_name, i)
    max_wordpieces = max(max_wordpieces, len(tokenized_sentence))

max_wordpieces

102

Note that this is a **design decision**. You could also decide to only label the first wordpiece of each word and let the model only learn this (this is what was done in the original BERT paper, see Github discussion [here](https://github.com/huggingface/transformers/issues/64#issuecomment-443703063)). Another design decision could be to give the first wordpiece of each word the original word label, and then use the label “X” for all subsequent subwords of that word.

All of them lead to good performance.

Next, we define a regular PyTorch [dataset class](https://pytorch.org/docs/stable/data.html) (which transforms examples of a dataframe to PyTorch tensors). Here, each sentence gets tokenized, the special tokens that BERT expects are added, the tokens are padded or truncated based on the max length of the model, the attention mask is created and the labels are created based on the dictionary which we defined above.

For more information about BERT's inputs, see [here](https://huggingface.co/transformers/glossary.html).  

In [None]:
class dataset(Dataset):
    """
    Torch dataset turning rows of a sentence/label dataframe into model inputs.

    Each item is a dict of three `torch.long` tensors of length `max_len`:
      'ids'     - wordpiece ids of "[CLS] sentence [SEP]" followed by [PAD]s
      'mask'    - 1 for real tokens (special tokens included), 0 for padding
      'targets' - label ids from `label2id`; special and padding positions
                  carry the 'O' label

    Args:
        dataframe: frame with the columns 'sentence' (space-separated words)
            and 'word_labels' (comma-separated labels, one per word).
        tokenizer: tokenizer exposing `tokenize` and `convert_tokens_to_ids`.
        max_len: fixed output length; expected to be at least 2 so that the
            two special tokens fit.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # step 1: tokenize (and adapt corresponding labels)
        # positional access, so the frame does not need a 0..n-1 index
        sentence = self.data['sentence'].iloc[index]
        word_labels = self.data['word_labels'].iloc[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        # step 2: truncate the wordpieces so that [CLS] and [SEP] always fit
        max_pieces = max(self.max_len - 2, 0)
        tokenized_sentence = tokenized_sentence[:max_pieces]
        labels = labels[:max_pieces]

        # step 3: add special tokens with the outside label at BOTH ends
        # (list.insert(-1, x) would place x before the last label, not after it)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"]
        labels = ["O"] + labels + ["O"]

        # step 4: pad up to max_len and build the attention mask from the lengths
        n_real = len(tokenized_sentence)
        n_pad = self.max_len - n_real
        tokenized_sentence = tokenized_sentence + ["[PAD]"] * n_pad
        labels = labels + ["O"] * n_pad
        attn_mask = [1] * n_real + [0] * n_pad

        # step 5: convert tokens and labels to ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)
        label_ids = [label2id[label] for label in labels]

        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        return self.len

Now, based on the class we defined above, we can create 2 datasets per corpus, one for training and one for testing. We use the predefined splits: the devel split for training and the test split for testing.

In [None]:
dataset_names = ['NCBI', 'JNLPBA', 'BC5CDR_D', 'BC5CDR_C', 'BC2GM']

train_datasets = [
    NCBI_train_data,
    JNLPBA_train_data,
    BC5CDR_D_train_data,
    BC5CDR_C_train_data,
    BC2GM_train_data,
]

test_datasets = [
    NCBI_test_data,
    JNLPBA_test_data,
    BC5CDR_D_test_data,
    BC5CDR_C_test_data,
    BC2GM_test_data,
]

# name -> [train dataset, test dataset]
datasets = {
    corpus: [dataset(train_frame, tokenizer, MAX_LEN), dataset(test_frame, tokenizer, MAX_LEN)]
    for corpus, train_frame, test_frame in zip(dataset_names, train_datasets, test_datasets)
}

for name, (train_set, test_set) in datasets.items():
  print(f"{name} TRAIN Dataset: {train_set.len}")
  print(f"{name} TEST Dataset: {test_set.len}")
  print()

NCBI TRAIN Dataset: 1027
NCBI TEST Dataset: 1042

JNLPBA TRAIN Dataset: 4257
JNLPBA TEST Dataset: 4288

BC5CDR_D TRAIN Dataset: 5918
BC5CDR_D TEST Dataset: 6488

BC5CDR_C TRAIN Dataset: 5918
BC5CDR_C TEST Dataset: 6488

BC2GM TRAIN Dataset: 3123
BC2GM TEST Dataset: 4741



Let's have a look at one training example (the item at index 1 of the NCBI training set):

In [None]:
# [0] selects the NCBI training dataset, [1] the item at index 1 of that dataset.
datasets['NCBI'][0][1]

{'ids': tensor([    2, 15823,  4295,  1682, 17107,  1810,  4792,  1725,  2261,  2371,
          1685, 11196,  3418,  1690,  5622,  2310,    17,     3,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,  

Let's verify that the input ids and corresponding targets are correct:

In [None]:
# Input ids of the same item (index 1 of the NCBI training dataset).
datasets['NCBI'][0][1]["ids"]

tensor([    2, 15823,  4295,  1682, 17107,  1810,  4792,  1725,  2261,  2371,
         1685, 11196,  3418,  1690,  5622,  2310,    17,     3,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [None]:
# print the first 30 tokens and corresponding labels
# Show the first 30 wordpieces of one NCBI training item next to their labels.
example = datasets['NCBI'][0][1]
first_tokens = tokenizer.convert_ids_to_tokens(example["ids"][:30])
first_targets = example["targets"][:30]
for token, label in zip(first_tokens, first_targets):
  print('{0:10}  {1}'.format(token, id2label[label.item()]))

[CLS]       0
germline    0
mutations   0
in          0
brca1       0
are         0
responsible  0
for         0
most        0
cases       0
of          0
inherited   B
breast      I
and         I
ovarian     I
cancer      I
.           0
[SEP]       0
[PAD]       0
[PAD]       0
[PAD]       0
[PAD]       0
[PAD]       0
[PAD]       0
[PAD]       0
[PAD]       0
[PAD]       0
[PAD]       0
[PAD]       0
[PAD]       0


Now, let's define the corresponding PyTorch dataloaders:

In [None]:
# Keyword arguments forwarded to the DataLoader of the training split.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)

# Keyword arguments forwarded to the DataLoader of the test split.
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

In [None]:
# name -> [training loader, test loader]
dataset_loaders = {}

for key, sets in datasets.items():
  train_set, test_set = sets
  # the test split gets its own loader settings (test_params), not the training ones
  dataset_loaders[key] = [DataLoader(train_set, **train_params), DataLoader(test_set, **test_params)]

#### **Defining the model**

For reference, the bare BioLinkBERT encoder (without a task head) can be loaded as follows:

```python
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained('michiyasunaga/BioLinkBERT-base')
model = AutoModel.from_pretrained('michiyasunaga/BioLinkBERT-base')
```

In the fine-tuning pipeline below we define the model, `AutoModelForTokenClassification`, and load it with the pretrained weights of `michiyasunaga/BioLinkBERT-base`. The only thing we need to additionally specify is the number of labels (as this will determine the architecture of the classification head).

Note that only the base layers are initialized with the pretrained weights. The token classification head on top has just randomly initialized weights, which we will train, together with the pretrained weights, using our labelled dataset. This is also printed as a warning when you run the code cell that loads the model.

Then, we move the model to the GPU.

#### **Training the model**

Before training the model, let's perform a sanity check, which I learned thanks to Andrej Karpathy's wonderful [cs231n course](http://cs231n.stanford.edu/) at Stanford (see also his [blog post about debugging neural networks](http://karpathy.github.io/2019/04/25/recipe/)). The initial loss of your model should be close to -ln(1/number of classes) = -ln(1/3) = 1.10.

Why? Because we are using cross entropy loss. The cross entropy loss is defined as -ln(probability score of the model for the correct class). In the beginning, the weights are random, so the probability distribution for all of the classes for a given token will be uniform, meaning that the probability for the correct class will be near 1/3. The loss for a given token will thus be -ln(1/3).

Let's verify this:



In [None]:
# Defining the training function for tuning the bert model
# Defining the training function for tuning the bert model
def train(epoch, training_loader):
    """
    Run one training epoch over `training_loader`.

    Uses the module-level `model`, `optimizer`, `device` and `MAX_GRAD_NORM`.
    Prints the running mean loss every 100 steps, and the mean loss and mean
    per-batch token accuracy at the end of the epoch.

    Args:
        epoch: index of the current epoch (not used inside the function).
        training_loader: iterable of batches with the keys 'ids', 'mask' and
            'targets'.
    """
    tr_loss, tr_accuracy = 0, 0
    nb_tr_steps = 0
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)
        targets = batch['targets'].to(device, dtype = torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs.loss, outputs.logits
        tr_loss += loss.item()
        nb_tr_steps += 1

        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy
        flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        # only positions with attention mask 1 are compared (includes [CLS] and [SEP])
        active_accuracy = mask.view(-1) == 1 # also of shape (batch_size * seq_len,)
        active_targets = torch.masked_select(flattened_targets, active_accuracy)
        active_predictions = torch.masked_select(flattened_predictions, active_accuracy)
        tr_accuracy += accuracy_score(active_targets.cpu().numpy(), active_predictions.cpu().numpy())

        # backward pass: reset gradients, backpropagate, THEN clip the freshly
        # computed gradients, and only then update the weights
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [None]:
  # training_loader = loaders[0]
  # testing_loader = loaders[1]
  # ids = datasets[key][0][0]["ids"].unsqueeze(0)
  # mask = datasets[key][0][0]["mask"].unsqueeze(0)
  # targets = datasets[key][0][0]["targets"].unsqueeze(0)
  # ids = ids.to(device)
  # mask = mask.to(device)
  # targets = targets.to(device)
  # outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
  # initial_loss = outputs[0]
  # print(initial_loss)
  # tr_logits = outputs[1]
  # print(tr_logits.shape)
  # print()

#### **Evaluating the model**

Now that we've trained our model, we can evaluate its performance on the test set. Note that here, no gradient updates are performed, the model just outputs its logits.

In [None]:
def valid(model, testing_loader):
    """
    Evaluate `model` on `testing_loader` without gradient updates.

    Prints the running mean loss every 100 steps, and the mean loss and mean
    per-batch token accuracy at the end. Uses the module-level `device` and
    `id2label`.

    Args:
        model: token-classification model returning `.loss` and `.logits`.
        testing_loader: iterable of batches with the keys 'ids', 'mask' and
            'targets'.

    Returns:
        (labels, predictions): two flat lists of tag strings covering every
        position with attention mask 1, concatenated over all batches.
    """
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):

            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            targets = batch['targets'].to(device, dtype = torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
            loss, eval_logits = outputs.loss, outputs.logits

            eval_loss += loss.item()
            nb_eval_steps += 1

            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # compute evaluation accuracy
            flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            # only positions with attention mask 1 are compared (includes [CLS] and [SEP])
            active_accuracy = mask.view(-1) == 1 # also of shape (batch_size * seq_len,)
            active_targets = torch.masked_select(flattened_targets, active_accuracy).cpu().numpy()
            active_predictions = torch.masked_select(flattened_predictions, active_accuracy).cpu().numpy()

            # one host transfer per batch; plain ints are stored, not 0-d tensors
            eval_labels.extend(active_targets.tolist())
            eval_preds.extend(active_predictions.tolist())

            eval_accuracy += accuracy_score(active_targets, active_predictions)

    labels = [id2label[label_id] for label_id in eval_labels]
    predictions = [id2label[pred_id] for pred_id in eval_preds]

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

As we can see below, performance is quite good! Accuracy on the test set is > 93%.

However, the accuracy metric is misleading, as a lot of labels are "outside" (O), even after omitting predictions on the [PAD] tokens. What is important is looking at the precision, recall and f1-score of the individual tags. For this, we use the seqeval Python library:

#### **Saving the model for future use**

Finally, let's save the model and tokenizer files such that we can easily re-use them later on. There are 2 options:

* you can save everything locally, simply by calling `model.save_pretrained()` and `tokenizer.save_pretrained()`, providing a directory path as argument.
* you can push the files to the [HuggingFace hub](https://huggingface.co/). This way, you can share your model with the community/your colleagues. All files will be tracked by git, as each model on the hub has its own git repo.

Both options allow to re-use the model/tokenizer using the `from_pretrained()` method. Here we'll do the latter.

To upload a model to the hub, 2 things need to be setup:
* install git-LFS, which is used by the hub
* set up authentication token


In [None]:
!sudo apt-get install git-lfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 15 not upgraded.


In [None]:
# Set up the authentication token required for uploading to the HuggingFace hub.
# The call renders a login widget in the cell output; it must be completed before
# any push_to_hub() call later in the notebook.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# FINE-TUNING PIPELINE

In this pipeline, we load the model onto the GPU, train it on each training dataset, evaluate it on the validation set and print a classification report (precision, recall and F1-score), and then push the fine-tuned model to our [HuggingFace page](https://huggingface.co/68611-llm-annotation-group).

---



In [None]:
# Fine-tune, evaluate and upload one model per entry of `dataset_loaders`.
# Each value is indexed as [0] = training loader, [1] = testing loader.

# Pretrained checkpoint every run starts from, and the hub organization that
# receives the fine-tuned models.
BASE_CHECKPOINT = 'michiyasunaga/BioLinkBERT-base'
HUB_ORGANIZATION = '68611-llm-annotation-group'

for key, loaders in dataset_loaders.items():

  print(f'TRAINING ON {key}.... \n')

  training_loader = loaders[0]
  testing_loader = loaders[1]

  # Reload tokenizer and model from the base checkpoint on every iteration so
  # that no fine-tuned weights carry over from the previous dataset.
  tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)
  model = AutoModelForTokenClassification.from_pretrained(BASE_CHECKPOINT,
                                    num_labels=len(id2label),
                                    id2label=id2label,
                                    label2id=label2id)
  model.to(device)
  # A fresh optimizer is created for each freshly loaded model.
  optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

  # NOTE(review): train() only receives the epoch and the loader, so it
  # presumably reads `model` and `optimizer` from module scope -- keep these
  # variable names unchanged.
  for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch, training_loader)

  print(f'EVALUATING {key}.... \n')
  labels, predictions = valid(model, testing_loader)
  print()
  print('-----------------------------------------------------------------')
  # The flat label/prediction lists are wrapped in an outer list, i.e. the
  # whole evaluation set is passed to the report as a single sequence.
  print(classification_report([labels], [predictions]))
  print('-----------------------------------------------------------------')
  print()
  print(f'WRITING {key} to HUGGING FACE.... \n')

  model_name = f"BioLinkBERT-base-NER-{key}-HA"
  # The `organization` keyword of push_to_hub() is deprecated; the namespace is
  # passed as part of `repo_id` instead, which targets the same repository.
  repo_id = f"{HUB_ORGANIZATION}/{model_name}"

  # upload files to the hub
  tokenizer.push_to_hub(
      repo_id=repo_id,
      commit_message="Add tokenizer",
      use_temp_dir=True,
  )
  model.push_to_hub(
      repo_id=repo_id,
      commit_message="Add model",
      use_temp_dir=True,
  )
  print(f'{key} DONE!')
  print('*******************************************************')
  print()


TRAINING ON JNLPBA.... 



Some weights of BertForTokenClassification were not initialized from the model checkpoint at michiyasunaga/BioLinkBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training epoch: 1
Training loss per 100 training steps: 1.3918479681015015
Training loss per 100 training steps: 0.2511055135712175
Training loss per 100 training steps: 0.1659005278193239
Training loss per 100 training steps: 0.13006509960848728
Training loss per 100 training steps: 0.10953751762636804
Training loss per 100 training steps: 0.09624150668841576
Training loss per 100 training steps: 0.08712061478007901
Training loss per 100 training steps: 0.08066094670107272
Training loss per 100 training steps: 0.07642804282820886
Training loss per 100 training steps: 0.07170314149859751
Training loss per 100 training steps: 0.06836136931541388
Training loss epoch: 0.0663433231011103
Training accuracy epoch: 0.92054262353292
Training epoch: 2
Training loss per 100 training steps: 0.02205314114689827
Training loss per 100 training steps: 0.03103242619516383
Training loss per 100 training steps: 0.030655873944028397
Training loss per 100 training steps: 0.031153846671340673
Training loss



              precision    recall  f1-score   support

           _       0.75      0.87      0.81      1540

   micro avg       0.75      0.87      0.81      1540
   macro avg       0.75      0.87      0.81      1540
weighted avg       0.75      0.87      0.81      1540

-----------------------------------------------------------------

WRITING JNLPBA to HUGGING FACE.... 





model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

JNLPBA DONE!
*******************************************************

TRAINING ON BC5CDR_D.... 



Some weights of BertForTokenClassification were not initialized from the model checkpoint at michiyasunaga/BioLinkBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training epoch: 1
Training loss per 100 training steps: 1.1921664476394653
Training loss per 100 training steps: 0.19504918755576162
Training loss per 100 training steps: 0.1237466214919713
Training loss per 100 training steps: 0.09465408343295768
Training loss per 100 training steps: 0.07793906165932554
Training loss per 100 training steps: 0.06675603235523382
Training loss per 100 training steps: 0.058775109104105515
Training loss per 100 training steps: 0.052916566412510635
Training loss per 100 training steps: 0.04807612536964559
Training loss per 100 training steps: 0.04438873786855485
Training loss per 100 training steps: 0.04162961086067562
Training loss per 100 training steps: 0.039022430729439866
Training loss per 100 training steps: 0.03689561787272317
Training loss per 100 training steps: 0.03491049587000362
Training loss per 100 training steps: 0.0334259916668117
Training loss epoch: 0.032369594180616676
Training accuracy epoch: 0.962701069396773
Training epoch: 2
Training 



              precision    recall  f1-score   support

           _       0.74      0.90      0.81      1068

   micro avg       0.74      0.90      0.81      1068
   macro avg       0.74      0.90      0.81      1068
weighted avg       0.74      0.90      0.81      1068

-----------------------------------------------------------------

WRITING BC5CDR_D to HUGGING FACE.... 





model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

BC5CDR_D DONE!
*******************************************************

TRAINING ON BC5CDR_C.... 



Some weights of BertForTokenClassification were not initialized from the model checkpoint at michiyasunaga/BioLinkBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training epoch: 1
Training loss per 100 training steps: 1.6407212018966675
Training loss per 100 training steps: 0.26686726999238575
Training loss per 100 training steps: 0.16135401115636921
Training loss per 100 training steps: 0.11922961963414898
Training loss per 100 training steps: 0.09497336642305712
Training loss per 100 training steps: 0.08014321427080341
Training loss per 100 training steps: 0.06874551738344269
Training loss per 100 training steps: 0.060616411279339595
Training loss per 100 training steps: 0.055379612633223635
Training loss per 100 training steps: 0.05015033614831035
Training loss per 100 training steps: 0.04619440349766968
Training loss per 100 training steps: 0.04286283658196529
Training loss per 100 training steps: 0.04006706548701169
Training loss per 100 training steps: 0.03772019185673454
Training loss per 100 training steps: 0.03545911860141071
Training loss epoch: 0.033999689826521566
Training accuracy epoch: 0.9686870152528538
Training epoch: 2
Trainin



              precision    recall  f1-score   support

           _       0.93      0.97      0.95      2968

   micro avg       0.93      0.97      0.95      2968
   macro avg       0.93      0.97      0.95      2968
weighted avg       0.93      0.97      0.95      2968

-----------------------------------------------------------------

WRITING BC5CDR_C to HUGGING FACE.... 





model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

BC5CDR_C DONE!
*******************************************************

TRAINING ON BC2GM.... 



Some weights of BertForTokenClassification were not initialized from the model checkpoint at michiyasunaga/BioLinkBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training epoch: 1
Training loss per 100 training steps: 1.226596474647522
Training loss per 100 training steps: 0.25544786681928255
Training loss per 100 training steps: 0.1593563329956647
Training loss per 100 training steps: 0.12100893995443056
Training loss per 100 training steps: 0.10052947674824524
Training loss per 100 training steps: 0.08751540510318503
Training loss per 100 training steps: 0.07885588991599848
Training loss per 100 training steps: 0.07186227890771792
Training loss epoch: 0.06709787002216104
Training accuracy epoch: 0.9295442946203782
Training epoch: 2
Training loss per 100 training steps: 0.022347087040543556
Training loss per 100 training steps: 0.02014253445495252
Training loss per 100 training steps: 0.021057491369119184
Training loss per 100 training steps: 0.02121947010773299
Training loss per 100 training steps: 0.02087391652410706
Training loss per 100 training steps: 0.020602291996429066
Training loss per 100 training steps: 0.020215552622923464
Training



              precision    recall  f1-score   support

           _       0.81      0.89      0.85      2445

   micro avg       0.81      0.89      0.85      2445
   macro avg       0.81      0.89      0.85      2445
weighted avg       0.81      0.89      0.85      2445

-----------------------------------------------------------------

WRITING BC2GM to HUGGING FACE.... 





model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

BC2GM DONE!
*******************************************************



Note that there's another way to easily perform quick inference with a trained model: the [pipeline API](https://huggingface.co/docs/transformers/main_classes/pipelines). The pipeline API abstracts away all the complexity for you (basically performing what we did above). Here, we'll use the [TokenClassificationPipeline](https://huggingface.co/docs/transformers/v4.17.0/en/main_classes/pipelines#transformers.TokenClassificationPipeline) since that's the task we're doing, and we provide a model and tokenizer.