# Environment set up

In [None]:
!pip install transformers seqeval[gpu]

Collecting seqeval[gpu]
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m610.3 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=a4cdaad9b8a4149da4d6e1dc8117816d3e92f6db388565b95becb3eab9ab35cb
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForTokenClassification
from google.colab import drive

In [None]:
from torch import cuda
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
print(device)

cuda


In [None]:
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
cd drive/MyDrive/'6.8611 Research Project'/'Colab Notebooks'

/content/drive/.shortcut-targets-by-id/1vdEcgdXIfpnlORVlPsJtHUmKXSAqr69R/6.8611 Research Project/Colab Notebooks


In [None]:
ls

 BC5CDR-D_devel_1.csv                Intrinsic_token.ipynb
 BC5CDR-D_devel_2.csv                [0m[01;34mllm-annotations[0m/
 Data-cleaning.ipynb                ' NER with BERT.ipynb'
 [01;34mdevel_gpt_generated_datasets[0m/       openai-test.ipynb
 Fine-Tuning-Few-Shot.ipynb          retry_prompts.gsheet
 Fine-Tuning-Human-Annotated.ipynb   RW-Fine-Tuning-Human-Annotated.ipynb
 Fine-Tuning-One-Shot.ipynb          [01;34msft_training_data[0m/
 Fine-Tuning-Zero-Shot.ipynb         TEST_LABEL_BUGS.ipynb
 GPT-Finetuning.ipynb                tokens_labels.csv
 [01;34mintrinsic_data[0m/                     Untitled
 intrinsic_eval_entity.ipynb         zero-shot-bc5cdr-chem.pynb
 intrinsic_eval.ipynb               'zero_shot[FASTER].ipynb'
'Intrinsic Eval Precision.ipynb'     zero-shot.pynb
 Intrinsic_one.ipynb


# Downloading and preprocessing zero, one, few-shot data


In [None]:
def load_tsv_dataset(file_path):
  """
  Read a headerless, tab-separated file into a DataFrame.

  The file is parsed with header=None, so its first line is kept as data.
  The two positional columns are then named 'token' and 'label' (the file is
  expected to contain exactly two columns). The first rows are printed as a
  quick visual check before the frame is returned.
  """
  frame = pd.read_csv(file_path, sep='\t', header=None, engine='python')
  frame.columns = ['token', 'label']
  print(frame.head())
  return frame

def load_csv_dataset(file_path):
  """
  Read a headerless, comma-separated file into a DataFrame.

  The file is parsed with header=None, so its first line is kept as data.
  The two positional columns are then named 'token' and 'label' (the file is
  expected to contain exactly two columns). The first rows are printed as a
  quick visual check before the frame is returned.
  """
  frame = pd.read_csv(file_path, header=None, engine='python')
  frame.columns = ['token', 'label']
  print(frame.head())
  return frame

In [None]:
def _normalize_label(label):
  """Map one raw label to 'B', 'I' or 'O' (see `clean` for the rules)."""
  if not isinstance(label, str):
    # Missing values (NaN) and other non-string cells carry no tag information.
    return 'O'
  if 'B' in label:
    return 'B'
  if 'I' in label:
    return 'I'
  return 'O'

def clean(df):
  """
  Normalize the 'label' column of `df` to the plain tags 'B', 'I' and 'O'.

  Rules, applied in this order to every row:
    * a label containing 'B' becomes 'B' (so 'B' itself is unchanged);
    * otherwise a label containing 'I' becomes 'I';
    * anything else, including missing or non-string values, becomes 'O'.

  The column is rewritten on the frame that was passed in, and that same
  frame is returned, so `df = clean(df)` and a bare `clean(df)` are equivalent.
  """
  # Build the whole column in one pass instead of writing cell-by-cell via df.loc.
  df['label'] = [_normalize_label(label) for label in df['label']]
  return df

In [None]:
# CHOOSE whether to train the BPLM on zero, one, or few-shot data using the variable 'shot_choice'

shot_choice = 'few_shot' # All possibilities: ['zero_shot', 'one_shot', 'few_shot']

# For every corpus two frames are loaded:
#   *_devel_df : the generated devel CSV for the chosen shot setting (read with its
#                header row, label column normalized to B/I/O by clean())
#   *_test_df  : the corpus test.tsv (headerless, columns named token/label)

NCBI_devel = f'devel_gpt_generated_datasets/{shot_choice}/NCBI-disease-devel.csv'
NCBI_devel_df = pd.read_csv(NCBI_devel)
NCBI_devel_df = clean(NCBI_devel_df)
print(NCBI_devel_df.head())
NCBI_test = 'llm-annotations/datasets/NCBI-disease/test.tsv'
NCBI_test_df = load_tsv_dataset(NCBI_test)

JNLPBA_devel = f'devel_gpt_generated_datasets/{shot_choice}/JNLPBA-devel - JNLPBA-devel.csv'
JNLPBA_devel_df = pd.read_csv(JNLPBA_devel)
# NOTE(review): these row labels are hard-coded; presumably they only match the
# NaN rows of one particular export of this file - confirm when changing shot_choice.
JNLPBA_devel_df = JNLPBA_devel_df.drop([40396, 40397, 40398]) # remove nan
JNLPBA_devel_df = JNLPBA_devel_df.drop([59592, 59593, 59594]) # remove nan
JNLPBA_devel_df = clean(JNLPBA_devel_df)
print(JNLPBA_devel_df.head())
JNLPBA_test = 'llm-annotations/datasets/JNLPBA/test.tsv'
JNLPBA_test_df = load_tsv_dataset(JNLPBA_test)

BC5CDR_D_devel = f'devel_gpt_generated_datasets/{shot_choice}/BC5CDR-disease-devel.csv'
# Read from the path variable (the frame of the same stem does not exist yet at this point).
BC5CDR_D_devel_df = pd.read_csv(BC5CDR_D_devel)
BC5CDR_D_devel_df = clean(BC5CDR_D_devel_df)
print(BC5CDR_D_devel_df.head())
BC5CDR_D_test = 'llm-annotations/datasets/BC5CDR-disease/test.tsv'
BC5CDR_D_test_df = load_tsv_dataset(BC5CDR_D_test)

BC5CDR_C_devel = f'devel_gpt_generated_datasets/{shot_choice}/BC5CDR-chem-devel.csv'
# The chem data gets its own frame; it must not overwrite the disease frame above.
BC5CDR_C_devel_df = pd.read_csv(BC5CDR_C_devel)
BC5CDR_C_devel_df = clean(BC5CDR_C_devel_df)
print(BC5CDR_C_devel_df.head())
BC5CDR_C_test = 'llm-annotations/datasets/BC5CDR-chem/test.tsv'
BC5CDR_C_test_df = load_tsv_dataset(BC5CDR_C_test)

BC2GM_devel = f'devel_gpt_generated_datasets/{shot_choice}/BC2GM-devel.csv'
BC2GM_devel_df = pd.read_csv(BC2GM_devel)
BC2GM_devel_df = clean(BC2GM_devel_df)
print(BC2GM_devel_df.head())
BC2GM_test = 'llm-annotations/datasets/BC2GM/test.tsv'
BC2GM_test_df = load_tsv_dataset(BC2GM_test)

In [None]:
def tokens_to_sentences(labeled_df):
  """
  Group a one-token-per-row frame into sentences.

  `labeled_df` must provide a 'token' and a 'label' column of equal length.
  A sentence is closed after every token that equals '.', so a '.' inside a
  number or abbreviation that was tokenized separately also ends a sentence.
  Tokens left over after the last '.' are emitted as a final sentence instead
  of being discarded.

  Returns a tuple (sentences, sentence_labels):
    sentences       : list of str, the tokens of each sentence joined by spaces
    sentence_labels : list of str, the labels of each sentence joined by commas
  """
  tokens = labeled_df['token'].tolist()
  labels = labeled_df['label'].tolist()

  sentences = [] # List[str]: each element is a sentence string
  sentence_labels = [] # List[str]: each element is a string of comma-separated labels corresponding to a sentence

  current_sentence = []
  current_sentence_labels = []

  def flush():
    # Close the sentence collected so far and start a new, empty one.
    sentences.append(' '.join(current_sentence))
    sentence_labels.append(','.join(current_sentence_labels))
    current_sentence.clear()
    current_sentence_labels.clear()

  for t, l in zip(tokens, labels):
    current_sentence.append(str(t)) # str() because non-string tokens (e.g. NaN) cannot be joined
    current_sentence_labels.append(l)

    if t == '.':
      flush()

  # The input does not have to end with '.', so keep any trailing tokens.
  if current_sentence:
    flush()

  return sentences, sentence_labels


Let's have a look at the different NER tags.

We create 2 dictionaries: one that maps individual tags to indices, and one that maps indices to their individual tags. This is necessary in order to create the labels (as computers work with numbers = indices, rather than words = tags) - see further in this notebook.

In [None]:
# Tag -> index map used to turn B/I/O labels into integer training targets.
label2id = {'B': 0, 'I': 1, 'O': 2}
# Index -> tag map, derived from label2id so the two can never disagree.
# Index 2 must map to the letter 'O' (outside), not the digit '0'.
id2label = {index: tag for tag, index in label2id.items()}

print(label2id)
print(id2label)

{'B': 0, 'I': 1, 'O': 2}
{0: 'B', 1: 'I', 2: '0'}


In [None]:
# NCBI: one row per sentence, with the space-joined tokens and comma-joined labels.
sentences, labels = tokens_to_sentences(NCBI_devel_df)
NCBI_train_data = pd.DataFrame({
    'sentence': sentences,
    'word_labels': labels
    })

print('Number of train sentences: ', len(NCBI_train_data))

sentences, labels = tokens_to_sentences(NCBI_test_df)
NCBI_test_data = pd.DataFrame({
    'sentence': sentences,
    'word_labels': labels
    })

# This count is for the test split, so label it as such.
print('Number of test sentences: ', len(NCBI_test_data))


Number of train sentences:  957
Number of train sentences:  1042


In [None]:
# JNLPBA: one row per sentence, with the space-joined tokens and comma-joined labels.
sentences, labels = tokens_to_sentences(JNLPBA_devel_df)
JNLPBA_train_data = pd.DataFrame({
    'sentence': sentences,
    'word_labels': labels
    })

print('Number of train sentences: ', len(JNLPBA_train_data))

sentences, labels = tokens_to_sentences(JNLPBA_test_df)
JNLPBA_test_data = pd.DataFrame({
    'sentence': sentences,
    'word_labels': labels
    })

# This count is for the test split, so label it as such.
print('Number of test sentences: ', len(JNLPBA_test_data))

Number of train sentences:  4166
Number of train sentences:  4288


In [None]:
# BC5CDR-disease: one row per sentence, with the space-joined tokens and comma-joined labels.
sentences, labels = tokens_to_sentences(BC5CDR_D_devel_df)
BC5CDR_D_train_data = pd.DataFrame({
    'sentence': sentences,
    'word_labels': labels
    })

print('Number of train sentences: ', len(BC5CDR_D_train_data))

sentences, labels = tokens_to_sentences(BC5CDR_D_test_df)
BC5CDR_D_test_data = pd.DataFrame({
    'sentence': sentences,
    'word_labels': labels
    })

# This count is for the test split, so label it as such.
print('Number of test sentences: ', len(BC5CDR_D_test_data))

Number of train sentences:  5753
Number of train sentences:  6488


In [None]:
# BC5CDR-chem: one row per sentence, with the space-joined tokens and comma-joined labels.
sentences, labels = tokens_to_sentences(BC5CDR_C_devel_df)
BC5CDR_C_train_data = pd.DataFrame({
    'sentence': sentences,
    'word_labels': labels
    })

print('Number of train sentences: ', len(BC5CDR_C_train_data))

sentences, labels = tokens_to_sentences(BC5CDR_C_test_df)
BC5CDR_C_test_data = pd.DataFrame({
    'sentence': sentences,
    'word_labels': labels
    })

# This count is for the test split, so label it as such.
print('Number of test sentences: ', len(BC5CDR_C_test_data))

Number of train sentences:  5730
Number of train sentences:  6488


In [None]:
# BC2GM: one row per sentence, with the space-joined tokens and comma-joined labels.
sentences, labels = tokens_to_sentences(BC2GM_devel_df)
BC2GM_train_data = pd.DataFrame({
    'sentence': sentences,
    'word_labels': labels
    })

print('Number of train sentences: ', len(BC2GM_train_data))

sentences, labels = tokens_to_sentences(BC2GM_test_df)
BC2GM_test_data = pd.DataFrame({
    'sentence': sentences,
    'word_labels': labels
    })

# This count is for the test split, so label it as such.
print('Number of test sentences: ', len(BC2GM_test_data))

Number of train sentences:  3054
Number of train sentences:  4741


Let's verify that a random sentence and its corresponding tags are correct:

In [None]:
# Pick the sentence at position 16 of the NCBI training frame for a spot check.
sentence = NCBI_train_data['sentence'].iloc[16]
sentence

'11 times greater for BRCA1 carriers harbouring one or two rare HRAS1 alleles , compared to carriers with only common alleles ( P = 0 .'

In [None]:
# Comma-joined labels of the same row, to compare against the sentence above.
NCBI_train_data['word_labels'].iloc[16]

'O,O,O,O,O,O,O,O,O,O,O,B,O,O,O,O,O,O,O,O,O,O,O,O,O,O'

#### **Preparing the dataset and dataloader**

Now that our data is preprocessed, we can turn it into PyTorch tensors such that we can provide it to the model. Let's start by defining some key variables that will be used later on in the training/evaluation process:

In [None]:
from transformers import AutoTokenizer

# Fixed length, in wordpieces, of every example produced by the `dataset` class
# (special tokens included): shorter inputs are padded, longer ones truncated.
# NOTE(review): the length scan in a later cell reports 240 wordpieces for the
# longest training sentence before [CLS]/[SEP] are added, so that sentence is
# truncated at this setting - confirm 240 is intended.
MAX_LEN = 240
# Batch size stored in train_params.
TRAIN_BATCH_SIZE = 4
# Batch size stored in test_params.
VALID_BATCH_SIZE = 2
# Number of passes over the training loader in the fine-tuning pipeline.
EPOCHS = 3
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 1e-05
# max_norm passed to clip_grad_norm_ in train().
MAX_GRAD_NORM = 10
# Tokenizer matching the BioLinkBERT-base checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained('michiyasunaga/BioLinkBERT-base')
# Alternative tokenizer (not used): BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/379 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/447k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

A tricky part of NER with BERT is that BERT relies on **wordpiece tokenization**, rather than word tokenization. This means that we should also define the labels at the wordpiece-level, rather than the word-level!

For example, if you have word like "Washington" which is labeled as "b-gpe", but it gets tokenized to "Wash", "##ing", "##ton", then we will have to propagate the word’s original label to all of its wordpieces: "b-gpe", "b-gpe", "b-gpe". The model should be able to produce the correct labels for each individual wordpiece. The function below (taken from [here](https://github.com/chambliss/Multilingual_NER/blob/master/python/utils/main_utils.py#L118)) implements this.






In [None]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Tokenize a sentence into wordpieces while keeping labels aligned.

    `sentence` is split on whitespace into words and `text_labels` on commas
    into one label per word. Each word is tokenized on its own and its label
    is repeated once for every wordpiece it produces. Words and labels are
    paired positionally; pairing stops at the shorter of the two lists.

    Returns (wordpieces, wordpiece_labels), two lists of equal length.
    """
    words = sentence.strip().split()
    word_labels = text_labels.split(",")

    wordpieces = []
    wordpiece_labels = []
    for word, label in zip(words, word_labels):
        pieces = tokenizer.tokenize(word)
        wordpieces += pieces
        # One copy of the word-level label per wordpiece of this word.
        wordpiece_labels += [label] * len(pieces)

    return wordpieces, wordpiece_labels

In [None]:
# Scan every training sentence to find the longest wordpiece sequence (before
# [CLS]/[SEP] are added) and to flag rows whose word and label counts disagree.
# Names are chosen so that neither the builtin `max` nor the `dataset` class is shadowed.
max_wordpiece_len = 0

train_frames = {
    'NCBI': NCBI_train_data,
    'JNLPBA': JNLPBA_train_data,
    'BC5CDR_D': BC5CDR_D_train_data,
    'BC5CDR_C': BC5CDR_C_train_data,
    'BC2GM': BC2GM_train_data
    }
for frame_name, frame in train_frames.items():

  for row_idx, row in frame.iterrows():
    # tokenize_and_preserve_labels pairs words and labels positionally, so a count
    # mismatch here means labels would be silently dropped or shifted.
    n_words = len(row['sentence'].strip().split())
    n_labels = len(row['word_labels'].split(','))
    if n_words != n_labels:
      print(f'Word/label count mismatch in {frame_name} row {row_idx}: {n_words} words vs {n_labels} labels')

    wordpieces, _ = tokenize_and_preserve_labels(row['sentence'], row['word_labels'], tokenizer)
    if len(wordpieces) > max_wordpiece_len:
      max_wordpiece_len = len(wordpieces)

max_wordpiece_len

240

Note that this is a **design decision**. You could also decide to only label the first wordpiece of each word and let the model only learn this (this is what was done in the original BERT paper, see Github discussion [here](https://github.com/huggingface/transformers/issues/64#issuecomment-443703063)). Another design decision could be to give the first wordpiece of each word the original word label, and then use the label “X” for all subsequent subwords of that word.

All of them lead to good performance.

Next, we define a regular PyTorch [dataset class](https://pytorch.org/docs/stable/data.html) (which transforms examples of a dataframe to PyTorch tensors). Here, each sentence gets tokenized, the special tokens that BERT expects are added, the tokens are padded or truncated based on the max length of the model, the attention mask is created and the labels are created based on the dictionary which we defined above.

For more information about BERT's inputs, see [here](https://huggingface.co/transformers/glossary.html).  

In [None]:
class dataset(Dataset):
    """
    PyTorch dataset that turns one row of a sentence frame into model inputs.

    Parameters
    ----------
    dataframe : frame with a 'sentence' column (space-separated words) and a
        'word_labels' column (comma-separated labels, one per word).
    tokenizer : object providing `tokenize` and `convert_tokens_to_ids`.
    max_len   : fixed length of every returned tensor, special tokens included
        (expected to be at least 2).

    Each item is a dict of three LongTensors of length `max_len`:
      'ids'     - wordpiece ids for [CLS] + sentence + [SEP] + padding
      'mask'    - 1 for real tokens (including [CLS]/[SEP]), 0 for padding
      'targets' - label ids via `label2id`; [CLS], [SEP] and padding are 'O'

    NOTE(review): padding positions are labelled 'O' rather than an ignore
    index; whether they contribute to the loss depends on how the model
    computes it - confirm.
    """
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # step 1: tokenize (and adapt corresponding labels)
        # Positional lookup: the sampler hands out positions 0..len-1, which only
        # coincide with index labels when the frame has a default index.
        row = self.data.iloc[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(row['sentence'], row['word_labels'], self.tokenizer)

        # step 2: truncate the sentence body so that [CLS] and [SEP] always fit,
        # then add the special tokens with an outside label each
        body_budget = self.max_len - 2
        tokenized_sentence = ["[CLS]"] + tokenized_sentence[:body_budget] + ["[SEP]"]
        # [SEP]'s label goes at the very end so it lines up with the [SEP] token.
        labels = ["O"] + labels[:body_budget] + ["O"]

        # step 3: pad up to max_len
        n_real = len(tokenized_sentence)
        n_pad = self.max_len - n_real
        tokenized_sentence = tokenized_sentence + ['[PAD]'] * n_pad
        labels = labels + ["O"] * n_pad

        # step 4: obtain the attention mask from the lengths (not from token text,
        # so a literal '[PAD]' inside the sentence is not masked out)
        attn_mask = [1] * n_real + [0] * n_pad

        # step 5: convert tokens to input ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)

        label_ids = [label2id[label] for label in labels]

        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        return self.len

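Now, based on the class we defined above, we can create 2 datasets per corpus, one for training and one for testing. No random split is performed: the training set comes from the GPT-annotated devel file and the test set from the corpus's own test file: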

In [None]:
dataset_names = [
    'NCBI',
    'JNLPBA',
    'BC5CDR_D',
    'BC5CDR_C',
    'BC2GM'
    ]

train_datasets = [
    NCBI_train_data,
    JNLPBA_train_data,
    BC5CDR_D_train_data,
    BC5CDR_C_train_data,
    BC2GM_train_data
    ]

test_datasets = [
    NCBI_test_data,
    JNLPBA_test_data,
    BC5CDR_D_test_data,
    BC5CDR_C_test_data,
    BC2GM_test_data
    ]

# Corpora to fine-tune in this run, chosen by name. The inspection cells below
# index datasets['BC5CDR_C'], so keep that corpus selected (or update those cells).
# Use list(dataset_names) to process every corpus.
selected_datasets = ['BC5CDR_C']

# datasets[name] = [train dataset, test dataset]
datasets = {}
for name, train_frame, test_frame in zip(dataset_names, train_datasets, test_datasets):
  if name not in selected_datasets:
    continue
  datasets[name] = [dataset(train_frame, tokenizer, MAX_LEN), dataset(test_frame, tokenizer, MAX_LEN)]

for name, sets in datasets.items():
  print(f"{name} TRAIN Dataset: {sets[0].len}")
  print(f"{name} TEST Dataset: {sets[1].len}")
  print()

BC5CDR_C TRAIN Dataset: 5730
BC5CDR_C TEST Dataset: 6488



Let's have a look at one training example (the item at index 1 of the training split):

In [None]:
# [0] selects the training dataset of the pair, [1] the item at index 1 in it.
datasets['BC5CDR_C'][0][1]

{'ids': tensor([    2,  2645,    29, 26402,  2543, 15048,  2727,  2182,  1685, 10872,
          4609,    11, 10685,    12,  1682,  1808,  1715,  3296,  3624,  2027,
          2029,  2874, 21718,    15,  2574, 21803,  1690,    18,  1781,  4096,
          6082,  1685,  2951,  9037,    15,  1950,  2056,  2840,  1701,  1800,
          3116,  2951,  2174,    17,     3,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,  

Let's verify that the input ids and corresponding targets are correct:

In [None]:
# Input ids of the same training item (index 1 of the training dataset).
datasets['BC5CDR_C'][0][1]["ids"]

tensor([    2,  2645,    29, 26402,  2543, 15048,  2727,  2182,  1685, 10872,
         4609,    11, 10685,    12,  1682,  1808,  1715,  3296,  3624,  2027,
         2029,  2874, 21718,    15,  2574, 21803,  1690,    18,  1781,  4096,
         6082,  1685,  2951,  9037,    15,  1950,  2056,  2840,  1701,  1800,
         3116,  2951,  2174,    17,     3,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [None]:
# Show the first 30 wordpieces of the training item at index 1 next to their labels.
example = datasets['BC5CDR_C'][0][1]
preview_tokens = tokenizer.convert_ids_to_tokens(example["ids"][:30])
preview_targets = example["targets"][:30]
for token, label in zip(preview_tokens, preview_targets):
  print('{0:10}  {1}'.format(token, id2label[label.item()]))

[CLS]       0
background  0
:           0
calcitriol  B
therapy     0
suppresses  0
serum       0
levels      0
of          0
parathyroid  0
hormone     0
(           0
pth         0
)           0
in          0
patients    0
with        0
renal       0
failure     0
but         0
has         0
several     0
drawbacks   0
,           0
including   0
hypercalcemia  0
and         0
/           0
or          0
marked      0


Now, let's define the corresponding PyTorch dataloaders:

In [None]:
# DataLoader settings for training: shuffle so each epoch sees a new batch order.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# DataLoader settings for evaluation: keep the original order so runs are repeatable.
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

In [None]:
# dataset_loaders[name] = [train DataLoader, test DataLoader]
dataset_loaders = {}

for key, sets in datasets.items():
  # The test split uses test_params (its own batch size), not the training settings.
  dataset_loaders[key] = [DataLoader(sets[0], **train_params), DataLoader(sets[1], **test_params)]

In [None]:
# Maps each corpus name to its [train DataLoader, test DataLoader] pair.
dataset_loaders

{'BC5CDR_C': [<torch.utils.data.dataloader.DataLoader at 0x7fc8d7fc6950>,
  <torch.utils.data.dataloader.DataLoader at 0x7fc8d7fc4b50>]}

#### **Training the model**

Before training the model, let's perform a sanity check, which I learned thanks to Andrej Karpathy's wonderful [cs231n course](http://cs231n.stanford.edu/) at Stanford (see also his [blog post about debugging neural networks](http://karpathy.github.io/2019/04/25/recipe/)). The initial loss of your model should be close to -ln(1/number of classes) = -ln(1/3) = 1.10.

Why? Because we are using cross entropy loss. The cross entropy loss is defined as -ln(probability score of the model for the correct class). In the beginning, the weights are random, so the probability distribution for all of the classes for a given token will be uniform, meaning that the probability for the correct class will be near 1/3. The loss for a given token will thus be -ln(1/3).

Let's verify this:



In [None]:
# Defining the training function for tuning the bert model
def train(epoch, training_loader):
    """
    Run one training epoch over `training_loader`.

    Uses the module-level `model`, `optimizer`, `device` and `MAX_GRAD_NORM`.
    Every batch must be a dict with 'ids', 'mask' and 'targets' tensors.
    The running mean loss is printed every 100 steps; the epoch's mean loss
    and mean per-batch token accuracy are printed at the end. Accuracy is
    measured on positions where the attention mask is 1, which includes the
    [CLS] and [SEP] positions.

    `epoch` is accepted for the caller's bookkeeping and is not used here.
    Returns None.
    """
    tr_loss, tr_accuracy = 0, 0
    nb_tr_steps = 0
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)
        targets = batch['targets'].to(device, dtype = torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs.loss, outputs.logits
        tr_loss += loss.item()

        nb_tr_steps += 1

        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy
        flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, dim=1) # shape (batch_size * seq_len,)
        # use the mask to compare only non-padding positions (includes [CLS] and [SEP])
        active_positions = mask.view(-1) == 1 # shape (batch_size * seq_len,)
        active_targets = torch.masked_select(flattened_targets, active_positions)
        active_predictions = torch.masked_select(flattened_predictions, active_positions)

        tr_accuracy += accuracy_score(active_targets.cpu().numpy(), active_predictions.cpu().numpy())

        # backward pass
        optimizer.zero_grad()
        loss.backward()

        # gradient clipping: must run after backward() so it acts on this step's
        # gradients, and before step() so the clipped values are the ones applied
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )

        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

#### **Evaluating the model**

Now that we've trained our model, we can evaluate its performance on the held-out test set. Note that here, no gradient updates are performed, the model just outputs its logits.

In [None]:
def valid(model, testing_loader):
    """
    Evaluate `model` on `testing_loader` without updating any weights.

    Every batch must be a dict with 'ids', 'mask' and 'targets' tensors; the
    module-level `device` and `id2label` are used. The running mean loss is
    printed every 100 steps, and the mean loss and mean per-batch token
    accuracy at the end. Only positions where the attention mask is 1 are
    scored, which includes the [CLS] and [SEP] positions.

    Returns (labels, predictions): two flat lists of tag strings of equal
    length, covering every scored position of every batch in loader order.
    """
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):

            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            targets = batch['targets'].to(device, dtype = torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
            loss, eval_logits = outputs.loss, outputs.logits

            eval_loss += loss.item()

            nb_eval_steps += 1

            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # compute evaluation accuracy
            flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, dim=1) # shape (batch_size * seq_len,)
            # use the mask to compare only non-padding positions (includes [CLS] and [SEP])
            active_positions = mask.view(-1) == 1 # shape (batch_size * seq_len,)
            active_targets = torch.masked_select(flattened_targets, active_positions).cpu()
            active_predictions = torch.masked_select(flattened_predictions, active_positions).cpu()

            # Move each batch off the device once and keep plain ints, instead of
            # storing one 0-d tensor per token and calling .item() on each later.
            eval_labels.extend(active_targets.tolist())
            eval_preds.extend(active_predictions.tolist())

            eval_accuracy += accuracy_score(active_targets.numpy(), active_predictions.numpy())

    labels = [id2label[label_id] for label_id in eval_labels]
    predictions = [id2label[label_id] for label_id in eval_preds]

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

The accuracy metric is misleading, as a lot of labels are "outside" (O), even after omitting predictions on the [PAD] tokens. What is important is looking at the precision, recall and f1-score of the individual tags. For this, we use the seqeval Python library:

# FINE-TUNING PIPELINE

In this pipeline, we load in the model to the GPU, train it on the training datasets, evaluate it on the validation set and provide a confusion matrix report, and then push the fine-tuned model to our [HuggingFace page](https://huggingface.co/68611-llm-annotation-group).

1. Define the model
Here we define the model, either BioGPT or BioLinkBERT-base (this notebook uses BioLinkBERT-base), and load it with the pretrained weights of "michiyasunaga/BioLinkBERT-base". The only thing we need to additionally specify is the number of labels (as this will determine the architecture of the classification head). Note that only the base layers are initialized with the pretrained weights. The token classification head on top has just randomly initialized weights, which we will train, together with the pretrained weights, using our labelled dataset. Then, we move the model to the GPU.

2. Train the model

3. Evaluate the model

In [None]:
!sudo apt-get install git-lfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 15 not upgraded.


In [None]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import AutoTokenizer, AutoModel, AutoModelForTokenClassification
from seqeval.metrics import classification_report

In [None]:
# For every selected corpus: load a fresh BioLinkBERT-base token classifier,
# fine-tune it for EPOCHS epochs, report seqeval metrics on the test loader,
# then push the tokenizer and the model to the Hugging Face Hub.
HUB_ORGANIZATION = '68611-llm-annotation-group'

for key, loaders in dataset_loaders.items():

  print(f'TRAINING ON {key}.... \n')

  training_loader = loaders[0]
  testing_loader = loaders[1]

  tokenizer = AutoTokenizer.from_pretrained('michiyasunaga/BioLinkBERT-base')
  # train() reads the module-level names `model` and `optimizer`, so they are
  # rebound here for each corpus.
  model = AutoModelForTokenClassification.from_pretrained('michiyasunaga/BioLinkBERT-base',
                                    num_labels=len(id2label),
                                    id2label=id2label,
                                    label2id=label2id)
  model.to(device)
  optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

  for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch, training_loader)

  print(f'EVALUATING {key}.... \n')
  labels, predictions = valid(model, testing_loader)
  print()
  print('-----------------------------------------------------------------')
  # All evaluated positions are passed to seqeval as one long tag sequence.
  print(classification_report([labels], [predictions]))
  print('-----------------------------------------------------------------')
  print()
  print(f'WRITING {key} to HUGGING FACE.... \n')

  # NOTE(review): the "-FS" suffix is fixed; presumably it stands for few-shot, so
  # confirm the name before running with a different shot_choice.
  model_name = f"BioLinkBERT-base-NER-{key}-FS"
  # The organization is given as the namespace part of repo_id; the separate
  # `organization` keyword is deprecated.
  hub_repo_id = f"{HUB_ORGANIZATION}/{model_name}"

  # upload files to the hub
  tokenizer.push_to_hub(
      repo_id=hub_repo_id,
      commit_message="Add tokenizer",
      use_temp_dir=True,
  )
  model.push_to_hub(
      repo_id=hub_repo_id,
      commit_message="Add model",
      use_temp_dir=True,
  )
  print(f'{key} DONE!')
  print('*******************************************************')
  print()


TRAINING ON BC5CDR_C.... 



config.json:   0%|          | 0.00/559 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at michiyasunaga/BioLinkBERT-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training epoch: 1
Training loss per 100 training steps: 0.9395022988319397
Training loss per 100 training steps: 0.14430123698519598
Training loss per 100 training steps: 0.09267221175745796
Training loss per 100 training steps: 0.07163017439931334
Training loss per 100 training steps: 0.05953716117589086
Training loss per 100 training steps: 0.05254205404232422
Training loss per 100 training steps: 0.047153706933928036
Training loss per 100 training steps: 0.043184781132847835
Training loss per 100 training steps: 0.03994780624507416
Training loss per 100 training steps: 0.03744988606508292
Training loss per 100 training steps: 0.035439834028103766
Training loss per 100 training steps: 0.033699764329902016
Training loss per 100 training steps: 0.03240250883347508
Training loss per 100 training steps: 0.031101524086005493
Training loss per 100 training steps: 0.030143560938152226
Training loss epoch: 0.029843116915295394
Training accuracy epoch: 0.9209303096644981
Training epoch: 2
Tra



              precision    recall  f1-score   support

           _       0.63      0.91      0.75      2968

   micro avg       0.63      0.91      0.75      2968
   macro avg       0.63      0.91      0.75      2968
weighted avg       0.63      0.91      0.75      2968

-----------------------------------------------------------------

WRITING BC5CDR_C to HUGGING FACE.... 





model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

BC5CDR_C DONE!
*******************************************************

