# HuggingFace Transformers on SciBERT (NER)

In [1]:
import transformers
import pandas as pd
import os
import numpy as np
import torch
import pandas as pd
import csv
import re
from collections import OrderedDict

In [2]:
tokenizer = transformers.AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
# model = transformers.AutoModelForTokenClassification.from_pretrained('allenai/scibert_scivocab_uncased', num_labels=len(label_list))

# 1. Data Pre-Processing

An overview of the data pre-processing pipeline:
1. txt files (train, test, dev)
2. primary data (X_primary) =  dict of (id: list), (token: list), and (ner_tag: list) for every sentence; grouped by key (not sentence).
3. tokenized data (X_tokenized) = 'dict' of token_id, label, and attention_mask; grouped by key.
4. Dataset objects (X_data) = a 'list' of token_id, label, and attention mask; grouped by sentence!

In [3]:
# Input/Output Args
DATA_DIR: str = "./data/ner_chemprot/"
DATA_FILES: dict = {
    "train": DATA_DIR + 'train.txt', 
    "test": DATA_DIR + 'test.txt', 
    "val": DATA_DIR + 'dev.txt'
}
label_list = ['O',
          'B-enzyme',
          'B-SUBSTRATE',
          'I-SUBSTRATE',
          'B-PRODUCT-OF',
          'I-enzyme',
          'I-PRODUCT-OF'
         ]

## 1.1-2 Loading the chemprot data from SciBERT into Primary Data


In [4]:
def txt2primary(fname) -> OrderedDict:
    # initialize primary data dict
    primary_data = OrderedDict()
    primary_data['id'] =  []
    primary_data['tokens'] = []
    primary_data['ner_tags'] = []
    
#     fname = DATA_DIR + fi #'head.txt' # to test with 2 sentences only.
    
    sentence_id = 0
    with open(fname, "r") as f:
        rd = csv.reader(f, delimiter='\t')
        
        is_blank_after_docstart = False
        tmp_words = []
        tmp_ners = []
        for row in rd:
            if is_blank_after_docstart:
                is_blank_after_docstart = False
                continue
            elif not row:
                continue
            elif re.findall('DOCSTART', row[0]):
                is_blank_after_docstart = True
                continue
            elif row[0] == '.' and row[1] == '.': # currently doesn't include periods.

                primary_data['id'].extend([sentence_id])
                primary_data['tokens'].extend([tmp_words])
                primary_data['ner_tags'].extend([tmp_ners])

                
                sentence_id += 1
                tmp_words = []
                tmp_ners = []
                continue
                
            tmp_words += [row[0]]
            tmp_ners += [label_list.index(row[3])]
    return primary_data

In [5]:
train_primary: OrderedDict = txt2primary(DATA_FILES['train'])
val_primary: OrderedDict = txt2primary(DATA_FILES['val'])
test_primary: OrderedDict = txt2primary(DATA_FILES['test'])

In [6]:
def get_entry(i, primary_data: OrderedDict) -> dict:
    out = {
        'id': primary_data['id'][i],
        'tokens': primary_data['tokens'][i],
        'ner_tags': primary_data['ner_tags'][i]
    }
    return out

example = get_entry(0, train_primary)
example

{'id': 0,
 'tokens': ['The',
  'enzyme',
  'cyclo-oxygenase',
  'catalyses',
  'the',
  'oxygenation',
  'of',
  'arachidonic',
  'acid',
  ',',
  'leading',
  'to',
  'the',
  'formation',
  'of',
  'prostaglandins'],
 'ner_tags': [0, 0, 1, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 0, 4]}

## 1.2-3 Tokenize the Primary Data

We then tokenize the primary data to get their encodings, and create a Dataset object.

### Verifying the tokenizer, based on the reference notebook

In [7]:
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

['[CLS]', 'the', 'enzyme', 'cyclo', '-', 'oxygen', '##ase', 'cataly', '##ses', 'the', 'oxygenation', 'of', 'arachid', '##onic', 'acid', ',', 'leading', 'to', 'the', 'formation', 'of', 'prostaglandin', '##s', '[SEP]']


In [8]:
len(example[f"ner_tags"]), len(tokenized_input["input_ids"])

(16, 24)

In [9]:
word_ids = tokenized_input.word_ids()
aligned_labels = [-100 if i is None else example[f"ner_tags"][i] for i in word_ids]
print(len(aligned_labels), len(tokenized_input["input_ids"]))

24 24


### Define and apply the tokenization function.

In [10]:
label_all_tokens = True
def tokenize_and_align_labels(primary) -> transformers.tokenization_utils_base.BatchEncoding: # basically dict
    tokenized_inputs = tokenizer(primary["tokens"], 
                                 padding=True, 
                                 truncation=True, 
                                 is_split_into_words=True,
                                 return_token_type_ids=False
                                )

    labels = []
    for i, label in enumerate(primary[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)
        

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [11]:
# create tokenized inputs
train_tokenized = tokenize_and_align_labels(train_primary)
test_tokenized = tokenize_and_align_labels(test_primary)
val_tokenized = tokenize_and_align_labels(val_primary)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


## 1.3-4 Create Dataset Objects

In [12]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
        self.labels = encodings['labels']

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = list(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [13]:
train_data = Dataset(train_tokenized)
test_data = Dataset(test_tokenized)
val_data = Dataset(val_tokenized)

# 2. Fine Tuning + Training the Model

Overview of fine tuning and training:
1. Metrics function
2. Training
3. Testing

## 2.1 Metrics Function

In [14]:
import datasets
metric = datasets.load_metric("seqeval")
labels = [label_list[i] for i in example[f"ner_tags"]]
metric.compute(predictions=[labels], references=[labels])

{'PRODUCT-OF': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'SUBSTRATE': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'enzyme': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [15]:
def compute_metrics(p) -> dict:
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    
    return results

## 2.2 Training

Defining necessary functions as args.

In [16]:
# Model_init for hyperparameter search 
# ref: https://huggingface.co/blog/ray-tune
def model_init():
    return transformers.AutoModelForTokenClassification.from_pretrained('allenai/scibert_scivocab_uncased', num_labels=len(label_list))

In [23]:
training_args = transformers.TrainingArguments(
    f"test-ner",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=15,
    weight_decay=0.01,
)

trainer = transformers.Trainer(
    args=training_args,                  # training arguments, defined above
    train_dataset=train_data,         # training dataset
    tokenizer=tokenizer,
    data_collator=transformers.DataCollatorForTokenClassification(tokenizer),
    eval_dataset=val_data,             # evaluation dataset
    compute_metrics=compute_metrics,
    model_init=model_init,
#     model=model
)

trainer.train()

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initi

Epoch,Training Loss,Validation Loss,Product-of,Substrate,Enzyme,Overall Precision,Overall Recall,Overall F1,Overall Accuracy,Runtime,Samples Per Second
1,0.2605,0.123251,"{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 48}","{'precision': 0.34615384615384615, 'recall': 0.11538461538461539, 'f1': 0.17307692307692307, 'number': 156}","{'precision': 0.6, 'recall': 0.05172413793103448, 'f1': 0.09523809523809525, 'number': 174}",0.397059,0.071429,0.121076,0.952054,28.4268,10.237
2,0.1208,0.181756,"{'precision': 0.7222222222222222, 'recall': 0.2708333333333333, 'f1': 0.39393939393939387, 'number': 48}","{'precision': 0.6739130434782609, 'recall': 0.1987179487179487, 'f1': 0.306930693069307, 'number': 156}","{'precision': 0.6666666666666666, 'recall': 0.2988505747126437, 'f1': 0.4126984126984128, 'number': 174}",0.676056,0.253968,0.369231,0.960374,25.6538,11.343
3,0.0745,0.249023,"{'precision': 0.6363636363636364, 'recall': 0.14583333333333334, 'f1': 0.23728813559322035, 'number': 48}","{'precision': 0.6923076923076923, 'recall': 0.11538461538461539, 'f1': 0.19780219780219782, 'number': 156}","{'precision': 0.66, 'recall': 0.1896551724137931, 'f1': 0.29464285714285715, 'number': 174}",0.666667,0.153439,0.249462,0.956734,24.9944,11.643
4,0.0503,0.190633,"{'precision': 0.6896551724137931, 'recall': 0.4166666666666667, 'f1': 0.5194805194805195, 'number': 48}","{'precision': 0.5916666666666667, 'recall': 0.4551282051282051, 'f1': 0.5144927536231884, 'number': 156}","{'precision': 0.6329113924050633, 'recall': 0.28735632183908044, 'f1': 0.3952569169960474, 'number': 174}",0.618421,0.373016,0.465347,0.961726,25.5699,11.381
5,0.0177,0.22395,"{'precision': 0.7333333333333333, 'recall': 0.4583333333333333, 'f1': 0.5641025641025641, 'number': 48}","{'precision': 0.363013698630137, 'recall': 0.33974358974358976, 'f1': 0.3509933774834437, 'number': 156}","{'precision': 0.46308724832214765, 'recall': 0.39655172413793105, 'f1': 0.42724458204334365, 'number': 174}",0.443077,0.380952,0.409673,0.954342,28.5587,10.19
6,0.0181,0.241416,"{'precision': 0.7666666666666667, 'recall': 0.4791666666666667, 'f1': 0.5897435897435898, 'number': 48}","{'precision': 0.5803571428571429, 'recall': 0.4166666666666667, 'f1': 0.4850746268656717, 'number': 156}","{'precision': 0.5754716981132075, 'recall': 0.3505747126436782, 'f1': 0.43571428571428567, 'number': 174}",0.600806,0.39418,0.476038,0.963495,30.6885,9.482
7,0.0071,0.277382,"{'precision': 0.7857142857142857, 'recall': 0.4583333333333333, 'f1': 0.5789473684210527, 'number': 48}","{'precision': 0.4215686274509804, 'recall': 0.27564102564102566, 'f1': 0.33333333333333337, 'number': 156}","{'precision': 0.6082474226804123, 'recall': 0.3390804597701149, 'f1': 0.4354243542435424, 'number': 174}",0.546256,0.328042,0.409917,0.960166,28.022,10.385
8,0.0118,0.298094,"{'precision': 0.8333333333333334, 'recall': 0.4166666666666667, 'f1': 0.5555555555555556, 'number': 48}","{'precision': 0.7391304347826086, 'recall': 0.10897435897435898, 'f1': 0.18994413407821228, 'number': 156}","{'precision': 0.6701030927835051, 'recall': 0.3735632183908046, 'f1': 0.4797047970479705, 'number': 174}",0.708333,0.269841,0.390805,0.96079,27.0352,10.764
9,0.0034,0.24934,"{'precision': 0.48, 'recall': 0.5, 'f1': 0.4897959183673469, 'number': 48}","{'precision': 0.625, 'recall': 0.5128205128205128, 'f1': 0.5633802816901409, 'number': 156}","{'precision': 0.5029585798816568, 'recall': 0.4885057471264368, 'f1': 0.4956268221574344, 'number': 174}",0.544669,0.5,0.521379,0.961102,25.8943,11.238
10,0.0028,0.259408,"{'precision': 0.5102040816326531, 'recall': 0.5208333333333334, 'f1': 0.5154639175257733, 'number': 48}","{'precision': 0.36893203883495146, 'recall': 0.24358974358974358, 'f1': 0.29343629343629346, 'number': 156}","{'precision': 0.46774193548387094, 'recall': 0.5, 'f1': 0.4833333333333333, 'number': 174}",0.443787,0.396825,0.418994,0.953926,29.1033,9.999


Trainer is attempting to log a value of "{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 48}" of type <class 'dict'> for key "eval/PRODUCT-OF" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.34615384615384615, 'recall': 0.11538461538461539, 'f1': 0.17307692307692307, 'number': 156}" of type <class 'dict'> for key "eval/SUBSTRATE" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.6, 'recall': 0.05172413793103448, 'f1': 0.09523809523809525, 'number': 174}" of type <class 'dict'> for key "eval/enzyme" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.7222222222222222, 'recall': 0.2708333333333333, 'f1': 0.39393939393939387, 'number': 48}"

TrainOutput(global_step=12870, training_loss=0.03518363530269991, metrics={'train_runtime': 6885.8891, 'train_samples_per_second': 1.869, 'total_flos': 1232636236860600.0, 'epoch': 15.0, 'init_mem_cpu_alloc_delta': 695462, 'init_mem_cpu_peaked_delta': 513666, 'train_mem_cpu_alloc_delta': 1238876, 'train_mem_cpu_peaked_delta': 105189103})

### 2.3 Testing

Log your results here: https://docs.google.com/spreadsheets/d/1jolvSI9tCqHZqBMtX1MAUjht2WuXyl_uFauhbvHMUtQ/edit?usp=sharing

In [24]:
predictions = trainer.predict(test_data)

In [25]:
compute_metrics(predictions[0:2])

{'PRODUCT-OF': {'precision': 0.125,
  'recall': 0.125,
  'f1': 0.125,
  'number': 32},
 'SUBSTRATE': {'precision': 0.7261904761904762,
  'recall': 0.5,
  'f1': 0.5922330097087378,
  'number': 122},
 'enzyme': {'precision': 0.6054421768707483,
  'recall': 0.4564102564102564,
  'f1': 0.52046783625731,
  'number': 195},
 'overall_precision': 0.5855513307984791,
 'overall_recall': 0.44126074498567336,
 'overall_f1': 0.5032679738562091,
 'overall_accuracy': 0.9629198008263588}

### 2.4 Hyperparameter search??

In [None]:
#trainer.hyperparameter_search(direction="maximize")

In [None]:
# from torch.utils.data import DataLoader

# device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# model.train()

# torch.manual_seed(10)
# BATCH_SIZE = 64
# train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=False, drop_last=True)

# optim = transformers.AdamW(model.parameters(), lr=5e-5)

# for epoch in range(3):
#     for i, batch in enumerate(train_loader):
#         print(f'Doing epoch {epoch}, entries {i*BATCH_SIZE} to {(i+1)*BATCH_SIZE} out of {len(train_loader)}')
#         optim.zero_grad()
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         labels = batch['labels'].to(device)
#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs[0]
#         loss.backward()
#         optim.step()

# model.eval()