In [1]:
import re
import os
import sys
# Put the parent directory on the import path. Presumably this is where the
# local `mlpack` package imported in the following cells lives — TODO confirm.
sys.path.append('../')

In [2]:
from transformers.tokenization_bert import BertTokenizer

In [3]:
from mlpack.datasets.biocreative import read_examples, convert_examples_to_spanfeatures, LABELS

In [4]:
from mlpack.bert.ner.dataset import InputFeaturesCollectionExtended
from mlpack.bert.ner.model import BertForSpanNERClassification, BertForNERClassification
from mlpack.bert.ner.utils import to_device

# Examples

In [5]:
# Relative path of the folder containing the GNormPlus corpus files; it is
# joined with a file name from `ds_files` when the training data is loaded.
folderpath = '../datasets/GNormPlus/GNormPlusCorpus/'

In [6]:
# Corpus file name for each dataset split, keyed by split name.
# NOTE(review): the 'dev' split points at the BC2GN *test* file — confirm this
# is intentional.
ds_files = dict(
    train='BC2GNtrain.PubTator.txt',
    dev='BC2GNtest.PubTator.txt',
)

In [7]:
# Build the path of the training file inside the corpus folder, then load it
# with read_examples.
train_file = os.path.join(folderpath, ds_files['train'])
examples_train = read_examples(train_file)

In [8]:
# Pick the first training example; its tokens and labels are printed in the
# next cell.
example = examples_train[0]

In [9]:
# Print each token of the selected example next to its label, one pair per line.
# The token column has a minimum width of 15 characters; longer tokens are not
# truncated, so their label is pushed further to the right.
for token, tag in zip(example.doc_tokens, example.labels):
    print(f'{token:15} {tag}')

A               O
specific        O
human           O
lysophospholipase B-FamilyName
:               O
cDNA            O
cloning         O
,               O
tissue          O
distribution    O
and             O
kinetic         O
characterization O
.               O
Lysophospholipases B-FamilyName
are             O
critical        O
enzymes         O
that            O
act             O
on              O
biological      O
membranes       O
to              O
regulate        O
the             O
multifunctional O
lysophospholipids O
;               O
increased       O
levels          O
of              O
lysophospholipids O
are             O
associated      O
with            O
a               O
host            O
of              O
diseases        O
.               O
Herein          O
we              O
report          O
the             O
cDNA            O
cloning         O
of              O
a               O
human           O
brain           O
25              O
kDa             O
lysophospholip

# Features

In [10]:
# Load the tokenizer for the 'bert-base-uncased' checkpoint, the same
# checkpoint name the model is loaded from further down.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [11]:
# Convert the training examples into span features using the label set and
# the tokenizer loaded above.
# NOTE(review): 512, 256 and True are passed positionally; their meaning is
# defined by convert_examples_to_spanfeatures (presumably sequence length,
# stride and a training flag) — confirm against mlpack.datasets.biocreative.
features_train = convert_examples_to_spanfeatures(
    examples_train,
    LABELS,
    512,
    tokenizer,
    256,
    True,
)

In [12]:
# Display the features built for the first example; the recorded output is a
# list holding a single InputSpanFeatures object.
features_train[0]

[<mlpack.datasets.biocreative.InputSpanFeatures at 0x7fc2e5890e10>]

In [13]:
# Wrap the first example's feature list in a collection object; its `.batch`
# attribute is unpacked in the next cell.
feat_collection = InputFeaturesCollectionExtended(features_train[0])

In [14]:
# Unpack the five components exposed by the collection's `batch` attribute.
(
    input_ids,
    input_masks,
    label_ids,
    label_mask,
    max_context,
) = feat_collection.batch

In [15]:
# Pass all five batch components through to_device and rebind the same names
# to whatever it returns.
device_batch = to_device(input_ids, input_masks, label_ids, label_mask, max_context)
input_ids, input_masks, label_ids, label_mask, max_context = device_batch

# Model

In [16]:
# Labels left out of the classification label set built below.
_EXCLUDED_LABELS = ("[PAD]", "[CLS]", "[SEP]", "X")

# Every entry of LABELS, in its original order, except the excluded ones.
NER_LABELS = [tag for tag in LABELS if tag not in _EXCLUDED_LABELS]

In [17]:
# Instantiate the span-NER classifier from the 'bert-base-uncased' checkpoint.
# The number of output labels equals the size of NER_LABELS; the remaining
# keyword arguments are forwarded unchanged to from_pretrained.
model = BertForSpanNERClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(NER_LABELS),
    output_hidden_states=True,
    pooler='concat',
)

In [28]:
# Run the model once on the prepared batch, passing the label tensors along
# with the inputs; the recorded result is a tuple that starts with a loss.
# NOTE(review): the execution count jumps from 17 to 28 at this cell, so cells
# were re-run or removed in between — confirm the notebook still reproduces
# under Restart Kernel -> Run All.
out = model(input_ids, input_masks, label_ids, label_mask, max_context)

In [29]:
# Display the raw model output: a tuple beginning with the scalar loss,
# followed by tensors.
# NOTE(review): this dumps whole tensors into the notebook; consider showing
# only the loss and the tensor shapes to keep the output readable.
out

(tensor(1.9673, grad_fn=<NllLossBackward>),
 tensor([[-0.7097, -0.2071,  0.8138,  ...,  0.3340,  0.4709, -0.3029],
         [-0.6222,  0.2292,  0.5387,  ..., -0.0971,  0.9337,  0.2070],
         [-0.8403,  0.5457,  0.5237,  ...,  0.3539, -0.0242, -0.0825],
         ...,
         [-0.2167,  0.0692,  0.6263,  ..., -0.2732,  0.7643, -1.1343],
         [-0.2137, -0.3081, -0.3732,  ...,  0.6649,  0.5804, -0.2463],
         [ 0.2744, -0.0453, -0.2288,  ...,  0.0409,  0.2530, -0.0031]],
        grad_fn=<IndexBackward>),
 tensor([1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 2, 1, 3, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1