In [2]:
from typing_extensions import TypedDict
from typing import List,Any
# Alias for a list of token ids.
IntList = List[int]
# Alias for a list of token-id lists, e.g. a batch.
IntListList = List[IntList]

In [20]:
import re
# An "entity" here is simply any run of ASCII letters that starts with a capital.
pattern = r'[A-Z][A-Za-z]*'
# Bind the compiled regex so the compilation is actually used by the search below
# (calling re.compile without keeping the result has no effect).
compiled_pattern = re.compile(pattern)

text = """and the early
inscriptions are rude and unskilfully executed; nor can we even assure
ourselves whether Archilochus, Simonides of Amorgus, Kallinus,
Tyrtaeus, Xanthus, and the other early elegiac and lyric poets,
committed their compositions to writing, or at what time the practice
of doing so became familiar."""

# One record per match, holding character offsets into `text`.
# 'end' is exclusive, so text[start:end] reproduces the matched string.
annotations = []
for match in compiled_pattern.finditer(text):
    label_dic = {
        'start': match.start(),
        'end': match.end(),
        'text': match.group(),
        'label': 'CL-Entity',  # Entity starting with a capital letter
    }
    annotations.append(label_dic)
print(annotations)


[{'start': 103, 'end': 114, 'text': 'Archilochus', 'label': 'CL-Entity'}, {'start': 116, 'end': 125, 'text': 'Simonides', 'label': 'CL-Entity'}, {'start': 129, 'end': 136, 'text': 'Amorgus', 'label': 'CL-Entity'}, {'start': 138, 'end': 146, 'text': 'Kallinus', 'label': 'CL-Entity'}, {'start': 148, 'end': 156, 'text': 'Tyrtaeus', 'label': 'CL-Entity'}, {'start': 158, 'end': 165, 'text': 'Xanthus', 'label': 'CL-Entity'}]


In [21]:
def align_tokens_and_annotations_bilou(tokenized: "Encoding", annotations) -> List[str]:
    """Project character-span annotations onto tokens as BILOU labels.

    The ``Encoding`` annotation is written as a string so that defining this
    function does not require ``Encoding`` to be imported yet.

    Args:
        tokenized: Object exposing ``tokens`` (a sequence with one entry per
            token) and ``char_to_token(char_ix)``, which returns a token index
            or ``None``.
        annotations: Iterable of dicts with the keys ``"start"`` and ``"end"``
            (character offsets, end exclusive) and ``"label"``.

    Returns:
        A list as long as ``tokenized.tokens``. Tokens outside every
        annotation are ``"O"``; tokens inside one are ``"<prefix>-<label>"``
        where the prefix is ``U`` (the annotation covers a single token),
        ``B`` (first token), ``I`` (inner token) or ``L`` (last token).
        If annotations overlap, the later one overwrites the earlier labels.
    """
    aligned_labels = ["O"] * len(tokenized.tokens)

    for anno in annotations:
        # Every character of the span votes for its token; the set collapses
        # the duplicates produced by multi-character tokens.
        token_ixs = {
            tokenized.char_to_token(char_ix)
            for char_ix in range(anno["start"], anno["end"])
        }
        # Characters that map to no token come back as None and are ignored.
        token_ixs.discard(None)
        ordered_ixs = sorted(token_ixs)

        if not ordered_ixs:
            # The span touched no token at all: nothing to label.
            continue

        label = anno["label"]
        if len(ordered_ixs) == 1:
            # Single-token annotation: U for "unit".
            aligned_labels[ordered_ixs[0]] = f"U-{label}"
            continue

        # Multi-token annotation: B on the first token, L on the last,
        # I on everything in between.
        aligned_labels[ordered_ixs[0]] = f"B-{label}"
        for token_ix in ordered_ixs[1:-1]:
            aligned_labels[token_ix] = f"I-{label}"
        aligned_labels[ordered_ixs[-1]] = f"L-{label}"

    return aligned_labels

In [22]:
from transformers import BertTokenizerFast,  BatchEncoding
from tokenizers import Encoding
# Load the pre-trained cased BERT tokenizer.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

# Tokenize the sample text, then take element 0 of the batch for alignment.
tokenized_batch: BatchEncoding = tokenizer(text)
tokenized_text: Encoding = tokenized_batch[0]

# Turn the character-level annotations into one BILOU label per token.
labels = align_tokens_and_annotations_bilou(tokenized_text, annotations)

# Show each token next to the label it received.
for token, label in zip(tokenized_text.tokens, labels):
    print(f"{token} - {label}")

[CLS] - O
and - O
the - O
early - O
inscriptions - O
are - O
rude - O
and - O
un - O
##ski - O
##lf - O
##ully - O
executed - O
; - O
nor - O
can - O
we - O
even - O
assure - O
ourselves - O
whether - O
Arch - B-CL-Entity
##ilo - I-CL-Entity
##chus - L-CL-Entity
, - O
Simon - B-CL-Entity
##ides - L-CL-Entity
of - O
Amor - B-CL-Entity
##gus - L-CL-Entity
, - O
Ka - B-CL-Entity
##llin - I-CL-Entity
##us - L-CL-Entity
, - O
Ty - B-CL-Entity
##rta - I-CL-Entity
##eus - L-CL-Entity
, - O
X - B-CL-Entity
##ant - I-CL-Entity
##hus - L-CL-Entity
, - O
and - O
the - O
other - O
early - O
el - O
##eg - O
##iac - O
and - O
lyric - O
poets - O
, - O
committed - O
their - O
compositions - O
to - O
writing - O
, - O
or - O
at - O
what - O
time - O
the - O
practice - O
of - O
doing - O
so - O
became - O
familiar - O
. - O
[SEP] - O
