In [7]:
import re
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch
from transformers import BertForTokenClassification, BertTokenizer, BertTokenizerFast

from tokenizers import decoders

In [8]:
# Load the fine-tuned BERT token-classification checkpoint identified by this relative name.
# NOTE(review): the predictions printed in this notebook use generic 'LABEL_n' names from
# model.config.id2label, so the saved config apparently carries no custom label names.
model = BertForTokenClassification.from_pretrained("bert_ner_finetuned_iliad-with-gpu-pattern2.model")

In [9]:
# Fast tokenizer for the stock "bert-base-cased" vocabulary.
# presumably the same vocabulary the checkpoint was fine-tuned with — TODO confirm
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

In [10]:
# BILOU-style tag names for the single entity type 'CLEntity'.
# NOTE(review): not referenced by any cell visible here; presumably position i corresponds to
# the model's 'LABEL_i' output — TODO confirm against the training code.
label_list = ['O', 'B-CLEntity', 'I-CLEntity', 'L-CLEntity', 'U-CLEntity']

In [11]:
sequence = """The man for wisdom’s various arts renown’d,
Long exercised in woes, O Muse! resound;
Who, when his arms had wrought the destined fall
Of sacred Troy, and razed her heaven-built wall,
Wandering from clime to clime, observant stray’d,
Their manners noted, and their states survey’d,
On stormy seas unnumber’d toils he bore,
Safe with his friends to gain his natal shore:
Vain toils! their impious folly dared to prey
On herds devoted to the god of day;
The god vindictive doom’d them never more
(Ah, men unbless’d!) to touch that natal shore.
Oh, snatch some portion of these acts from fate,
that's Celestial Muse! and to our world relate.
"""

In [12]:
# Token strings for the encoded passage, including the special tokens encode() adds, read
# straight from the ids so they line up one-to-one with the model's per-token predictions.
# (Decoding the ids back to text and tokenizing that text again is redundant and is not
# guaranteed to reproduce the same pieces.)
tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence))

In [15]:
tokens[:10]

['[CLS]', 'The', 'man', 'for', 'wisdom', '’', 's', 'various', 'arts', 're']

In [16]:
# Encode the whole passage as one PyTorch tensor of token ids with a leading batch axis
# (hence the inputs[0] indexing when inspecting it).
inputs = tokenizer.encode(sequence, return_tensors="pt")

In [18]:
inputs[0][:10]

tensor([  101,  1109,  1299,  1111, 12304,   787,   188,  1672,  3959,  1231])

In [19]:
# Forward pass; .logits holds one score per label for every token.
# NOTE(review): this runs with autograd enabled (the displayed tensor carries a grad_fn);
# for pure inference consider wrapping the call in torch.no_grad().
outputs = model(inputs).logits

In [26]:
outputs[0][:10]

tensor([[ 8.9492, -2.2200, -2.9792, -2.1944, -2.7990],
        [-1.0060,  0.3633, -3.5774, -1.9558,  6.7369],
        [ 8.5389, -2.9141, -3.1272, -1.7843, -2.3615],
        [ 9.0322, -2.8466, -2.8821, -2.0914, -2.9410],
        [ 9.0151, -2.5689, -2.9633, -2.2023, -2.8106],
        [ 9.2213, -2.5399, -2.8564, -2.3591, -2.9243],
        [ 9.4320, -2.7111, -2.9689, -2.3696, -3.0193],
        [ 9.2810, -2.6317, -2.8026, -2.3994, -2.9145],
        [ 9.2640, -2.6496, -2.7412, -2.2737, -2.9194],
        [ 9.3333, -2.7478, -2.7948, -2.3169, -3.1223]],
       grad_fn=<SliceBackward>)

In [23]:
# Highest-scoring label id per token: dim=2 is the label axis of the (batch, token, label) logits.
predictions = torch.argmax(outputs, dim=2)

In [25]:
predictions[0][:10]

tensor([0, 4, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
len(tokens)

In [27]:
# Show every token next to the label name the model predicted for it.
id2label = model.config.id2label
for token, prediction in zip(tokens, predictions[0].numpy()):
    print((token, id2label[prediction]))

('[CLS]', 'LABEL_0')
('The', 'LABEL_4')
('man', 'LABEL_0')
('for', 'LABEL_0')
('wisdom', 'LABEL_0')
('’', 'LABEL_0')
('s', 'LABEL_0')
('various', 'LABEL_0')
('arts', 'LABEL_0')
('re', 'LABEL_0')
('##no', 'LABEL_0')
('##wn', 'LABEL_0')
('’', 'LABEL_0')
('d', 'LABEL_0')
(',', 'LABEL_0')
('Long', 'LABEL_4')
('exercised', 'LABEL_0')
('in', 'LABEL_0')
('w', 'LABEL_0')
('##oes', 'LABEL_0')
(',', 'LABEL_0')
('O', 'LABEL_0')
('Muse', 'LABEL_4')
('!', 'LABEL_0')
('re', 'LABEL_0')
('##sound', 'LABEL_0')
(';', 'LABEL_0')
('Who', 'LABEL_4')
(',', 'LABEL_0')
('when', 'LABEL_0')
('his', 'LABEL_0')
('arms', 'LABEL_0')
('had', 'LABEL_0')
('wrought', 'LABEL_0')
('the', 'LABEL_0')
('destined', 'LABEL_0')
('fall', 'LABEL_0')
('Of', 'LABEL_4')
('sacred', 'LABEL_0')
('Troy', 'LABEL_4')
(',', 'LABEL_0')
('and', 'LABEL_0')
('r', 'LABEL_0')
('##azed', 'LABEL_0')
('her', 'LABEL_0')
('heaven', 'LABEL_0')
('-', 'LABEL_0')
('built', 'LABEL_0')
('wall', 'LABEL_0')
(',', 'LABEL_0')
('Wan', 'LABEL_1')
('##dering', '

In [31]:
# Predicted label names for the passage, wrapped in an outer list so that `pred`
# is a list of per-sequence label lists (here a single sequence).
pred_line_label = [model.config.id2label[label_id] for label_id in predictions[0].numpy()]
pred = [pred_line_label]
print(pred[0])

['LABEL_0', 'LABEL_4', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_4', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_4', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_4', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_4', 'LABEL_0', 'LABEL_4', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_1', 'LABEL_3', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_4', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_4', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0'

In [32]:
# One or more whitespace-separated words that each start with an ASCII capital letter followed
# by lowercase letters, e.g. "Troy" or "Celestial Muse". Note that \s also matches a newline.
# The compiled regex is kept (the original compiled it and threw the result away).
pattern = re.compile(r'(\b[A-Z][a-z]+\b)(\s\b[A-Z][a-z]+\b)*')


def get_annotations(text, pattern):
    """Return one 'CLEntity' annotation per regex match in ``text``.

    Args:
        text: String to scan.
        pattern: Regular expression, given either as a pattern string or as a
            compiled pattern object.

    Returns:
        A list of dicts in match order. Each dict has the keys 'start' and 'end'
        (character offsets of the match in ``text``; 'end' is exclusive) and
        'label', which is always 'CLEntity'. The list is empty when nothing matches.
    """
    return [
        {
            'start': match.start(),
            'end': match.end(),
            'label': 'CLEntity',  # Entity starting with a capital letter
        }
        for match in re.finditer(pattern, text)
    ]

In [33]:
# Regex-derived annotations (character spans labelled 'CLEntity') for the sample passage.
ex_annotations = get_annotations(sequence, pattern)

In [34]:
# Try an example: align the regex annotations with the tokenizer's output.
from transformers import BertTokenizerFast, BatchEncoding
from tokenizers import Encoding
from alignment import align_tokens_and_annotations_bilou

# Pair the passage with its character-span annotations.
example = {'content': sequence, 'annotations': ex_annotations}
# NOTE(review): loads the same 'bert-base-cased' tokenizer that `tokenizer` already holds.
tokenizer_ex = BertTokenizerFast.from_pretrained('bert-base-cased') # Load a pre-trained tokenizer
tokenized_batch_ex : BatchEncoding = tokenizer_ex(example["content"])
# Item 0 of the batch encoding, i.e. the encoding of the single passage passed in above.
tokenized_text : Encoding = tokenized_batch_ex[0]
# NOTE(review): project-local helper; presumably returns one BILOU tag per token — confirm in
# the alignment module. `labels` is not used by any other cell visible here.
labels = align_tokens_and_annotations_bilou(tokenized_text, example["annotations"])


In [35]:
from labelset import LabelSet

# Label set with a single entity type, used to turn the passage's annotations into
# label ids aligned with the tokens of the encoded passage.
ex_label_set = LabelSet(labels=["CLEntity"])
reference_annotations = example["annotations"]
aligned_label_ids = ex_label_set.get_aligned_label_ids_from_annotations(
    tokenized_text, reference_annotations
)

# Rebinds `tokens` to the token strings of this encoding.
tokens = tokenized_text.tokens

# Print each token beside its reference label id.
for token, label in zip(tokens, aligned_label_ids):
    print(token, "-", label)

[CLS] - 0
The - 4
man - 0
for - 0
wisdom - 0
’ - 0
s - 0
various - 0
arts - 0
re - 0
##no - 0
##wn - 0
’ - 0
d - 0
, - 0
Long - 4
exercised - 0
in - 0
w - 0
##oes - 0
, - 0
O - 0
Muse - 4
! - 0
re - 0
##sound - 0
; - 0
Who - 4
, - 0
when - 0
his - 0
arms - 0
had - 0
wrought - 0
the - 0
destined - 0
fall - 0
Of - 4
sacred - 0
Troy - 4
, - 0
and - 0
r - 0
##azed - 0
her - 0
heaven - 0
- - 0
built - 0
wall - 0
, - 0
Wan - 1
##dering - 3
from - 0
c - 0
##lim - 0
##e - 0
to - 0
c - 0
##lim - 0
##e - 0
, - 0
o - 0
##bs - 0
##er - 0
##vant - 0
stray - 0
’ - 0
d - 0
, - 0
Their - 4
manners - 0
noted - 0
, - 0
and - 0
their - 0
states - 0
survey - 0
’ - 0
d - 0
, - 0
On - 4
storm - 0
##y - 0
seas - 0
un - 0
##num - 0
##ber - 0
’ - 0
d - 0
to - 0
##ils - 0
he - 0
bore - 0
, - 0
Safe - 4
with - 0
his - 0
friends - 0
to - 0
gain - 0
his - 0
na - 0
##tal - 0
shore - 0
: - 0
V - 1
##ain - 3
to - 0
##ils - 0
! - 0
their - 0
imp - 0
##ious - 0
f - 0
##olly - 0
dared - 0
to - 0
prey - 0
On - 4
herd - 0

In [36]:
# Reference labels rendered as 'LABEL_<id>' strings and wrapped in an outer list,
# i.e. the same naming and nesting that `pred` uses, so the two can be compared.
true_line_label = ['LABEL_' + str(label_id) for label_id in aligned_label_ids]
true = [true_line_label]
print(true[0][:10])

['LABEL_0', 'LABEL_4', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0']


In [37]:
# seqeval is a third-party package and must be installed in the kernel's environment;
# without it this cell fails at import time with ModuleNotFoundError.
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

# classification_report returns the report as a single multi-line string; print it so the
# table is rendered with real line breaks instead of being shown as an escaped string repr.
print(classification_report(true, pred))

ModuleNotFoundError: No module named 'seqeval'

In [None]:
## https://huggingface.co/docs/tokenizers/python/latest/pipeline.html
# tokenizer.decoder = decoders.WordPiece()
# tokenizer.decode(outputs.ids)

In [None]:
# Read the text into a list with one whitespace-stripped entry per line of the file.
# The context manager closes the file handle as soon as reading finishes; the bare
# open() call used before never closed it.
with open("../example-texts/odyssey.txt") as book:
    odyssey_lines = [line.strip() for line in book]

In [None]:
# Drop empty strings (blank lines) so only lines that contain text remain.
odyssey_lines = list(filter(None, odyssey_lines))

In [None]:
odyssey_lines[:10]

In [None]:
## https://huggingface.co/docs/tokenizers/python/latest/pipeline.html
# tokenizer.decoder = decoders.WordPiece()
# tokenizer.decode(outputs.ids)

In [None]:
# Tag every line of the text and keep a (tokens, predicted label ids) pair per line.
# The earlier version overwrote its results on each iteration, so only the last line's
# values survived the loop.
odyssey_predictions = []
with torch.no_grad():  # inference only: do not build the autograd graph
    for line in odyssey_lines:
        inputs_line = tokenizer.encode(line, return_tensors="pt")
        # Token strings come straight from the ids so they align with the predictions.
        tokens_line = tokenizer.convert_ids_to_tokens(inputs_line[0].tolist())

        outputs_line = model(inputs_line).logits
        predictions_line = torch.argmax(outputs_line, dim=2)
        odyssey_predictions.append((tokens_line, predictions_line[0].tolist()))

In [None]:
# Build one record per line of the text: the stripped line plus its regex-derived
# annotations. Blank lines are kept; they simply get an empty annotation list.
json_data = []
with open("../example-texts/odyssey.txt") as book:  # handle is closed when the block exits
    for line in book:
        line = line.strip()
        line_data = {
            'content': line,
            'annotations': get_annotations(line, pattern),
        }
        json_data.append(line_data)

In [None]:
# For testing the result. Need a way to test whether tuned bert is accurately identifying CLEntities
json_data[:10] 

In [None]:
from transformers import BertTokenizerFast
