In [1]:
import re
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch

from tokenizers import decoders

In [2]:
# Fine-tuned token-classification checkpoint (presumably a local directory — confirm).
MODEL_PATH = "bert_ner_finetuned_iliad-with-gpu.model"
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)

In [3]:
# Tokenizer of the cased BERT base checkpoint.
BASE_CHECKPOINT = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)

In [4]:
# Tag set for the single entity type: 'O' plus one B/I/L/U-prefixed tag each.
# Presumably the list index is the label id used by the model — confirm against training.
label_list = ['O'] + [f'{prefix}-CLEntity' for prefix in ('B', 'I', 'L', 'U')]

In [5]:
# Sample passage used by the cells below; its lines match the first lines of
# ../example-texts/odyssey.txt as displayed later in this notebook.
# NOTE(review): the leading "that's" on the last line does not look like part of the
# verse — confirm whether it is intentional test input.
sequence = """The man for wisdom’s various arts renown’d,
Long exercised in woes, O Muse! resound;
Who, when his arms had wrought the destined fall
Of sacred Troy, and razed her heaven-built wall,
Wandering from clime to clime, observant stray’d,
Their manners noted, and their states survey’d,
On stormy seas unnumber’d toils he bore,
Safe with his friends to gain his natal shore:
Vain toils! their impious folly dared to prey
On herds devoted to the god of day;
The god vindictive doom’d them never more
(Ah, men unbless’d!) to touch that natal shore.
Oh, snatch some portion of these acts from fate,
that's Celestial Muse! and to our world relate.
"""

In [6]:
# Encode once, then take the token strings from the very ids that are fed to the
# model, so tokens[i] always lines up with predictions[0][i] (a decode -> re-tokenize
# round trip is not guaranteed to reproduce the same pieces).
inputs = tokenizer.encode(sequence, return_tensors="pt")
tokens = tokenizer.convert_ids_to_tokens(inputs[0].tolist())

# Inference only: no need to record the autograd graph.
with torch.no_grad():
    outputs = model(inputs).logits
# Highest-scoring label id per token position (the argmax collapses dim 2).
predictions = torch.argmax(outputs, dim=2)

In [7]:
# Number of word pieces in the sample, special tokens such as [CLS] included.
len(tokens)

185

In [8]:
# Peek at the first ten word pieces.
print(tokens[:10])

['[CLS]', 'The', 'man', 'for', 'wisdom', '’', 's', 'various', 'arts', 're']


In [9]:
# Print each word piece next to the label name the model config gives its predicted id.
# NOTE(review): id2label only carries default LABEL_<n> names here; the ids appear to
# index label_list (e.g. LABEL_4 ~ 'U-CLEntity') — confirm.
predicted_ids = predictions[0].numpy()
for token, prediction in zip(tokens, predicted_ids):
    label_name = model.config.id2label[prediction]
    print((token, label_name))

('[CLS]', 'LABEL_0')
('The', 'LABEL_0')
('man', 'LABEL_0')
('for', 'LABEL_0')
('wisdom', 'LABEL_0')
('’', 'LABEL_0')
('s', 'LABEL_0')
('various', 'LABEL_0')
('arts', 'LABEL_0')
('re', 'LABEL_0')
('##no', 'LABEL_0')
('##wn', 'LABEL_0')
('’', 'LABEL_0')
('d', 'LABEL_0')
(',', 'LABEL_0')
('Long', 'LABEL_4')
('exercised', 'LABEL_0')
('in', 'LABEL_0')
('w', 'LABEL_0')
('##oes', 'LABEL_0')
(',', 'LABEL_0')
('O', 'LABEL_0')
('Muse', 'LABEL_4')
('!', 'LABEL_0')
('re', 'LABEL_0')
('##sound', 'LABEL_0')
(';', 'LABEL_0')
('Who', 'LABEL_4')
(',', 'LABEL_0')
('when', 'LABEL_0')
('his', 'LABEL_0')
('arms', 'LABEL_0')
('had', 'LABEL_0')
('wrought', 'LABEL_0')
('the', 'LABEL_0')
('destined', 'LABEL_0')
('fall', 'LABEL_0')
('Of', 'LABEL_0')
('sacred', 'LABEL_0')
('Troy', 'LABEL_4')
(',', 'LABEL_0')
('and', 'LABEL_0')
('r', 'LABEL_0')
('##azed', 'LABEL_0')
('her', 'LABEL_0')
('heaven', 'LABEL_0')
('-', 'LABEL_0')
('built', 'LABEL_0')
('wall', 'LABEL_0')
(',', 'LABEL_0')
('Wan', 'LABEL_1')
('##dering', '

In [13]:
# Candidate "entity": one capitalised word (an upper-case letter followed by one or
# more lower-case letters), optionally followed by further capitalised words, each
# separated by a single whitespace character -- e.g. "Troy" or "Celestial Muse".
# (An earlier draft, r' ([A-Z].[a-z]+)', required a leading space before the word.)
# Kept as a plain string: re.finditer() compiles and caches it on first use, so no
# separate re.compile() call is needed.
pattern = r'(\b[A-Z][a-z]+\b)(\s\b[A-Z][a-z]+\b)*'

def get_annotations(text, pattern, label='CLEntity'):
    """Return character-offset annotations for every match of ``pattern`` in ``text``.

    Args:
        text: String to scan.
        pattern: Regular expression (string or compiled pattern) handed to
            ``re.finditer``.
        label: Value stored under ``'label'`` in every annotation. Defaults to
            ``'CLEntity'`` (entity starting with a capital letter).

    Returns:
        A list with one dict per match, in order of occurrence, holding
        ``'start'`` (``match.start()``), ``'end'`` (``match.end()``, exclusive)
        and ``'label'``. The list is empty when nothing matches.
    """
    return [
        {'start': match.start(), 'end': match.end(), 'label': label}
        for match in re.finditer(pattern, text)
    ]

In [14]:
# Regex-derived annotations for the sample passage.
ex_annotations = get_annotations(text=sequence, pattern=pattern)

In [15]:
# Try an example: align the regex annotations with the BERT word pieces of the sample.
from transformers import BertTokenizerFast, BatchEncoding
from tokenizers import Encoding
from alignment import align_tokens_and_annotations_bilou

example = dict(content=sequence, annotations=ex_annotations)
tokenizer_ex = BertTokenizerFast.from_pretrained('bert-base-cased') # Load a pre-trained tokenizer
tokenized_batch_ex : BatchEncoding = tokenizer_ex(example["content"])
tokenized_text : Encoding = tokenized_batch_ex[0]  # encoding of the single input text
labels = align_tokens_and_annotations_bilou(
    tokenized_text,
    example["annotations"],
)

# One "<token> - <label>" line per word piece.
for token, label in zip(tokenized_text.tokens, labels):
    print(f"{token} - {label}")

[CLS] - O
The - U-CLEntity
man - O
for - O
wisdom - O
’ - O
s - O
various - O
arts - O
re - O
##no - O
##wn - O
’ - O
d - O
, - O
Long - U-CLEntity
exercised - O
in - O
w - O
##oes - O
, - O
O - O
Muse - U-CLEntity
! - O
re - O
##sound - O
; - O
Who - U-CLEntity
, - O
when - O
his - O
arms - O
had - O
wrought - O
the - O
destined - O
fall - O
Of - U-CLEntity
sacred - O
Troy - U-CLEntity
, - O
and - O
r - O
##azed - O
her - O
heaven - O
- - O
built - O
wall - O
, - O
Wan - B-CLEntity
##dering - L-CLEntity
from - O
c - O
##lim - O
##e - O
to - O
c - O
##lim - O
##e - O
, - O
o - O
##bs - O
##er - O
##vant - O
stray - O
’ - O
d - O
, - O
Their - U-CLEntity
manners - O
noted - O
, - O
and - O
their - O
states - O
survey - O
’ - O
d - O
, - O
On - U-CLEntity
storm - O
##y - O
seas - O
un - O
##num - O
##ber - O
’ - O
d - O
to - O
##ils - O
he - O
bore - O
, - O
Safe - U-CLEntity
with - O
his - O
friends - O
to - O
gain - O
his - O
na - O
##tal - O
shore - O
: - O
V - B-CLEntity
##ain - L-CL

In [16]:
## Disabled experiment: attach a WordPiece decoder to the tokenizer.
## Source: https://huggingface.co/docs/tokenizers/python/latest/pipeline.html
# tokenizer.decoder = decoders.WordPiece()
# tokenizer.decode(outputs.ids)

In [14]:
# Read the book into a list with one whitespace-stripped entry per line
# (blank lines become ''). The context manager closes the file handle as soon
# as reading is finished instead of leaving it open for the kernel's lifetime.
with open("../example-texts/odyssey.txt") as book:
    odyssey_lines = [line.strip() for line in book]

In [18]:
# Drop the empty strings left behind by blank lines.
odyssey_lines = list(filter(None, odyssey_lines))

In [19]:
# Preview the first ten non-empty lines.
odyssey_lines[:10]

['The man for wisdom’s various arts renown’d,',
 'Long exercised in woes, O Muse! resound;',
 'Who, when his arms had wrought the destined fall',
 'Of sacred Troy, and razed her heaven-built wall,',
 'Wandering from clime to clime, observant stray’d,',
 'Their manners noted, and their states survey’d,',
 'On stormy seas unnumber’d toils he bore,',
 'Safe with his friends to gain his natal shore:',
 'Vain toils! their impious folly dared to prey',
 'On herds devoted to the god of day;']

In [32]:
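## Disabled: assigning tokenizer.decoder raised "AttributeError: can't set attribute" (see output below).
## Source: https://huggingface.co/docs/tokenizers/python/latest/pipeline.html
# tokenizer.decoder = decoders.WordPiece()
# tokenizer.decode(outputs.ids)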

AttributeError: can't set attribute

In [1]:
# Tag every line of the book and keep each line's result; previously every iteration
# overwrote the one before, so only the last line's predictions survived the loop.
odyssey_predictions = []  # one (tokens, predicted label ids) pair per line
with torch.no_grad():  # inference only, no autograd graph needed
    for line in odyssey_lines:
        inputs_line = tokenizer.encode(line, return_tensors="pt")
        # Token strings come from the ids given to the model, so they line up
        # one-to-one with the predictions.
        tokens_line = tokenizer.convert_ids_to_tokens(inputs_line[0].tolist())

        outputs_line = model(inputs_line).logits
        predictions_line = torch.argmax(outputs_line, dim=2)
        odyssey_predictions.append((tokens_line, predictions_line[0].tolist()))

NameError: name 'odyssey_lines' is not defined

In [40]:
# Build one {'content', 'annotations'} record per line of the book.
json_data = []
# The context manager closes the file once the loop is done.
with open("../example-texts/odyssey.txt") as book:
    for line in book:
        line = line.strip()

        # NOTE(review): blank lines are kept and produce records with empty content
        # and no annotations — confirm this is intended.
        line_data = {
            'content': line,
            'annotations': get_annotations(line, pattern),
        }
        json_data.append(line_data)

In [42]:
# Sanity check: inspect the first ten records.
# TODO: need a way to test whether the fine-tuned BERT model accurately identifies CLEntities.
# NOTE(review): the offsets recorded below include the space in front of each match
# (e.g. start 9 for 'Troy') and skip line-initial words, which the current `pattern`
# would not do — presumably stale output from an earlier pattern; re-run to refresh.
json_data[:10] 

[{'content': '', 'annotations': []},
 {'content': '', 'annotations': []},
 {'content': 'The man for wisdom’s various arts renown’d,', 'annotations': []},
 {'content': 'Long exercised in woes, O Muse! resound;',
  'annotations': [{'start': 25, 'end': 30, 'label': 'CLEntity'}]},
 {'content': 'Who, when his arms had wrought the destined fall',
  'annotations': []},
 {'content': 'Of sacred Troy, and razed her heaven-built wall,',
  'annotations': [{'start': 9, 'end': 14, 'label': 'CLEntity'}]},
 {'content': 'Wandering from clime to clime, observant stray’d,',
  'annotations': []},
 {'content': 'Their manners noted, and their states survey’d,',
  'annotations': []},
 {'content': 'On stormy seas unnumber’d toils he bore,', 'annotations': []},
 {'content': 'Safe with his friends to gain his natal shore:',
  'annotations': []}]

In [None]:
from transformers import BertTokenizerFast
