In [1]:
import re
import torch
import json

from seqeval.metrics import accuracy_score, f1_score, classification_report, precision_score, recall_score
from seqeval.scheme import BILOU
from tokenizers import decoders, Encoding
from transformers import BertForTokenClassification, BertTokenizer, BertTokenizerFast, BatchEncoding

from ner_pipeline.alignment import align_tokens_and_annotations_bilou
from ner_pipeline.scrape_for_training import prepare_data
from ner_pipeline.labelset import LabelSet

In [2]:
# Load the fine-tuned BERT token-classification checkpoint used for NER.
# NOTE(review): the identifier looks like a path relative to the notebook's
# working directory — confirm it is a local directory and not a hub repo id.
model = BertForTokenClassification.from_pretrained("BERT/bert_ner_finetuned_iliad-with-gpu-pattern2.model")

In [3]:
# Cased BERT tokenizer (fast implementation). The reference-label cell further
# down indexes the tokenizer output by integer to obtain an `Encoding` object.
# NOTE(review): assumes the checkpoint above was fine-tuned with this same
# vocabulary — confirm.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

In [4]:
# Position -> tag string lookup (BILOU tags for the single CLEntity type).
# Both the model's argmax class ids and the aligned reference label ids are
# used as indices into this list, so the order is load-bearing.
# NOTE(review): assumes this order matches the id mapping used at training
# time and by LabelSet — confirm.
label_list = ['O', 'B-CLEntity', 'I-CLEntity', 'L-CLEntity', 'U-CLEntity']

In [5]:
# Read the Odyssey text into a list with one whitespace-stripped string per
# line of the file (blank lines are still present at this point).
# The context manager guarantees the file handle is closed once reading is
# done; previously the handle was opened and never closed.
# NOTE(review): no explicit encoding is passed, so decoding uses the platform
# default — confirm the file's encoding and pin it if it is known.
with open("example-texts/odyssey.txt") as odyssey:
    odyssey_lines = [line.strip() for line in odyssey]

In [6]:
# Drop the empty strings left behind by blank lines in the source text.
odyssey_lines = list(filter(None, odyssey_lines))

In [7]:
# Preview the first five lines as a quick sanity check of the loaded text.
odyssey_lines[:5]

['The man for wisdom’s various arts renown’d,',
 'Long exercised in woes, O Muse! resound;',
 'Who, when his arms had wrought the destined fall',
 'Of sacred Troy, and razed her heaven-built wall,',
 'Wandering from clime to clime, observant stray’d,']

In [9]:
# Run the fine-tuned model over the text line by line and collect, for each
# line, the list of predicted tag strings (one per input id, special tokens
# included) in `pred`.
#
# PREVIEW_LINES caps how many lines are processed. With the value 1 only the
# first line is run and its intermediate values are printed as a sanity check
# (this replaces a bare `break` at the end of the loop body). Set it to None
# to process every line; the prints below will then emit output for each line.
PREVIEW_LINES = 1

pred = []
with torch.no_grad():  # inference only: no autograd graph is needed
    for line in odyssey_lines[:PREVIEW_LINES]:
        inputs_line = tokenizer.encode(line, return_tensors="pt")
        # Recover the tokens directly from the ids fed to the model, so the
        # printed tokens correspond one-to-one with the predictions. Decoding
        # the ids back to text and re-tokenizing is not guaranteed to
        # reproduce the same token sequence, and encodes every line twice.
        tokens_line = tokenizer.convert_ids_to_tokens(inputs_line[0].tolist())
        print(tokens_line)
        print(inputs_line)

        outputs_line = model(inputs_line).logits
        # Highest-scoring class id for every position of the single sequence.
        predictions_line = torch.argmax(outputs_line, dim=2)
        print(predictions_line)

        pred_line_label = [label_list[label_id] for label_id in predictions_line[0].tolist()]
        pred.append(pred_line_label)
        print(pred_line_label)

# Kept for reference: dumps `pred` to disk as JSON.
# NOTE(review): a later cell reads 'odyssey_ner_predictions.txt'; presumably
# that file was produced by a full run with these lines enabled — confirm.
# with open('odyssey_ner_predictions.txt', 'w') as f:
#     f.write(json.dumps(pred))

['[CLS]', 'The', 'man', 'for', 'wisdom', '’', 's', 'various', 'arts', 're', '##no', '##wn', '’', 'd', ',', '[SEP]']
tensor([[  101,  1109,  1299,  1111, 12304,   787,   188,  1672,  3959,  1231,
          2728,  6540,   787,   173,   117,   102]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [11]:
# Replace `pred` with the predictions stored on disk as JSON.
with open('odyssey_ner_predictions.txt', 'r') as predictions_file:
    pred = json.load(predictions_file)

In [13]:
# Evaluation: build regex-derived reference annotations for the same text.
# The pattern matches one capitalised word (an uppercase ASCII letter followed
# by one or more lowercase ASCII letters), optionally followed by further
# capitalised words, each preceded by a single whitespace character.
# NOTE(review): the pattern also matches ordinary line-initial words such as
# "The" or "Of", and the preview of the result shows those spans annotated as
# CLEntity — confirm this is intended for the reference labels.
# NOTE(review): presumably prepare_data applies the pattern per line of the
# file and returns {'content', 'annotations'} records — verify in
# ner_pipeline.scrape_for_training.
pattern = r'(\b[A-Z][a-z]+\b)(\s\b[A-Z][a-z]+\b)*'
odyssey_regex_matches = prepare_data("example-texts/odyssey.txt", pattern)

In [14]:
# Preview the first five annotated records (line text plus character spans).
odyssey_regex_matches[:5] 

[{'content': 'The man for wisdom’s various arts renown’d,',
  'annotations': [{'start': 0, 'end': 3, 'label': 'CLEntity'}]},
 {'content': 'Long exercised in woes, O Muse! resound;',
  'annotations': [{'start': 0, 'end': 4, 'label': 'CLEntity'},
   {'start': 26, 'end': 30, 'label': 'CLEntity'}]},
 {'content': 'Who, when his arms had wrought the destined fall',
  'annotations': [{'start': 0, 'end': 3, 'label': 'CLEntity'}]},
 {'content': 'Of sacred Troy, and razed her heaven-built wall,',
  'annotations': [{'start': 0, 'end': 2, 'label': 'CLEntity'},
   {'start': 10, 'end': 14, 'label': 'CLEntity'}]},
 {'content': 'Wandering from clime to clime, observant stray’d,',
  'annotations': [{'start': 0, 'end': 9, 'label': 'CLEntity'}]}]

In [15]:
# Label set containing the single entity type CLEntity; it is used when
# aligning the character-span annotations to tokenizer output.
# NOTE(review): presumably LabelSet expands each label into BILOU tag ids —
# verify in ner_pipeline.labelset.
clentity_label_set = LabelSet(labels=["CLEntity"])

In [11]:
# Build the reference tag sequences: for every annotated record, tokenize its
# text, align the character-span annotations to the resulting tokens, and map
# the aligned label ids to tag strings. `true` ends up with one list of tag
# strings per record.
true = []
for match in odyssey_regex_matches:
    match_tokenized_batch: BatchEncoding = tokenizer(match["content"])
    # Integer indexing selects the `Encoding` of the single sequence passed in.
    match_tokenized_text: Encoding = match_tokenized_batch[0]
    aligned_label_ids = clentity_label_set.get_aligned_label_ids_from_annotations(
        match_tokenized_text, match["annotations"]
    )
    # NOTE(review): assumes the ids produced by LabelSet index `label_list`
    # in the same order as the model's class ids — confirm.
    true.append([label_list[label_id] for label_id in aligned_label_ids])

NameError: name 'odyssey_regex_matches' is not defined

In [12]:
# Show how many tag sequences exist on the reference side and on the
# prediction side, one count per output line (reference first).
print(len(true), len(pred), sep="\n")

[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
0
1


In [18]:
# Print seqeval's precision / recall / F1 report for the predicted tag
# sequences against the reference ones, using strict mode with the BILOU scheme.
# NOTE(review): assumes `true` and `pred` hold the same number of sequences
# and that corresponding sequences have equal length — confirm before
# trusting the scores.
print(classification_report(true, pred, mode='strict', scheme=BILOU))

              precision    recall  f1-score   support

    CLEntity       0.86      0.25      0.39     15928

   micro avg       0.86      0.25      0.39     15928
   macro avg       0.86      0.25      0.39     15928
weighted avg       0.86      0.25      0.39     15928

