In [2]:
import pandas as pd
from datasets import load_metric
# Module-level seqeval metric object; get_iob_metrics() calls its compute()
# method with per-sentence lists of reference and predicted labels.
# NOTE(review): load_metric appears to be deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package - confirm against the
# version this notebook is pinned to before upgrading.
metric = load_metric("seqeval")

In [3]:
# Read the space-separated evaluation dump. Blank lines are kept
# (skip_blank_lines=False) so they show up as all-NaN rows; get_iob_metrics()
# relies on those NaN rows to find sentence boundaries.
table = pd.read_csv(
    "./eval-bert-base-cased-output.csv",
    sep=" ",
    skip_blank_lines=False,
)

# Pull out the gold labels and the two prediction columns as separate Series.
(
    reference_labels,
    output_labels_no_transformer,
    output_labels_with_transformer,
) = (
    table[column_name]
    for column_name in (
        "ground_truth",
        "predicted_no_transformer",
        "predicted_with_transformer",
    )
)

In [4]:
def get_iob_metrics(ground_truth, predictions):
    """Split flat label sequences into sentences and score them with ``metric``.

    The two iterables are walked in parallel. A position whose ground-truth
    value stringifies to ``"nan"`` (what a blank row becomes when the CSV is
    read with ``skip_blank_lines=False``) closes the current sentence and is
    itself discarded; the prediction at that position is ignored. Every other
    position contributes one label to the current sentence.

    Args:
        ground_truth: Iterable of reference labels with NaN at sentence
            boundaries.
        predictions: Iterable of predicted labels aligned position-by-position
            with ``ground_truth``. Iteration stops at the shorter of the two.

    Returns:
        The value returned by ``metric.compute`` for the per-sentence
        ``references`` and ``predictions`` lists.
    """
    reference_sentences = []
    predicted_sentences = []

    current_references = []
    current_predictions = []

    # Distinct loop names: the original rebound the `ground_truth` parameter
    # inside the loop, which only worked because zip() had already captured it.
    for reference_label, predicted_label in zip(ground_truth, predictions):
        if str(reference_label) == "nan":
            # Boundary row: close the sentence collected so far.
            reference_sentences.append(current_references)
            current_references = []

            predicted_sentences.append(current_predictions)
            current_predictions = []

        else:
            current_references.append(reference_label)
            current_predictions.append(predicted_label)

    # Flush the final sentence when the input does not end with a boundary
    # row; without this its tokens would silently be left out of the score.
    if current_references:
        reference_sentences.append(current_references)
        predicted_sentences.append(current_predictions)

    return metric.compute(references=reference_sentences, predictions=predicted_sentences)

In [5]:
# Score the with-transformer predictions against the gold labels.
# The Series are passed directly: iterating a Series yields its values in row
# order, whereas the `.iloc` indexer is only iterable through Python's legacy
# __getitem__/IndexError fallback, which is slower and easy to misread.
metrics = get_iob_metrics(reference_labels, output_labels_with_transformer)

# Table row: the four overall scores followed by the per-entity-type F1 scores.
row = [
    metrics[key]
    for key in ["overall_precision", "overall_recall", "overall_f1", "overall_accuracy"]
]
row += [metrics[entity_type]["f1"] for entity_type in ["LOC", "MISC", "ORG", "PER"]]

# NOTE(review): `{:2f}` sets a minimum field width of 2 and keeps the default
# six decimals; if two decimals were intended the spec should be `{:.2f}`.
# Left unchanged so the printed row matches the recorded output.
print("with-transformer & {:2f} & {:2f} & {:2f} & {:2f} & {:2f} & {:2f} & {:2f} & {:2f} ".format(*row))

with-transformer & 0.533477 & 0.413178 & 0.465684 & 0.887650 & 0.592362 & 0.317549 & 0.398810 & 0.460220 


In [6]:
# Show which top-level entries the result computed above exposes; the cells
# around this one read the per-entity-type entries via ["f1"] and the
# overall_* entries directly.
metrics.keys()

dict_keys(['LOC', 'MISC', 'ORG', 'PER', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])

In [7]:
# Score the no-transformer predictions against the gold labels.
# The Series are passed directly: iterating a Series yields its values in row
# order, whereas the `.iloc` indexer is only iterable through Python's legacy
# __getitem__/IndexError fallback, which is slower and easy to misread.
metrics = get_iob_metrics(reference_labels, output_labels_no_transformer)

# Table row: per-entity-type F1 scores only.
row = [metrics[entity_type]["f1"] for entity_type in ["LOC", "MISC", "ORG", "PER"]]

# NOTE(review): `{:2f}` sets a minimum field width of 2 and keeps the default
# six decimals; if two decimals were intended the spec should be `{:.2f}`.
# Left unchanged so the printed row matches the recorded output.
print("no-transformer & {:2f} & {:2f} & {:2f} & {:2f} ".format(*row))

no-transformer & 0.766706 & 0.681369 & 0.557279 & 0.602362 


In [11]:
from collections import Counter

In [33]:
# Rows where the no-transformer prediction is wrong but the with-transformer
# prediction matches the gold label. Rows with a NaN gold label drop out
# because the equality test in the second condition is False for NaN.
only_with_transformer_correct = (reference_labels != output_labels_no_transformer) & (
    reference_labels == output_labels_with_transformer
)
selected_rows = table.loc[only_with_transformer_correct, ["token", "ground_truth"]]

# Zip the two columns instead of looping with iterrows(): it yields the same
# (token, ground_truth) tuples in the same order without building a Series
# for every row.
pairs = list(zip(selected_rows["token"], selected_rows["ground_truth"]))

# Most frequent (token, gold label) combinations among those rows.
counter = Counter(pairs)
counter.most_common(12)

[(('-', 'O'), 8),
 (('##o', 'I-PER'), 6),
 (('May', 'O'), 5),
 (('##n', 'I-PER'), 4),
 (('##er', 'O'), 4),
 (('##lli', 'I-PER'), 4),
 (('##P', 'O'), 3),
 (('June', 'O'), 3),
 (('##hare', 'I-PER'), 3),
 (('##s', 'O'), 3),
 (('##ec', 'I-PER'), 3),
 (('U', 'B-LOC'), 3)]

In [36]:
# Rows where the no-transformer prediction matches the gold label but the
# with-transformer prediction does not. Rows with a NaN gold label drop out
# because the equality test in the first condition is False for NaN.
only_no_transformer_correct = (reference_labels == output_labels_no_transformer) & (
    reference_labels != output_labels_with_transformer
)
selected_rows = table.loc[only_no_transformer_correct, ["token", "ground_truth"]]

# Zip the two columns instead of looping with iterrows(): it yields the same
# (token, ground_truth) tuples in the same order without building a Series
# for every row.
pairs = list(zip(selected_rows["token"], selected_rows["ground_truth"]))

# Most frequent (token, gold label) combinations among those rows.
counter = Counter(pairs)
counter.most_common(12)

[(('Iraqi', 'B-MISC'), 22),
 (('.', 'I-ORG'), 13),
 (('European', 'B-MISC'), 13),
 (('W', 'B-MISC'), 13),
 (('Department', 'I-ORG'), 12),
 (('Israeli', 'B-MISC'), 12),
 (('##OR', 'I-MISC'), 12),
 (('Street', 'I-LOC'), 10),
 (('Chinese', 'B-MISC'), 10),
 (('##LD', 'I-MISC'), 10),
 (('Moscow', 'B-LOC'), 9),
 (('-', 'I-MISC'), 9)]