<a href="https://colab.research.google.com/github/jlopetegui98/NER-ClinicalTrials-Elegibility-Criteria/blob/main/Roberta%2BLLM/evaluate_roberta_chia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive when running inside Colab. Outside Colab the
# `google.colab` package is not installed, so the mount is skipped instead of
# aborting the whole notebook run with an ImportError.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Colab setup: install the libraries imported by this notebook.
# The commands are active as written; comment them out when the packages are
# already available in the environment.
# NOTE(review): no versions are pinned and two packages are built from git HEAD,
# so a re-run may pull different releases than the ones that produced the outputs below.
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U datasets
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install seqeval
!pip install -q -U evaluate

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone
Collecting seqeval

In [2]:
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification,  Trainer, TrainingArguments
from datasets import load_dataset, load_metric
from seqeval.metrics import classification_report
from seqeval.scheme import IOB2
import evaluate
import torch

In [3]:
from huggingface_hub import notebook_login

# Opens the interactive Hugging Face login widget in the notebook.
# NOTE(review): presumably only needed when the dataset loaded further down is
# private or gated — confirm; the prompt blocks unattended "Run All" executions.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# Entity label tables shared by the tokenization and evaluation cells.

# Bare entity type names (unordered).
simple_ent = {"Condition", "Value", "Drug", "Procedure", "Measurement", "Temporal", "Observation", "Person", "Device"}

# Label names in id order: "O" is id 0, then one B-/I- pair per entity type,
# so the I- id of a type is always its B- id plus one.
entities_list = ["O"] + [
    f"{prefix}-{entity}"
    for entity in (
        "Condition", "Value", "Drug", "Procedure", "Measurement",
        "Temporal", "Observation", "Person", "Device",
    )
    for prefix in ("B", "I")
]

# label name -> integer id
sel_ent = {label: idx for idx, label in enumerate(entities_list)}

# integer id -> label name
sel_ent_inv = dict(enumerate(entities_list))

In [5]:
import os

# Project root. The original cell assigned '..' and then unconditionally
# overwrote it with the Drive path; the choice is now made by checking whether
# the mounted Google Drive project folder exists (Colab) and falling back to
# the repository-relative parent directory otherwise.
_drive_root = './drive/MyDrive/TER-LISN-2024'
root = _drive_root if os.path.isdir(_drive_root) else '..'
data_path = f'{root}/data'
models_path = f'{root}/models'

In [6]:
# Name of the base checkpoint; it is used below to load the matching tokenizer.
model_name = "roberta-base"

In [7]:
# add_prefix_space=True: the sentences are passed to the tokenizer already split
# into words (is_split_into_words=True in tokenize_and_align_labels), which the
# RoBERTa byte-level BPE tokenizer only accepts with this option enabled.
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
# tokenize and align the labels in the dataset
def tokenize_and_align_labels(sentence, tokenizer, flag = 'I', label_names = None):
    """
    Tokenize a batch of pre-split sentences and align the word-level labels
    with the resulting sub-word tokens.

    inputs:
        sentence: dict, a batch from the dataset with the fields
            - 'tokens': list of sentences, each a list of words
            - 'ner_tags': list of label-id lists, one id per word
        tokenizer: callable tokenizer; its result must provide
            word_ids(batch_index=i) and support item assignment
        flag: str, how to label the sub-words that continue a word
            - 'I': a word labelled B-ENT gets I-ENT on its continuation sub-words;
                   words labelled I-ENT or O keep their own label on every sub-word
            - 'B': every sub-word repeats the label of its word
            - None: use -100 for continuation sub-words
        label_names: list of str, label names indexed by label id; defaults to
            the module-level entities_list (only consulted when flag is 'I')
    outputs:
        tokenized_sentence: the tokenizer output with two extra fields,
            'labels' (aligned label ids) and 'word_ids' (word index per token)
    raises:
        ValueError: if flag is not 'I', 'B' or None
    """
    # An unknown flag used to append nothing for continuation sub-words, which
    # silently shifted every following label; reject it up front instead.
    if flag not in ('I', 'B', None):
        raise ValueError(f"flag must be 'I', 'B' or None, got {flag!r}")

    names = None
    if flag == 'I':
        names = entities_list if label_names is None else label_names

    tokenized_sentence = tokenizer(sentence['tokens'], is_split_into_words=True, truncation=True, padding='max_length', max_length=512)

    labels = []
    all_word_ids = []
    for i, labels_s in enumerate(sentence['ner_tags']):
        word_ids = tokenized_sentence.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # special or padding token: ignored by the loss
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # first sub-word of a word carries the word's label
                label_ids.append(labels_s[word_idx])
            elif flag is None:
                label_ids.append(-100)
            elif flag == 'I' and names[labels_s[word_idx]].startswith('B-'):
                # B-ENT -> I-ENT; relies on the I- id being the B- id plus one.
                # 'O' and I-ENT words fall through to the branch below, so an
                # 'O' continuation is no longer turned into id 1.
                label_ids.append(labels_s[word_idx] + 1)
            else:
                label_ids.append(labels_s[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
        all_word_ids.append(word_ids)
    tokenized_sentence['labels'] = labels
    tokenized_sentence['word_ids'] = all_word_ids
    return tokenized_sentence

In [9]:
# Download (or load from the local cache) the 'JavierLopetegui/chia_v1' dataset from the Hugging Face Hub.
dataset = load_dataset('JavierLopetegui/chia_v1')

In [10]:
# Tokenize every split in batches and attach the sub-word aligned 'labels' and
# 'word_ids' columns (flag 'I': see tokenize_and_align_labels).
# Note: the cell rebinds `dataset`, so it expects the freshly loaded dataset as input.
dataset = dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer, 'I'), batched = True)

Map:   0%|          | 0/1307 [00:00<?, ? examples/s]

In [11]:
# Inspect the splits and the columns added by the tokenization step.
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'file', 'index', 'input_ids', 'attention_mask', 'labels', 'word_ids'],
        num_rows: 8881
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'file', 'index', 'input_ids', 'attention_mask', 'labels', 'word_ids'],
        num_rows: 1307
    })
    val: Dataset({
        features: ['tokens', 'ner_tags', 'file', 'index', 'input_ids', 'attention_mask', 'labels', 'word_ids'],
        num_rows: 2221
    })
})

In [12]:
# Load the fine-tuned model.
# - The file holds a fully pickled model object rather than a state_dict, so
#   weights_only=False is required on PyTorch releases where torch.load
#   defaults to weights_only=True.
# - Unpickling can execute arbitrary code: only load checkpoints you created
#   yourself or otherwise trust.
# - map_location='cpu' lets the checkpoint load on machines without a GPU; the
#   model is moved to the selected device in a later cell.
model = torch.load(f'{models_path}/roberta-ner-chia.pt', map_location='cpu', weights_only=False)

In [13]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [14]:
# Model inputs only: drop every column except 'input_ids' and 'attention_mask'.
# The removed columns (gold tags, word ids, ...) remain available in dataset['test'].
data_for_model = dataset['test'].remove_columns(['file', 'tokens', 'labels', 'index', 'ner_tags', 'word_ids'])

In [15]:
# Confirm that only the two model input columns are left.
data_for_model

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 1307
})

In [16]:
# Iterate over the test set in batches of 8; no shuffling is requested, so
# predictions come back in dataset order.
# NOTE(review): no tensor format is set on the dataset, so each batch field
# presumably arrives as a list of per-position tensors that the inference loop
# has to re-stack — confirm.
data_loader = torch.utils.data.DataLoader(data_for_model, batch_size=8)

In [17]:
# Move the model parameters to the selected device (the cell output shows the module structure).
model.to(device)

RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (L

In [18]:
from tqdm import tqdm

In [19]:
# Sanity check: examples are padded to max_length=512 by the tokenizer call.
len(data_loader.dataset[2]['attention_mask'])

512

In [20]:
# Predict one label id per sub-word token for every test example.

# Inference mode: without this, dropout stays active whenever the checkpoint
# was pickled in training mode, making predictions noisy and non-reproducible.
model.eval()

labels = []
for batch in tqdm(data_loader):
    # Each field is a list with one tensor of shape (batch,) per token position;
    # stacking along dim=1 rebuilds the (batch, seq_len) layout the model expects.
    batch_tokenizer = {
        'input_ids': torch.stack(batch['input_ids'], dim=1).long().to(device),
        'attention_mask': torch.stack(batch['attention_mask'], dim=1).long().to(device),
    }

    with torch.no_grad():
        outputs = model(**batch_tokenizer)

    # argmax over the label dimension -> (batch, seq_len) label ids, stored as
    # one plain Python list per example.
    labels.extend(torch.argmax(outputs.logits, dim=2).cpu().tolist())

100%|██████████| 164/164 [00:50<00:00,  3.22it/s]


In [21]:
def annotate_sentences(dataset, labels, entities_list,criteria = 'first_label'):
    """
    Collapse sub-word predictions into one label id per word.

    inputs:
        dataset: dataset, indexable collection of rows with the fields
            'tokens' (list of words) and 'word_ids' (word index per sub-word
            token, None for special/padding tokens)
        labels: list, predicted label ids per sub-word token, one list per row
        entities_list: list, label names indexed by label id
        criteria: str, how to pick the label when the word pieces disagree
            - first_label: use the label of the first word piece
            - majority: use the most frequent label among the word pieces; if the
              first piece is a B- label and the winner is an entity label, the
              result is forced to the B- label of the winning entity type
    outputs:
        annotated_sentences: list, one list of label ids (one per word) per row
    raises:
        ValueError: if criteria is not 'first_label' or 'majority'
    """
    # An unknown criteria used to return empty annotations without any warning.
    if criteria not in ('first_label', 'majority'):
        raise ValueError(f"criteria must be 'first_label' or 'majority', got {criteria!r}")

    # Derived from the entities_list argument so the function no longer depends
    # on the module-level sel_ent mapping.
    label_to_id = {name: idx for idx, name in enumerate(entities_list)}
    outside_id = label_to_id.get('O', 0)

    annotated_sentences = []
    for i in range(len(dataset)):
        sentence = dataset[i]  # fetch the row once
        pieces_per_word = [[] for _ in sentence['tokens']]
        for word_id, label in zip(sentence['word_ids'], labels[i]):
            if word_id is not None:
                pieces_per_word[word_id].append(label)

        word_labels = []
        for pieces in pieces_per_word:
            if not pieces:
                # The word has no sub-word token (e.g. cut off by truncation):
                # predict "outside" instead of failing with an IndexError.
                word_labels.append(outside_id)
            elif criteria == 'first_label':
                word_labels.append(pieces[0])
            else:
                starts_flag = entities_list[pieces[0]].startswith('B')
                ent = max(set(pieces), key=pieces.count)
                if starts_flag and ent != outside_id:
                    # keep the "beginning" marker of the first piece on the majority entity type
                    word_labels.append(label_to_id['B-' + entities_list[ent][2:]])
                else:
                    word_labels.append(ent)
        annotated_sentences.append(word_labels)
    return annotated_sentences

In [22]:
# Word-level predictions for the test split under both aggregation strategies.
annotated_sentences_first = annotate_sentences(dataset['test'], labels, entities_list, criteria='first_label')
annotated_sentences_max = annotate_sentences(dataset['test'], labels, entities_list, criteria='majority')

In [None]:
# Load the seqeval metric through the `evaluate` library (imported at the top).
# `datasets.load_metric` is deprecated — see the warning in the earlier output —
# and is not available in recent `datasets` releases.
metric = evaluate.load("seqeval")

  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [None]:
def compute_metrics_tr(p):
    """
    Score predicted label ids against gold label ids with seqeval.

    inputs:
        p: tuple (predictions, labels), two parallel lists of label-id sequences;
           positions whose gold label is -100 are ignored
    outputs:
        tuple of four elements:
            - metric.compute result in its default mode
            - metric.compute result with mode='strict', scheme='IOB2'
            - classification_report in its default mode
            - classification_report with mode='strict', scheme=IOB2
    """
    predictions, labels = p

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(predictions, labels):
        # keep only the positions that carry a real gold label
        kept = [(pred_id, gold_id) for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
        true_predictions.append([entities_list[pred_id] for pred_id, _ in kept])
        true_labels.append([entities_list[gold_id] for _, gold_id in kept])

    default_scores = metric.compute(predictions=true_predictions, references=true_labels)
    strict_scores = metric.compute(predictions=true_predictions, references=true_labels, mode='strict', scheme='IOB2')

    default_report = classification_report(true_labels, true_predictions)
    strict_report = classification_report(true_labels, true_predictions, mode='strict', scheme=IOB2)

    return default_scores, strict_scores, default_report, strict_report

In [23]:
def get_labels(p, label_names=None):
    """
    Convert predicted and gold label ids to label names, skipping ignored positions.

    inputs:
        p: tuple (predictions, labels), two parallel lists of label-id sequences;
           positions whose gold label is -100 are dropped from both outputs
        label_names: list of str, label names indexed by label id; defaults to
            the module-level entities_list
    outputs:
        predictions: list of label-name lists for the predictions
        labels: list of label-name lists for the gold labels
    """
    predictions, labels = p
    names = entities_list if label_names is None else label_names

    # Both outputs are built from the same filtered (prediction, gold) pairs.
    # The previous version overwrote `predictions` with the filtered names and
    # then zipped that shorter list against the unfiltered gold ids, which
    # misaligned/truncated the gold sequence whenever a -100 was present.
    pred_names = []
    gold_names = []
    for pred_row, gold_row in zip(predictions, labels):
        kept = [(pred_id, gold_id) for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
        pred_names.append([names[pred_id] for pred_id, _ in kept])
        gold_names.append([names[gold_id] for _, gold_id in kept])

    return pred_names, gold_names






In [24]:
# Label names for the 'first_label' predictions and for the gold word-level tags of the test split.
pred_labels, true_labels = get_labels((annotated_sentences_first, dataset['test']['ner_tags']))

In [40]:
# Spot-check the predicted label names of the first test sentence.
pred_labels[0]

['O',
 'O',
 'O',
 'B-Condition',
 'B-Person',
 'B-Value',
 'O',
 'I-Value',
 'I-Value',
 'I-Value',
 'I-Value',
 'I-Value',
 'O',
 'O',
 'O',
 'I-Observation',
 'O',
 'O']

In [32]:
# from eval_file import *

import argparse
from collections import defaultdict
from itertools import chain
from math import pow
from pathlib import Path

# from common_utils.common_io import load_bio_file_into_sents
# from common_utils.common_log import create_logger
# -*- coding: utf-8 -*-

# -*- coding: utf-8 -*-

import json
import pickle as pkl


def read_from_file(ifn):
    """Return the full content of the text file at path `ifn` as one string."""
    with open(ifn, "r") as source:
        return source.read()


def write_to_file(text, ofn):
    """Write `text` to the file at path `ofn`, replacing any existing content; returns True."""
    with open(ofn, "w") as sink:
        sink.write(text)
    return True


def pkl_load(ifn):
    """
    Load and return the object pickled in the file at path `ifn`.

    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(ifn, "rb") as source:
        return pkl.load(source)


def pkl_dump(pdata, ofn):
    """Pickle `pdata` into the file at path `ofn`, replacing any existing content; returns True."""
    with open(ofn, "wb") as sink:
        pkl.dump(pdata, sink)
    return True


def json_load(ifn):
    """Parse the JSON file at path `ifn` and return the decoded object."""
    with open(ifn, "r") as source:
        return json.load(source)


def json_dump(jdata, ofn):
    """Serialize `jdata` as JSON into the file at path `ofn`, replacing any existing content; returns True."""
    with open(ofn, "w") as sink:
        json.dump(jdata, sink)
    return True


def load_bio_file_into_sents(bio_file, word_sep=" ", do_lower=False):
    """
    Read a BIO-formatted file into nested lists.

    The file holds one word per line (its fields separated by `word_sep`) and a
    blank line between sentences.

    inputs:
        bio_file: path of the file to read
        word_sep: str, separator between the fields of a line
        do_lower: bool, lowercase the whole file content before splitting
    outputs:
        list of sentences; each sentence is a list of lines and each line is the
        list of its fields
    """
    content = read_from_file(bio_file).strip()
    if do_lower:
        content = content.lower()

    return [
        [line.split(word_sep) for line in block.split("\n")]
        for block in content.split("\n\n")
    ]


def output_bio(bio_data, output_file, sep=" "):
    """
    Write nested BIO data to `output_file`.

    Each word (a list of string fields) becomes one line with its fields joined
    by `sep`; every sentence is followed by a blank line.
    """
    with open(output_file, "w") as sink:
        for sentence in bio_data:
            sink.writelines(sep.join(fields) + "\n" for fields in sentence)
            sink.write("\n")


class PRF:
    """Pair of counters for outcomes judged true (correct) and false (incorrect)."""

    def __init__(self):
        self.true, self.false = 0, 0

    def add_true_case(self):
        """Count one more true case."""
        self.true = self.true + 1

    def add_false_case(self):
        """Count one more false case."""
        self.false = self.false + 1

    def get_true_false_counts(self):
        """Return the counters as a (true, false) tuple."""
        return (self.true, self.false)

    def __str__(self):
        # textual form of the attribute dictionary, e.g. "{'true': 0, 'false': 0}"
        return str(vars(self))


class BioEval:
    """
    Entity-level evaluator for BIO-tagged label sequences.

    Entity spans are extracted from the gold and the predicted sequences and
    every predicted span is scored in two ways:
        - strict: the span starts with the same B- label in both sequences, all
          of its tokens carry identical labels, and the gold entity does not
          continue after the predicted end
        - relax: at least one token of the predicted span has the same category
          in the gold sequence
    Token-level accuracy is tracked as well.

    Labels are matched against lowercase 'b-'/'i-' prefixes and the lowercase
    labels in label_not_for_eval ('o' by default), so sequences must be
    lowercased before they are processed (eval_file and eval_mem do so;
    evaluate_annotations does when do_lower=True).

    Counters accumulate over successive evaluation calls until reset() is called.
    """

    def __init__(self):
        # token-level accuracy
        self.acc = PRF()
        # prediction
        self.all_strict = PRF()
        self.all_relax = PRF()
        self.cat_strict = defaultdict(PRF)
        self.cat_relax = defaultdict(PRF)
        # gold standard
        self.gs_all = 0
        self.gs_cat = defaultdict(int)
        self.performance = dict()
        self.counts = dict()
        self.beta = 1
        self.label_not_for_eval = {'o'}

    def reset(self):
        """Clear all counters and results; beta and label_not_for_eval are kept."""
        self.acc = PRF()
        self.all_strict = PRF()
        self.all_relax = PRF()
        self.cat_strict = defaultdict(PRF)
        self.cat_relax = defaultdict(PRF)
        self.gs_all = 0
        self.gs_cat = defaultdict(int)
        self.performance = dict()
        self.counts = dict()

    def set_beta_for_f_score(self, beta):
        """Set the beta used for the F-beta score (1 by default)."""
        print("Using beta={} for calculating F-score".format(beta))
        self.beta = beta

    # def set_logger(self, logger):
    #     self.logger = logger

    def add_labels_not_for_eval(self, *labels):
        """Register additional labels (stored lowercased) that do not start an entity."""
        for each in labels:
            self.label_not_for_eval.add(each.lower())

    def __calc_prf(self, tp, fp, tp_tn):
        """
        Compute precision, recall and the F-beta score.

        beta=1 gives the F1 score, beta=2 favors recall and beta=0.5 favors
        precision; use set_beta_for_f_score to change beta.

        inputs:
            tp: number of predicted spans counted as correct
            fp: number of predicted spans counted as incorrect
            tp_tn: number of gold-standard spans (denominator of the recall)
        outputs:
            (precision, recall, f_beta); each is 0.0 when its denominator is 0
        """
        tp_fp = tp + fp
        pre = 1.0 * tp / tp_fp if tp_fp > 0 else 0.0
        rec = 1.0 * tp / tp_tn if tp_tn > 0 else 0.0
        beta2 = pow(self.beta, 2)
        # Guard on the real denominator: with beta=0 and recall 0 it is zero
        # even though precision + recall is positive.
        denominator = beta2 * pre + rec
        f_beta = (1 + beta2) * pre * rec / denominator if denominator > 0 else 0.0
        return pre, rec, f_beta

    def __measure_performance(self):
        """Fill self.performance with the overall and per-category scores."""
        acc_true_num, acc_false_num = self.acc.get_true_false_counts()
        total_acc_num = acc_true_num + acc_false_num
        # calc acc
        overall_acc = round(1.0 * acc_true_num / total_acc_num, 4) if total_acc_num > 0 else 0.0

        overall = {'acc': overall_acc}
        for mode, counter in (('strict', self.all_strict), ('relax', self.all_relax)):
            true_counts, false_counts = counter.get_true_false_counts()
            pre, rec, f_score = self.__calc_prf(true_counts, false_counts, self.gs_all)
            overall[mode] = {'precision': pre, 'recall': rec, 'f_score': f_score}
        self.performance['overall'] = overall

        category = dict()
        for mode, counters in (('strict', self.cat_strict), ('relax', self.cat_relax)):
            category[mode] = dict()
            for k, v in counters.items():
                true_counts, false_counts = v.get_true_false_counts()
                # gs_cat is a defaultdict: a category that was only predicted gets a gold count of 0
                pre, rec, f_score = self.__calc_prf(true_counts, false_counts, self.gs_cat[k])
                category[mode][k] = {'precision': pre, 'recall': rec, 'f_score': f_score}
        self.performance['category'] = category

    def __measure_counts(self):
        """Fill self.counts with the raw gold and prediction span counts."""
        # gold standard
        expect = {'overall': self.gs_all}
        for k, v in self.gs_cat.items():
            expect[k] = v
        self.counts['expect'] = expect

        # prediction
        prediction = {'strict': dict(), 'relax': dict()}
        for mode, overall_counter, cat_counters in (
            ('strict', self.all_strict, self.cat_strict),
            ('relax', self.all_relax, self.cat_relax),
        ):
            t, f = overall_counter.get_true_false_counts()
            prediction[mode]['overall'] = {'total': t + f, 'true': t, 'false': f}
            for k, v in cat_counters.items():
                t, f = v.get_true_false_counts()
                prediction[mode][k] = {'total': t + f, 'true': t, 'false': f}
        self.counts['prediction'] = prediction

    @staticmethod
    def __strict_match(gs, pred, s_idx, e_idx, en_type):
        """Return True when the predicted span [s_idx, e_idx) exactly matches a gold entity of type en_type."""
        if e_idx < len(gs) and gs[e_idx] == f"i-{en_type}":
            # check token after end in GS is not continued entity token
            return False
        elif gs[s_idx] != f"b-{en_type}" or pred[s_idx] != f"b-{en_type}":
            # force first token to be B-
            return False
        # check every token in span is the same
        for idx in range(s_idx, e_idx):
            if gs[idx] != pred[idx]:
                return False
        return True

    @staticmethod
    def __relax_match(gs, pred, s_idx, e_idx, en_type):
        """Return True when any token of the predicted span [s_idx, e_idx) has category en_type in the gold sequence."""
        # we adopt the partial match strategy which is very loose compare to right-left or approximate match
        for idx in range(s_idx, e_idx):
            # split on the first hyphen only so that category names may contain hyphens
            gs_cate = gs[idx].split("-", 1)[-1]
            pred_cate = pred[idx].split("-", 1)[-1]
            if gs_cate == pred_cate == en_type:
                return True
        return False

    @staticmethod
    def __check_evaluated_already(gs_dict, cate, start_idx, end_idx):
        """
        Tell whether the gold span overlapping a relaxed match was already used.

        Looks for the first gold span of category `cate` overlapping
        [start_idx, end_idx]; returns True when its remaining count is 0,
        otherwise decrements the count and returns False. Returns False when no
        gold span overlaps.
        """
        for k, v in gs_dict.items():
            c, s, e = k
            # NOTE(review): gold spans are stored with an exclusive end index, yet
            # this comparison also treats e == start_idx (and s == end_idx) as an
            # overlap, so an adjacent gold span can be selected. Kept unchanged so
            # that reported scores do not move — confirm the intent.
            if not (e < start_idx or s > end_idx) and c == cate:
                if v == 0:
                    return True
                else:
                    gs_dict[k] -= 1
                    return False
        return False

    def __process_bio(self, gs_bio, pred_bio):
        """Update all counters with one pair of gold/predicted label sequences (lowercase labels)."""
        # measure acc
        for gs_word, pred_word in zip(gs_bio, pred_bio):
            if gs_word == pred_word:
                self.acc.add_true_case()
            else:
                self.acc.add_false_case()

        # process gold standard
        llen = len(gs_bio)
        gs_dict = defaultdict(int)
        cur_idx = 0
        while cur_idx < llen:
            if gs_bio[cur_idx].strip() in self.label_not_for_eval:
                cur_idx += 1
            else:
                start_idx = cur_idx
                end_idx = start_idx + 1
                # maxsplit=1: the category may itself contain hyphens
                _, cate = gs_bio[start_idx].strip().split('-', 1)
                while end_idx < llen and gs_bio[end_idx].strip() == f"i-{cate}":
                    end_idx += 1
                self.gs_all += 1
                self.gs_cat[cate] += 1
                gs_dict[(cate, start_idx, end_idx)] += 1
                cur_idx = end_idx
        # process predictions
        cur_idx = 0
        while cur_idx < llen:
            if pred_bio[cur_idx].strip() in self.label_not_for_eval:
                cur_idx += 1
            else:
                start_idx = cur_idx
                end_idx = start_idx + 1
                _, cate = pred_bio[start_idx].strip().split("-", 1)
                while end_idx < llen and pred_bio[end_idx].strip() == f"i-{cate}":
                    end_idx += 1
                if self.__strict_match(gs_bio, pred_bio, start_idx, end_idx, cate):
                    self.all_strict.add_true_case()
                    self.cat_strict[cate].add_true_case()
                    self.all_relax.add_true_case()
                    self.cat_relax[cate].add_true_case()
                elif self.__relax_match(gs_bio, pred_bio, start_idx, end_idx, cate):
                    # a gold span may be credited to one relaxed match only
                    if self.__check_evaluated_already(gs_dict, cate, start_idx, end_idx):
                        cur_idx = end_idx
                        continue
                    self.all_strict.add_false_case()
                    self.cat_strict[cate].add_false_case()
                    self.all_relax.add_true_case()
                    self.cat_relax[cate].add_true_case()
                else:
                    self.all_strict.add_false_case()
                    self.cat_strict[cate].add_false_case()
                    self.all_relax.add_false_case()
                    self.cat_relax[cate].add_false_case()
                cur_idx = end_idx

    def eval_file(self, gs_file, pred_file):
        """
        Evaluate a prediction BIO file against a gold-standard BIO file.

        Both files are lowercased on load and the label is taken from the last
        field of each line. Raises AssertionError when the files differ in the
        number of sentences or a sentence pair differs in the number of words.
        """
        print("processing gold standard file: {} and prediciton file: {}".format(gs_file, pred_file))
        pred_bio_sents = load_bio_file_into_sents(pred_file, do_lower=True)
        gs_bio_sents = load_bio_file_into_sents(gs_file, do_lower=True)
        # process bio data
        # check two data have same amount of sents
        assert len(gs_bio_sents) == len(pred_bio_sents), \
            "gold standard and prediction have different dimension: gs: {}; pred: {}".format(len(gs_bio_sents), len(pred_bio_sents))
        # measure performance
        for s_idx, (gs_sent, pred_sent) in enumerate(zip(gs_bio_sents, pred_bio_sents)):
            # check two sents have same No. of words
            assert len(gs_sent) == len(pred_sent), \
                "In {}th sentence, the words counts are different; gs: {}; pred: {}".format(s_idx, gs_sent, pred_sent)
            gs_sent = list(map(lambda x: x[-1], gs_sent))
            pred_sent = list(map(lambda x: x[-1], pred_sent))
            self.__process_bio(gs_sent, pred_sent)
        # get the evaluation matrix
        self.__measure_performance()
        self.__measure_counts()

    def eval_mem(self, gs, pred, do_flat=False):
        """
        Evaluate in-memory label sequences (lists of label lists); labels are lowercased.

        With do_flat=True all sentences are concatenated and scored as one sequence.
        """
        # flat sents to sent; we assume input sequences only have 1 dimension (only labels)
        if do_flat:
            print('Sentences have been flatten to 1 dim.')
            gs = list(chain(*gs))
            pred = list(chain(*pred))
            gs = list(map(lambda x: x.lower(), gs))
            pred = list(map(lambda x: x.lower(), pred))
            self.__process_bio(gs, pred)
        else:
            for gs_s, pred_s in zip(gs, pred):
                gs_s = list(map(lambda x: x.lower(), gs_s))
                pred_s = list(map(lambda x: x.lower(), pred_s))
                self.__process_bio(gs_s, pred_s)

        self.__measure_performance()
        self.__measure_counts()

    def evaluate_annotations(self, gs, pred, do_lower=False):
        """
        Evaluate in-memory label sequences sentence by sentence.

        inputs:
            gs: list of gold label-name lists
            pred: list of predicted label-name lists, parallel to gs
            do_lower: bool, lowercase the labels first; required unless the
                labels are already lowercase (see the class docstring)
        """
        for gs_sent, pred_sent in zip(gs, pred):
            if do_lower:
                gs_sent = list(map(lambda x: x.lower(), gs_sent))
                pred_sent = list(map(lambda x: x.lower(), pred_sent))
            self.__process_bio(gs_sent, pred_sent)

        self.__measure_performance()
        self.__measure_counts()

    def get_performance(self):
        """Return the scores computed by the last evaluation call."""
        return self.performance

    def get_counts(self):
        """Return the raw span counts computed by the last evaluation call."""
        return self.counts

    def save_evaluation(self, file):
        """Write the performance dictionary as JSON to the path `file`."""
        with open(file, "w") as f:
            json.dump(self.performance, f)

    def show_evaluation(self, digits=4):
        """
        Build a plain-text report of the per-category and overall scores.

        inputs:
            digits: int, number of decimals shown for each score
        outputs:
            str, the formatted report
        raises:
            RuntimeError: if no evaluation has been run yet
        """
        if len(self.performance) == 0:
            raise RuntimeError('call eval_mem() first to get the performance attribute')

        cate = self.performance['category']['strict'].keys()

        headers = ['precision', 'recall', 'f1']
        # default=0: no category exists when nothing was predicted as an entity,
        # and max() of an empty sequence would raise ValueError
        width = max(max((len(c) for c in cate), default=0), len('overall'), digits)
        head_fmt = '{:>{width}s} ' + ' {:>9}' * len(headers)

        report = head_fmt.format(u'', *headers, width=width)
        report += '\n\nstrict\n'

        row_fmt = '{:>{width}s} ' + ' {:>9.{digits}f}' * 3 + '\n'
        for c in cate:
            precision = self.performance['category']['strict'][c]['precision']
            recall = self.performance['category']['strict'][c]['recall']
            f1 = self.performance['category']['strict'][c]['f_score']
            report += row_fmt.format(c, *[precision, recall, f1], width=width, digits=digits)

        report += '\nrelax\n'

        for c in cate:
            precision = self.performance['category']['relax'][c]['precision']
            recall = self.performance['category']['relax'][c]['recall']
            f1 = self.performance['category']['relax'][c]['f_score']
            report += row_fmt.format(c, *[precision, recall, f1], width=width, digits=digits)

        report += '\n\noverall\n'
        report += 'acc: ' + str(self.performance['overall']['acc'])
        report += '\nstrict\n'
        report += row_fmt.format('', *[self.performance['overall']['strict']['precision'],
                                       self.performance['overall']['strict']['recall'],
                                       self.performance['overall']['strict']['f_score']], width=width, digits=digits)

        report += '\nrelax\n'
        report += row_fmt.format('', *[self.performance['overall']['relax']['precision'],
                                       self.performance['overall']['relax']['recall'],
                                       self.performance['overall']['relax']['f_score']], width=width, digits=digits)
        return report


In [None]:
# NOTE(review): scratch assignment; `s` is not referenced by any other visible cell.
s = "i-"

In [33]:
# Fresh evaluator; its counters accumulate over evaluation calls until reset() is called.
evaluator = BioEval()

In [34]:
# do_lower=True is required here: BioEval matches lowercase 'b-'/'i-' prefixes and 'o',
# while the label names produced above are capitalized.
evaluator.evaluate_annotations(true_labels, pred_labels, do_lower=True)

In [38]:
# Overall and per-category precision / recall / F-score for strict and relaxed matching.
evaluator.performance

{'overall': {'acc': 0.8351,
  'strict': {'precision': 0.6225968648328897,
   'recall': 0.6740313800832533,
   'f_score': 0.6472939729397292},
  'relax': {'precision': 0.7580597456373854,
   'recall': 0.8206852385526737,
   'f_score': 0.7881303813038131}},
 'category': {'strict': {'condition': {'precision': 0.6648394675019577,
    'recall': 0.7683257918552037,
    'f_score': 0.7128463476070528},
   'person': {'precision': 0.7133757961783439,
    'recall': 0.8296296296296296,
    'f_score': 0.7671232876712328},
   'value': {'precision': 0.7067039106145251,
    'recall': 0.7207977207977208,
    'f_score': 0.7136812411847672},
   'drug': {'precision': 0.7180043383947939,
    'recall': 0.7471783295711061,
    'f_score': 0.7323008849557522},
   'temporal': {'precision': 0.49279538904899134,
    'recall': 0.5757575757575758,
    'f_score': 0.5310559006211181},
   'measurement': {'precision': 0.5473372781065089,
    'recall': 0.6379310344827587,
    'f_score': 0.5891719745222931},
   'procedur

In [40]:
# Persist the performance dictionary as JSON in the current working directory.
evaluator.save_evaluation('eval.json')

In [None]:
# seqeval scores and text reports (default and strict IOB2 modes) for the 'first_label' predictions.
results, results_strict,cr1,cr2 = compute_metrics_tr((annotated_sentences_first, dataset['test']['ner_tags']))

In [None]:
# seqeval classification report in its default mode.
print(cr1)

              precision    recall  f1-score   support

   Condition       0.64      0.77      0.70      1105
      Device       0.24      0.30      0.27        23
        Drug       0.68      0.73      0.70       443
 Measurement       0.53      0.62      0.57       290
 Observation       0.30      0.18      0.23       166
      Person       0.76      0.84      0.80       135
   Procedure       0.46      0.49      0.48       313
    Temporal       0.48      0.58      0.52       297
       Value       0.65      0.70      0.68       351

   micro avg       0.60      0.67      0.63      3123
   macro avg       0.53      0.58      0.55      3123
weighted avg       0.59      0.67      0.62      3123



In [None]:
# seqeval classification report in strict mode with the IOB2 scheme.
print(cr2)

              precision    recall  f1-score   support

   Condition       0.69      0.76      0.72      1104
      Device       0.29      0.30      0.30        23
        Drug       0.73      0.73      0.73       443
 Measurement       0.59      0.61      0.60       288
 Observation       0.40      0.17      0.24       166
      Person       0.76      0.84      0.80       135
   Procedure       0.53      0.49      0.51       311
    Temporal       0.58      0.57      0.58       295
       Value       0.70      0.72      0.71       345

   micro avg       0.66      0.66      0.66      3110
   macro avg       0.59      0.58      0.58      3110
weighted avg       0.65      0.66      0.65      3110

