Script to compute the exact match and F1 scores on **version 1 of CliCR**

In [7]:
# Mount Google Drive at /gdrive (Colab asks for an authorization code), then
# switch the working directory to "My Drive" so that the relative paths used
# further down ('Data', 'New-Output-III/...') resolve against it.
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/'My Drive'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive
/gdrive/My Drive


In [8]:
import sys
# Add the relative 'Data' directory to the module search path.
sys.path.append('Data')
# Colab magic: select the TensorFlow 1.x runtime for this session.
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [0]:
import re
import json
import datetime
import os
import pprint
import random
import string

def normalize_answer(s, lemmatizer_comm=None):
    """Return a canonical form of an answer string for scoring.

    The steps are applied in this order:

    1. lemmatization, only when ``lemmatizer_comm`` is not None;
    2. lower-casing;
    3. removal of every character in ``string.punctuation``;
    4. replacement of the stand-alone words "a", "an" and "the" by a space;
    5. collapsing of whitespace runs into single spaces, trimming both ends.

    :param s: the answer text to normalize.
    :param lemmatizer_comm: optional handle forwarded to ``lemmatize``; when
        None the text is not lemmatized.
    :return: the normalized string (possibly empty).
    """
    if lemmatizer_comm is not None:
        # `lemmatize` is not defined in this block; it has to be in scope
        # whenever a lemmatizer handle is supplied.
        s = lemmatize(s, lemmatizer_comm)

    lowered = s.lower()

    punctuation = frozenset(string.punctuation)
    without_punctuation = ''.join([ch for ch in lowered if ch not in punctuation])

    # Articles are removed after punctuation so that e.g. "the," is caught too.
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punctuation)

    return ' '.join(without_articles.split())

In [0]:
def load_json(filename):
    """Read the file at ``filename`` and return its parsed JSON content.

    The file is decoded explicitly as UTF-8 instead of relying on the
    platform's default encoding, so non-ASCII text loads the same way on
    every machine.

    :param filename: path of the JSON file to read.
    :return: the deserialized JSON value (dict, list, ...).
    :raises OSError: if the file cannot be opened.
    :raises ValueError: if the content is not valid JSON
        (``json.JSONDecodeError`` is a subclass of ``ValueError``).
    """
    with open(filename, encoding="utf-8") as in_f:
        return json.load(in_f)

# Keys that evaluate() reads, listed from the outermost level inwards.
DATA_KEY = "data"        # top-level list of dataset entries
DOC_KEY = "document"     # document object inside each entry
QAS_KEY = "qas"          # list of questions inside a document
ID_KEY = "id"            # question id; predictions are looked up by it
ANS_KEY = "answers"      # list of gold answers of a question
TXT_KEY = "text"  # the text part of the answer

# Other keys and markers. They are not referenced by the code shown in this
# notebook, so their exact place in the dataset schema should be checked
# against the data files.
VERSION_KEY = "version"
TITLE_KEY = "title"
CONTEXT_KEY = "context"
SOURCE_KEY = "source"
QUERY_KEY = "query"
ORIG_KEY = "origin"
CUI_KEY = "cui"
SEMTYPE_KEY = "sem_type"
ISIMPOSSIBLE_KEY = "is_impossible"
PLACEHOLDER_KEY = "@placeholder"

In [0]:
def f1_score(prediction, ground_truth, comm=None):
    """Token-level F1 between a predicted answer and one gold answer.

    Both strings are normalized with ``normalize_answer`` and split on
    whitespace. Precision and recall are computed from the multiset overlap
    of the two token lists.

    :param prediction: predicted answer text.
    :param ground_truth: gold answer text.
    :param comm: optional lemmatizer handle forwarded to ``normalize_answer``.
    :return: F1 as a float between 0.0 and 1.0.
    """
    prediction_tokens = normalize_answer(prediction, comm).split()
    ground_truth_tokens = normalize_answer(ground_truth, comm).split()

    if not prediction_tokens or not ground_truth_tokens:
        # Normalization can reduce an answer to nothing (e.g. "the").
        # Two empty answers are identical, so they score 1.0 -- otherwise F1
        # would be 0 where exact match is 1. One empty side scores 0.0.
        return float(prediction_tokens == ground_truth_tokens)

    # Counter intersection keeps, per token, the smaller of the two counts.
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def exact_match_score(prediction, ground_truth, comm=None):
    """Tell whether two answers are identical once normalized.

    :param prediction: predicted answer text.
    :param ground_truth: gold answer text.
    :param comm: optional lemmatizer handle forwarded to ``normalize_answer``.
    :return: True if the normalized strings are equal, False otherwise.
    """
    normalized_prediction = normalize_answer(prediction, comm)
    normalized_truth = normalize_answer(ground_truth, comm)
    return normalized_prediction == normalized_truth


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths, comm=None):
    """Return the best score of a prediction over all gold answers.

    :param metric_fn: callable invoked as ``metric_fn(prediction, truth, comm)``.
    :param prediction: predicted answer text.
    :param ground_truths: iterable of gold answers; each one is scored in turn.
    :param comm: value passed through to ``metric_fn`` as its third argument.
    :return: the maximum of the individual scores.
    :raises ValueError: if ``ground_truths`` is empty (raised by ``max``).
    """
    return max(metric_fn(prediction, truth, comm) for truth in ground_truths)

In [0]:
from collections import  Counter

def evaluate(dataset, predictions, lemmatizer_path=None, extended=False, embeddings_file=None, downcase=False):
    """Compute exact-match and F1 percentages of predictions over a dataset.

    Every question found under ``dataset["data"][i]["document"]["qas"]`` is
    counted. A question whose id is missing from ``predictions`` is reported
    as unanswered and adds nothing to either score, but it still counts in
    the denominator.

    Prints the number of unanswered questions and the total question count.

    :param dataset: parsed dataset dict with the structure described above;
        each question has an "id" and a list of "answers" with a "text" field.
    :param predictions: mapping from question id to predicted answer text.
    :param lemmatizer_path: when truthy, passed to ``lemmatizer`` (not defined
        in this block) and the result is forwarded to the metric functions.
    :param extended: accepted for call compatibility; not used here.
    :param embeddings_file: accepted for call compatibility; not used here.
    :param downcase: accepted for call compatibility; not used here.
    :return: ``{'exact_match': float, 'f1': float}``, both in percent. Both
        are 0.0 when the dataset contains no questions.
    """
    data = dataset[DATA_KEY]
    comm = lemmatizer(lemmatizer_path) if lemmatizer_path else None
    f1 = exact_match = total = 0
    n_unanswered = 0

    for datum in data:
        for qa in datum[DOC_KEY][QAS_KEY]:
            total += 1
            if qa[ID_KEY] not in predictions:
                n_unanswered += 1
                continue
            ground_truths = [answer[TXT_KEY] for answer in qa[ANS_KEY]]
            prediction = predictions[qa[ID_KEY]]
            exact_match += metric_max_over_ground_truths(
                exact_match_score, prediction, ground_truths, comm=comm)
            f1 += metric_max_over_ground_truths(
                f1_score, prediction, ground_truths, comm=comm)

    print("There were {} unanswered instances".format(n_unanswered))
    print(total)

    if total == 0:
        # No questions at all: avoid dividing by zero and report zero scores.
        return {'exact_match': 0.0, 'f1': 0.0}

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total
    # assert exact_match <= f1
    scores = {'exact_match': exact_match, 'f1': f1}
    return scores

In [0]:
def print_scores(scores):
    """Print a score table, one tab-separated ``name<TAB>value`` row per line.

    "exact_match" and "f1" come first, with one decimal place. Every other
    entry follows in alphabetical order of its name, with three decimals.

    :param scores: {"method1": score, ...}; must contain "exact_match" and "f1".
    """
    headline = ("exact_match", "f1")

    print("{}\t{:.1f}".format("exact_match", scores["exact_match"]))
    print("{}\t{:.1f}".format("f1", scores["f1"]))

    for method in sorted(scores):
        if method not in headline:
            print("{}\t{:.3f}".format(method, scores[method]))

In [14]:
# Load the development set and the model predictions, score them and print
# the results. Both paths are relative to the current working directory.
dataset = load_json('New-Output-III/bert_dev.json')       # path for the development json file
predictions = load_json('Data/output_biobertlarge_summarized_1.0_predictions.json')   # path for the predictions file
# Questions with no entry in the predictions file are reported as unanswered
# by evaluate() and still count in the total used for the percentages.
print_scores(evaluate(dataset, predictions))

There were 1048 unanswered instances
3558
exact_match	30.0
f1	31.8


Command to run the evaluation script that returns the exact match and F1 scores for **version 2 of CliCR**








In [15]:
# Shell out to the v2 evaluation script. The two arguments are presumably the
# dataset JSON followed by the predictions JSON -- confirm against Data/eval2.0.py.
! python Data/eval2.0.py New-Output-IV/bert_test.json Data/output_biobertbase_pubmed_pmc_2.0_test_predictions.json

{
  "exact": 52.19933184855234,
  "f1": 53.037111753008176,
  "total": 7184,
  "HasAns_exact": 52.19933184855234,
  "HasAns_f1": 53.037111753008176,
  "HasAns_total": 7184
}
