In [1]:
# Evaluator is a library I've created (https://github.com/LLCampos/evaluator)
from Evaluator import Evaluator

import os
import re

# Normalized format is "{index_begin}:{index_end}:{entity_name}". For example:
# "191:200:carcinoma"

# Convert MER annotation into a normalized format
# For example, converts "191\t200\tcarcinoma\n" into "191:200:carcinoma"
def normalize_tsv_mer_annotations(mer_annotation):
    return mer_annotation.strip().replace('\t', ':')

# Convert Standoff annotation into a normalized format
# For example, converts "[40::58]\tHP_0000006 | autosomal dominant\n" into "40:58:autosomal dominant"
def normalize_standoff_annotation(standoff_annotation):
    index_begin = re.findall('\[(.*)::', standoff_annotation)[0]
    index_end = re.findall('::(.*)]', standoff_annotation)[0]
    entity_name = re.findall('\| (.*)', standoff_annotation)[0]
    
    return '{}:{}:{}'.format(index_begin, index_end, entity_name)
    

def micro_evaluation(path_to_gold_annotations, path_to_test_annotations):
    
    document_ids = os.listdir(path_to_test_annotations)
    # Remove extension
    document_ids = map(lambda document_id: document_id.split('.')[0], document_ids)
    
    precisions = []
    recalls = []
    fscores = []
    number_documents = len(document_ids)

    for document_id in document_ids:
        with open(path_to_mer_annotations + document_id + '.tsv') as f:
            mer_annotations_tsv = f.readlines()

        with open(path_to_gold_annotations + document_id) as f:
            gold_annotations_standoff = f.readlines()

        mer_annotations = map(lambda annotation: normalize_tsv_mer_annotations(annotation), mer_annotations_tsv) 
        gold_annotations = map(lambda annotation: normalize_standoff_annotation(annotation), gold_annotations_standoff) 
        ev = Evaluator.Evaluator(gold_terms=set(gold_annotations), pred_terms=set(mer_annotations))

        precisions.append(ev.precision())
        recalls.append(ev.recall())
        fscores.append(ev.f1_score())

    micro_precision = sum(precisions) / number_documents
    micro_recall = sum(recalls)/number_documents
    micro_f1_score = sum(fscores)/number_documents

    print "Micro Precision: " + str(micro_precision)
    print "Micro Recall: " + str(micro_recall)
    print "Micro F1-Score: " + str(micro_f1_score)
    
def macro_evaluation(path_to_gold_annotations, path_to_test_annotations):
    
    document_ids = os.listdir(path_to_test_annotations)
    # Remove extension
    document_ids = map(lambda document_id: document_id.split('.')[0], document_ids)
    # For calculate Macro Evaluatin I use an hack with the Evaluator module

    evaluator_master = Evaluator.Evaluator(set(), set())

    for document_id in document_ids:
        with open(path_to_mer_annotations + document_id + '.tsv') as f:
            mer_annotations_tsv = f.readlines()

        with open(path_to_gold_annotations + document_id) as f:
            gold_annotations_standoff = f.readlines()

        mer_annotations = map(lambda annotation: normalize_tsv_mer_annotations(annotation), mer_annotations_tsv) 
        gold_annotations = map(lambda annotation: normalize_standoff_annotation(annotation), gold_annotations_standoff) 
        ev = Evaluator.Evaluator(gold_terms=set(gold_annotations), pred_terms=set(mer_annotations))

        evaluator_master._y_pred += ev._y_pred
        evaluator_master._y_true += ev._y_true

    macro_precision = evaluator_master.precision()
    macro_recall = evaluator_master.recall()
    macro_f1_score = evaluator_master.f1_score()

    print "Macro Precision: " + str(macro_precision)
    print "Macro Recall: " + str(macro_recall)
    print "Macro F1-Score: " + str(macro_f1_score)

In [3]:
path_to_gold_annotations = 'stand-off/'

# Compare MER Annotations with Gold Standard

In [14]:
path_to_mer_annotations = 'mer_annotations/annotations/'

micro_evaluation(path_to_gold_annotations, path_to_mer_annotations)
print
macro_evaluation(path_to_gold_annotations, path_to_mer_annotations)

Micro Precision: 0.481173529696
Micro Recall: 0.482252086474
Micro F1-Score: 0.448153217091

Macro Precision: 0.511897852583
Macro Recall: 0.467161016949
Macro F1-Score: 0.488507338687


# Compare Aggregated Results with Gold Standard

In [18]:
aggregated_results_path = 'aggregated_results_0.5/'
micro_evaluation(path_to_gold_annotations, aggregated_results_path)
print
macro_evaluation(path_to_gold_annotations, aggregated_results_path)

Micro Precision: 0.479091995221
Micro Recall: 0.511544011544
Micro F1-Score: 0.487340751492

Macro Precision: 0.634615384615
Macro Recall: 0.75
Macro F1-Score: 0.6875
