# Entity linking evaluation
You are provided with the annotations produced by an entity linking system for a set of documents, along with the corresponding ground truth annotations, and asked to evaluate the system's output.

In [None]:
pip install ipytest

In [None]:
import ipytest
import pytest

# Configure ipytest with its defaults for notebook use so that the test cells
# further down can run pytest in-place (see the ipytest docs for exact settings).
ipytest.autoconfig()

The annotations produced by the entity linking system under evaluation.

In [None]:
# System output for the first document. Each entry is a 3-tuple of
# (integer, mention text, linked entity identifier); the integer is presumably
# the mention's position in the document -- TODO confirm.
# Every entry here also appears in GROUND_TRUTH_ANNOTATIONS_1.
LINKED_ENTITIES_1 = [ 
    (0, 'angola', 'wikipedia:Angola'),
    (14, 'multiparty democracy', 'wikipedia:multiparty_democracy'),
    (18, '1992 elections', 'wikipedia:Philippine_general_election,_1992')
]

# System output for the second document, in the same 3-tuple layout as
# LINKED_ENTITIES_1. The (10, '1975', ...) entry has no counterpart in
# GROUND_TRUTH_ANNOTATIONS_2, so it counts as an incorrect annotation.
LINKED_ENTITIES_2 = [
    (5, 'angola', 'wikipedia:Angola'),
    (10, '1975', 'wikipedia:Philippine_general_election,_1992'),
    (13, 'one party', 'wikipedia:Single-party_state')
]

Ground truth annotations (reference annotations).

In [None]:
# Reference annotations for the first document, as 3-tuples of
# (integer, mention text, linked entity identifier).
# The (4, 'one-party', ...) entry is absent from LINKED_ENTITIES_1, i.e. it is
# an annotation the system missed.
GROUND_TRUTH_ANNOTATIONS_1 = [ 
    (0, 'angola', 'wikipedia:Angola'),
    (4, 'one-party', 'wikipedia:Single-party_state'),
    (14, 'multiparty democracy', 'wikipedia:multiparty_democracy'),
    (18, '1992 elections', 'wikipedia:Philippine_general_election,_1992')
]

# Reference annotations for the second document, as 3-tuples of
# (integer, mention text, linked entity identifier).
# The (14, 'Republic', ...) entry is absent from LINKED_ENTITIES_2, i.e. it is
# an annotation the system missed.
GROUND_TRUTH_ANNOTATIONS_2 = [
    (5, 'angola', 'wikipedia:Angola'),
    (13, 'one party', 'wikipedia:Single-party_state'),
    (14, 'Republic', 'wikipedia:Republic')
]

Set-based metrics where:
- precision is defined as the fraction of the entities annotated by the system that are correctly linked
- recall is defined as the fraction of the entities that should be annotated (the ground truth) that are correctly linked by the system
- F-measure is the harmonic mean of precision and recall

In [None]:
def set_based_precision(annotations, relevance_annotations):
  """Computes set-based precision.

  Precision is the fraction of the system's distinct annotations that also
  appear among the reference annotations.

  Args:
      annotations: Iterable of hashable annotations produced by the system.
      relevance_annotations: Iterable of hashable reference (ground truth)
        annotations.

  Returns:
      Set-based precision in [0, 1]; 0.0 when the system produced no
      annotations (instead of raising ZeroDivisionError).
  """
  predicted = set(annotations)
  if not predicted:
    # Nothing was annotated, so the ratio is undefined; report 0.0 by convention.
    return 0.0
  # Numerator and denominator both count *distinct* annotations, so a repeated
  # annotation cannot deflate the score.
  return len(predicted.intersection(relevance_annotations)) / len(predicted)

In [None]:
def set_based_recall(annotations, relevance_annotations):
  """Computes set-based recall.

  Recall is the fraction of the distinct reference annotations that the
  system also produced.

  Args:
      annotations: Iterable of hashable annotations produced by the system.
      relevance_annotations: Iterable of hashable reference (ground truth)
        annotations.

  Returns:
      Set-based recall in [0, 1]; 0.0 when there are no reference
      annotations (instead of raising ZeroDivisionError).
  """
  expected = set(relevance_annotations)
  if not expected:
    # Nothing should have been annotated, so the ratio is undefined; report 0.0.
    return 0.0
  # Numerator and denominator both count *distinct* reference annotations, so
  # a repeated ground-truth entry cannot deflate the score.
  return len(expected.intersection(annotations)) / len(expected)

In [None]:
def f1_score(precision, recall):
  """Computes F-measure (F1), the harmonic mean of precision and recall.

  Args:
      precision: Precision value.
      recall: Recall value.

  Returns:
      F-measure; 0.0 when precision and recall are both zero (the harmonic
      mean is undefined there and would otherwise raise ZeroDivisionError).
  """
  total = precision + recall
  if total == 0:
    return 0.0
  return 2 * precision * recall / total

## Metrics over the collection of documents

Micro-averaged - averaged across mentions

In [None]:
import itertools 

def micro_precision(annotations, ground_truth_annotations):
  """Computes micro-averaged precision.

  All annotations of the collection are pooled and a single precision value is
  computed over the pool, so every annotation carries the same weight.

  Args:
      annotations: Sequence with one entry per document, each an iterable of
        hashable system annotations.
      ground_truth_annotations: Sequence with one entry per document (same
        order as `annotations`), each an iterable of hashable reference
        (ground truth) annotations.

  Returns:
      Micro-averaged precision in [0, 1]; 0.0 when the system produced no
      annotations at all.
  """
  # Tag each annotation with its document index before pooling. Without the
  # tag, an annotation in one document could be matched against the ground
  # truth of a different document, and identical tuples occurring in several
  # documents would collapse into one.
  system = {(doc_id, annotation)
            for doc_id, document in enumerate(annotations)
            for annotation in document}
  reference = {(doc_id, annotation)
               for doc_id, document in enumerate(ground_truth_annotations)
               for annotation in document}
  if not system:
    # Precision is undefined without any system annotation; report 0.0.
    return 0.0
  return len(system & reference) / len(system)

In [None]:
def micro_recall(all_annotations, ground_truth_annotations):
  """Computes micro-averaged recall.

  All reference annotations of the collection are pooled and a single recall
  value is computed over the pool, so every annotation carries the same weight.

  Args:
      all_annotations: Sequence with one entry per document, each an iterable
        of hashable system annotations.
      ground_truth_annotations: Sequence with one entry per document (same
        order as `all_annotations`), each an iterable of hashable reference
        (ground truth) annotations.

  Returns:
      Micro-averaged recall in [0, 1]; 0.0 when there are no reference
      annotations at all.
  """
  # Tag each annotation with its document index before pooling. Without the
  # tag, ground truth in one document could be satisfied by a system
  # annotation from a different document, and identical tuples occurring in
  # several documents would collapse into one.
  system = {(doc_id, annotation)
            for doc_id, document in enumerate(all_annotations)
            for annotation in document}
  reference = {(doc_id, annotation)
               for doc_id, document in enumerate(ground_truth_annotations)
               for annotation in document}
  if not reference:
    # Recall is undefined without any reference annotation; report 0.0.
    return 0.0
  return len(system & reference) / len(reference)

Tests

In [None]:
%%run_pytest[clean]

def test_micro_precision():
  """Micro precision on the two sample documents: 5 of 6 system annotations are correct."""
  system_output = [LINKED_ENTITIES_1, LINKED_ENTITIES_2]
  reference = [GROUND_TRUTH_ANNOTATIONS_1, GROUND_TRUTH_ANNOTATIONS_2]
  expected = pytest.approx(5/6, rel=1e-2)
  assert micro_precision(system_output, reference) == expected

def test_micro_recall():
  """Micro recall on the two sample documents: 5 of 7 reference annotations are found."""
  system_output = [LINKED_ENTITIES_1, LINKED_ENTITIES_2]
  reference = [GROUND_TRUTH_ANNOTATIONS_1, GROUND_TRUTH_ANNOTATIONS_2]
  expected = pytest.approx(5/7, rel=1e-2)
  assert micro_recall(system_output, reference) == expected

def test_micro_f1():
  """F-measure of the micro-averaged precision (5/6) and recall (5/7)."""
  system_output = [LINKED_ENTITIES_1, LINKED_ENTITIES_2]
  reference = [GROUND_TRUTH_ANNOTATIONS_1, GROUND_TRUTH_ANNOTATIONS_2]
  precision = micro_precision(system_output, reference)
  recall = micro_recall(system_output, reference)
  expected_f1 = (2 * 5/6 * 5/7) / (5/6 + 5/7)
  assert f1_score(precision, recall) == pytest.approx(expected_f1, rel=1e-2)

Macro-averaged - averaged across documents

In [None]:
def macro_precision(annotations, ground_truth_annotations):
  """Computes macro-averaged precision.

  Precision is computed separately for each document and the per-document
  values are averaged, so every document carries the same weight.

  Args:
      annotations: Sequence with one entry per document, each a collection of
        system annotations.
      ground_truth_annotations: Sequence with one entry per document (same
        order as `annotations`), each a collection of reference (ground truth)
        annotations.

  Returns:
      Macro-averaged precision; 0.0 when there are no ground-truth documents
      (instead of raising ZeroDivisionError).
  """
  document_count = len(ground_truth_annotations)
  if document_count == 0:
    # No documents to average over.
    return 0.0
  # Documents are paired by position; zip stops at the shorter sequence, while
  # the average is taken over the number of ground-truth documents.
  per_document = [
      set_based_precision(system, reference)
      for system, reference in zip(annotations, ground_truth_annotations)
  ]
  return sum(per_document) / document_count

In [None]:
def macro_recall(annotations, ground_truth_annotations):
  """Computes macro-averaged recall.

  Recall is computed separately for each document and the per-document values
  are averaged, so every document carries the same weight.

  Args:
      annotations: Sequence with one entry per document, each a collection of
        system annotations.
      ground_truth_annotations: Sequence with one entry per document (same
        order as `annotations`), each a collection of reference (ground truth)
        annotations.

  Returns:
      Macro-averaged recall; 0.0 when there are no ground-truth documents
      (instead of raising ZeroDivisionError).
  """
  document_count = len(ground_truth_annotations)
  if document_count == 0:
    # No documents to average over.
    return 0.0
  # Documents are paired by position; zip stops at the shorter sequence, while
  # the average is taken over the number of ground-truth documents.
  per_document = [
      set_based_recall(system, reference)
      for system, reference in zip(annotations, ground_truth_annotations)
  ]
  return sum(per_document) / document_count

Tests

In [None]:
%%run_pytest[clean]

def test_macro_precision():
  """Macro precision: per-document precisions 1 and 2/3, averaged over two documents."""
  system_output = [LINKED_ENTITIES_1, LINKED_ENTITIES_2]
  reference = [GROUND_TRUTH_ANNOTATIONS_1, GROUND_TRUTH_ANNOTATIONS_2]
  expected = pytest.approx((1 + 2/3)/2, rel=1e-2)
  assert macro_precision(system_output, reference) == expected

def test_macro_recall():
  """Macro recall: per-document recalls 3/4 and 2/3, averaged over two documents."""
  system_output = [LINKED_ENTITIES_1, LINKED_ENTITIES_2]
  reference = [GROUND_TRUTH_ANNOTATIONS_1, GROUND_TRUTH_ANNOTATIONS_2]
  expected = pytest.approx((3/4 + 2/3)/2, rel=1e-2)
  assert macro_recall(system_output, reference) == expected

def test_macro_f1():
  """F-measure of the macro-averaged precision (5/6) and recall (17/24)."""
  system_output = [LINKED_ENTITIES_1, LINKED_ENTITIES_2]
  reference = [GROUND_TRUTH_ANNOTATIONS_1, GROUND_TRUTH_ANNOTATIONS_2]
  precision = macro_precision(system_output, reference)
  recall = macro_recall(system_output, reference)
  expected_f1 = (2 * 5/6 * 17/24) / (5/6 + 17/24)
  assert f1_score(precision, recall) == pytest.approx(expected_f1, rel=1e-2)