In [15]:
import copy
import spacy
import pickle

import numpy as np
from tqdm import tqdm_notebook as tqdm 

In [2]:
# Load the installed `en_core_web_trf` pipeline package. `nlp` is called on raw
# sentence strings further down and its `.ents` are read as the predictions.
import en_core_web_trf
nlp = en_core_web_trf.load()

In [3]:
# Load the labelled evaluation records from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point `filename` at a file from a trusted source.
filename = '../../data/ner_testing.pkl'
# The context manager closes the file even if unpickling raises.
with open(filename, 'rb') as infile:
    ner_testing_data = pickle.load(infile)

In [4]:
# Preview the first five loaded records (sentence text plus labelled character spans).
ner_testing_data[:5]

[{'string': 'EU rejects German call to boycott British lamb .',
  'ents': [{'ent': 'ORG', 'start': 0, 'end': 2},
   {'ent': 'MISC', 'start': 11, 'end': 17},
   {'ent': 'MISC', 'start': 34, 'end': 41}]},
 {'string': 'Peter Blackburn',
  'ents': [{'ent': 'PER', 'start': 0, 'end': 15}]},
 {'string': 'BRUSSELS 1996-08-22',
  'ents': [{'ent': 'LOC', 'start': 0, 'end': 8}]},
 {'string': 'The European Commission said on Thursday it disagreed with German advice to consumers to shun British lamb until scientists determine whether mad cow disease can be transmitted to sheep .',
  'ents': [{'ent': 'ORG', 'start': 4, 'end': 23},
   {'ent': 'MISC', 'start': 59, 'end': 65},
   {'ent': 'MISC', 'start': 94, 'end': 101}]},
 {'string': "Germany's representative to the European Union's veterinary committee Werner Zwingmann said on Wednesday consumers should buy sheepmeat from countries other than Britain until the scientific advice was clearer .",
  'ents': [{'ent': 'LOC', 'start': 0, 'end': 6},
   {'ent

In [5]:
def change_test_data_entities_names(ent_and_string):
    """Normalise the entity labels of one ground-truth record.

    A label ending in ``ORG``, ``PER`` or ``LOC`` is replaced by that bare tag,
    and a label ending in ``MISC`` is replaced by ``NORP``. Any other label is
    left as it is.

    Args:
        ent_and_string: dict with an ``'ents'`` list of entity dicts, each
            carrying its label under the ``'ent'`` key.

    Returns:
        The same ``ent_and_string`` object; its entity dicts are updated in place.
    """
    # (suffix to look for, label to write); the first matching suffix wins.
    suffix_to_label = (
        ("ORG", "ORG"),
        ("PER", "PER"),
        ("LOC", "LOC"),
        ("MISC", "NORP"),
    )
    for annotation in ent_and_string['ents']:
        raw_label = annotation['ent']
        for suffix, label in suffix_to_label:
            if raw_label[-len(suffix):] == suffix:
                annotation['ent'] = label
                break
    return ent_and_string

# Normalise the labels of every ground-truth record (records are updated in place).
ner_testing_data = list(map(change_test_data_entities_names, ner_testing_data))

In [6]:
# Preview the first five records again, now with the remapped labels.
ner_testing_data[:5]

[{'string': 'EU rejects German call to boycott British lamb .',
  'ents': [{'ent': 'ORG', 'start': 0, 'end': 2},
   {'ent': 'NORP', 'start': 11, 'end': 17},
   {'ent': 'NORP', 'start': 34, 'end': 41}]},
 {'string': 'Peter Blackburn',
  'ents': [{'ent': 'PER', 'start': 0, 'end': 15}]},
 {'string': 'BRUSSELS 1996-08-22',
  'ents': [{'ent': 'LOC', 'start': 0, 'end': 8}]},
 {'string': 'The European Commission said on Thursday it disagreed with German advice to consumers to shun British lamb until scientists determine whether mad cow disease can be transmitted to sheep .',
  'ents': [{'ent': 'ORG', 'start': 4, 'end': 23},
   {'ent': 'NORP', 'start': 59, 'end': 65},
   {'ent': 'NORP', 'start': 94, 'end': 101}]},
 {'string': "Germany's representative to the European Union's veterinary committee Werner Zwingmann said on Wednesday consumers should buy sheepmeat from countries other than Britain until the scientific advice was clearer .",
  'ents': [{'ent': 'LOC', 'start': 0, 'end': 6},
   {'ent

In [7]:
# Run the pipeline on the first 4000 test sentences and store, per sentence,
# the predicted entities as character spans in the same record layout as the
# ground truth ({"string": ..., "ents": [{"start", "end", "ent"}, ...]}).
spacy_ner_predictions = []

for sentence in tqdm(ner_testing_data[:4000]):
    text = sentence['string']
    predicted_ents = [
        {"start": span.start_char, "end": span.end_char, "ent": span.label_}
        for span in nlp(text).ents
    ]
    spacy_ner_predictions.append({"string": text, "ents": predicted_ents})

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/4000 [00:00<?, ?it/s]

In [8]:
# Preview the unfiltered predictions for the first five sentences.
spacy_ner_predictions[:5]

[{'string': 'EU rejects German call to boycott British lamb .',
  'ents': [{'start': 0, 'end': 2, 'ent': 'ORG'},
   {'start': 11, 'end': 17, 'ent': 'NORP'},
   {'start': 34, 'end': 41, 'ent': 'NORP'}]},
 {'string': 'Peter Blackburn',
  'ents': [{'start': 0, 'end': 15, 'ent': 'PERSON'}]},
 {'string': 'BRUSSELS 1996-08-22',
  'ents': [{'start': 9, 'end': 19, 'ent': 'DATE'}]},
 {'string': 'The European Commission said on Thursday it disagreed with German advice to consumers to shun British lamb until scientists determine whether mad cow disease can be transmitted to sheep .',
  'ents': [{'start': 0, 'end': 23, 'ent': 'ORG'},
   {'start': 32, 'end': 40, 'ent': 'DATE'},
   {'start': 59, 'end': 65, 'ent': 'NORP'},
   {'start': 94, 'end': 101, 'ent': 'NORP'}]},
 {'string': "Germany's representative to the European Union's veterinary committee Werner Zwingmann said on Wednesday consumers should buy sheepmeat from countries other than Britain until the scientific advice was clearer .",
  'ents'

In [9]:
def change_spacy_entities_names(ent_and_string):
    """Filter and rename predicted entities to the ground-truth label set.

    Only entities labelled ``ORG``, ``PERSON``, ``GPE`` or ``NORP`` are kept.
    ``PERSON`` is renamed to ``PER`` and ``GPE`` to ``LOC``; ``ORG`` and
    ``NORP`` keep their names. Every other label is dropped.

    The input record and its entity dicts are never modified: each kept entity
    is copied before its label is rewritten. This keeps the function idempotent
    when the calling cell is re-run on the same predictions. With in-place
    renaming, a second pass would see ``PER``/``LOC``, which are not accepted
    source labels, and silently drop those entities.

    Args:
        ent_and_string: dict with a ``"string"`` entry and an ``"ents"`` list of
            entity dicts whose label is stored under ``"ent"``.

    Returns:
        A new dict with the same ``"string"`` and a new ``"ents"`` list holding
        renamed copies of the kept entities, in their original order.
    """
    # Accepted source label -> label used by the ground truth.
    label_map = {"ORG": "ORG", "PERSON": "PER", "GPE": "LOC", "NORP": "NORP"}
    return {
        "string": ent_and_string["string"],
        "ents": [
            # {**ent, ...} makes a shallow copy, so the caller's dict is untouched.
            {**ent, "ent": label_map[ent["ent"]]}
            for ent in ent_and_string["ents"]
            if ent["ent"] in label_map
        ],
    }

In [10]:
# Keep only the comparable entity types and rename them to the ground-truth labels.
filtered_spacy_ner_predictions = list(map(change_spacy_entities_names, spacy_ner_predictions))

In [11]:
# Preview the first five prediction records after filtering and renaming.
filtered_spacy_ner_predictions[:5]

[{'string': 'EU rejects German call to boycott British lamb .',
  'ents': [{'start': 0, 'end': 2, 'ent': 'ORG'},
   {'start': 11, 'end': 17, 'ent': 'NORP'},
   {'start': 34, 'end': 41, 'ent': 'NORP'}]},
 {'string': 'Peter Blackburn',
  'ents': [{'start': 0, 'end': 15, 'ent': 'PER'}]},
 {'string': 'BRUSSELS 1996-08-22', 'ents': []},
 {'string': 'The European Commission said on Thursday it disagreed with German advice to consumers to shun British lamb until scientists determine whether mad cow disease can be transmitted to sheep .',
  'ents': [{'start': 0, 'end': 23, 'ent': 'ORG'},
   {'start': 59, 'end': 65, 'ent': 'NORP'},
   {'start': 94, 'end': 101, 'ent': 'NORP'}]},
 {'string': "Germany's representative to the European Union's veterinary committee Werner Zwingmann said on Wednesday consumers should buy sheepmeat from countries other than Britain until the scientific advice was clearer .",
  'ents': [{'start': 0, 'end': 7, 'ent': 'LOC'},
   {'start': 28, 'end': 48, 'ent': 'ORG'},
   

In [12]:
def get_size_intersection(ent_1, ent_2):
    """Return the length of the overlap between two spans, or 0 if they are disjoint.

    Both arguments are dicts with ``"start"`` and ``"end"`` entries.
    """
    overlap_start = max(ent_1["start"], ent_2["start"])
    overlap_end = min(ent_1["end"], ent_2["end"])
    overlap = overlap_end - overlap_start
    # A non-positive difference means the spans do not overlap.
    return overlap if overlap > 0 else 0

def find_one_best_match(ent_1, list_of_entities_2):
    """Find the same-label entity in ``list_of_entities_2`` that overlaps ``ent_1`` most.

    Only candidates whose ``'ent'`` label equals that of ``ent_1`` are
    considered, and a candidate must overlap by more than zero to count.
    On ties the earliest candidate is kept.

    Returns:
        ``(index, overlap)`` of the best candidate, or ``(-1, 0)`` when no
        same-label candidate overlaps ``ent_1``.
    """
    best_index, best_overlap = -1, 0
    for position, candidate in enumerate(list_of_entities_2):
        if candidate['ent'] != ent_1['ent']:
            continue
        overlap = get_size_intersection(ent_1, candidate)
        if overlap > best_overlap:
            best_index, best_overlap = position, overlap
    return best_index, best_overlap

def get_size_all_entities(list_of_entities):
    """Return the summed span length (``end - start``) of every entity in the list."""
    return sum(ent["end"] - ent["start"] for ent in list_of_entities)

def jacquard_metrics(list_of_entities_1, list_of_entities_2):
    """Compare two entity lists for one sentence with an overlap-based Jaccard score.

    Each entity of list 1, in order, is greedily paired with the same-label
    entity of list 2 that overlaps it most. A paired list-2 entity is removed
    so it cannot be matched again. Both inputs are deep-copied first, so the
    caller's lists are left untouched.

    Returns:
        A five-element list:
        0. intersection / union (1 when the union is empty),
        1. total overlap size of the matched pairs,
        2. union size: sum of all span lengths in both lists minus the overlap,
        3. matched pairs / number of list-1 entities (1 when list 1 is empty),
        4. matched pairs / number of list-2 entities (1 when list 2 is empty).
    """
    reference = copy.deepcopy(list_of_entities_1)
    candidates = copy.deepcopy(list_of_entities_2)

    # Sizes and counts are taken before any candidate is removed.
    n_reference, n_candidates = len(reference), len(candidates)
    total_size = get_size_all_entities(reference) + get_size_all_entities(candidates)

    shared_size = 0
    n_matched = 0
    for ref_ent in reference:
        best_index, best_overlap = find_one_best_match(ref_ent, candidates)
        if best_index < 0:
            continue
        shared_size += best_overlap
        n_matched += 1
        del candidates[best_index]

    # |A u B| = |A| + |B| - |A n B|
    union_size = total_size - shared_size

    jaccard = shared_size / union_size if union_size > 0 else 1
    matched_ratio_1 = n_matched / n_reference if n_reference > 0 else 1
    matched_ratio_2 = n_matched / n_candidates if n_candidates > 0 else 1
    return [jaccard, shared_size, union_size, matched_ratio_1, matched_ratio_2]

In [13]:
# One metrics row per sentence, with ground truth as list 1 and predictions as list 2.
# zip stops at the shorter input, so only sentences that have predictions are scored.
list_of_jacquard = [
    jacquard_metrics(gt_ent['ents'], spacy_ent['ents'])
    for gt_ent, spacy_ent in zip(ner_testing_data, filtered_spacy_ner_predictions)
]

In [16]:
# Column 0: per-sentence intersection / union, averaged over sentences.
np.array(list_of_jacquard)[:, 0].mean()

0.6937174152441185

In [17]:
# Pooled score: total overlap size (column 1) over total union size (column 2).
np.array(list_of_jacquard)[:, 1].sum() / np.array(list_of_jacquard)[:, 2].sum()

0.6506754703428461

In [18]:
# Column 3: share of ground-truth entities that were matched, averaged over sentences.
np.array(list_of_jacquard)[:, 3].mean()

0.7141357697245564

In [19]:
# Column 4: share of predicted entities that were matched, averaged over sentences.
np.array(list_of_jacquard)[:, 4].mean()

0.917631001984127