Evaluate Spacy models for person names, orgs and locations using the Presidio Evaluator framework

Data = `test_February_28_2020`

In [15]:
import spacy

from presidio_evaluator import ModelEvaluator
from presidio_evaluator.data_generator import read_synth_dataset
%reload_ext autoreload
%autoreload 2



Select data for evaluation:

In [16]:
synth_samples = read_synth_dataset("../../data/synth_dataset.json")
print(len(synth_samples))
DATASET = synth_samples

300


In [17]:
from collections import Counter
entity_counter = Counter()
for sample in DATASET:
    for span in sample.spans:
        entity_counter[span.entity_type]+=1

In [18]:
entity_counter

Counter({'PERSON': 174,
         'CREDIT_CARD': 49,
         'LOCATION': 75,
         'ORGANIZATION': 48,
         'US_SSN': 1,
         'EMAIL': 11,
         'BIRTHDAY': 4,
         'TITLE': 4,
         'URL': 8,
         'PHONE_NUMBER': 9,
         'IP_ADDRESS': 3,
         'IBAN': 3,
         'NATIONALITY': 1})

In [19]:
DATASET[1]

Full text: Kotoya Negishi listed his top 20 songs for Entertainment Weekly and had the balls to list this song at #15. (What did he put at #1 you ask? Answer:"Tube Snake Boogie" by Frank Strauser â€“ go figure)
Spans: [Type: PERSON, value: Kotoya Negishi, start: 0, end: 14, Type: PERSON, value: Frank Strauser, start: 170, end: 184]
Tokens: [Kotoya, Negishi, listed, his, top, 20, songs, for, Entertainment, Weekly, and, had, the, balls, to, list, this, song, at, #, 15, ., (, What, did, he, put, at, #, 1, you, ask, ?, Answer:"Tube, Snake, Boogie, ", by, Frank, Strauser, â€, “, go, figure, )]
Tags: ['B-PERSON', 'L-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'L-PERSON', 'O', 'O', 'O', 'O', 'O']

In [20]:
#max length sentence
max([len(sample.tokens) for sample in DATASET])

79

Select models for evaluation:

In [21]:
models = []

en_core_web_lg = r"en_core_web_lg"
#spacy_new_ontonotes28 = r"C:\Users\ommendel\OneDrive - Microsoft\Projects\presidio\Presidio-internal\presidio-evaluator\models\spacy_new_ontonotes28"

#spacy_ft_100 = r"C:\Users\ommendel\OneDrive - Microsoft\Projects\presidio\Presidio-internal\presidio-evaluator\models\spacy_ft_100\model-final"

models = [en_core_web_lg]

Run evaluation on all models:

In [22]:
from presidio_evaluator.spacy_evaluator import SpacyEvaluator

for model in models:
    print("-----------------------------------")
    print("Evaluating model {}".format(model))
    nlp = spacy.load(model)
    spacy_evaluator = SpacyEvaluator(model=nlp,entities_to_keep=['PERSON','GPE','ORG'])
    evaluation_results = spacy_evaluator.evaluate_all(DATASET)
    scores = spacy_evaluator.calculate_score(evaluation_results)
    
    print("Confusion matrix:")
    print(scores.results)

    print("Precision and recall")
    scores.print()
    errors = scores.model_errors

-----------------------------------
Evaluating model en_core_web_lg
Translating entites using this dictionary: {'ORGANIZATION': 'ORG', 'COUNTRY': 'GPE', 'CITY': 'GPE', 'LOCATION': 'GPE', 'PERSON': 'PERSON', 'FIRST_NAME': 'PERSON', 'LAST_NAME': 'PERSON', 'NATION_MAN': 'GPE', 'NATION_WOMAN': 'GPE', 'NATION_PLURAL': 'GPE', 'NATIONALITY': 'GPE', 'GPE': 'GPE', 'ORG': 'ORG'}
Confusion matrix:
Counter({('O', 'O'): 4793, ('PERSON', 'PERSON'): 240, ('GPE', 'O'): 202, ('ORG', 'ORG'): 52, ('ORG', 'O'): 45, ('GPE', 'GPE'): 37, ('GPE', 'PERSON'): 33, ('GPE', 'ORG'): 32, ('O', 'ORG'): 28, ('PERSON', 'ORG'): 22, ('ORG', 'PERSON'): 20, ('PERSON', 'O'): 13, ('PERSON', 'GPE'): 6, ('O', 'GPE'): 5, ('O', 'PERSON'): 4, ('ORG', 'GPE'): 2})
Precision and recall
                        Entity                     Precision                        Recall
                        PERSON                        85.41%                        80.81%
                           GPE                        12.17%         

Evaluating <class 'presidio_evaluator.spacy_evaluator.SpacyEvaluator'>: 100%|██████████| 300/300 [00:05<00:00, 57.59it/s]


Custom evaluation

In [None]:
#evaluate custom sentences
#nlp = spacy.load(spacy_ft_100)


### Results analysis

In [23]:
#sent = input("Enter sentence: ")
sent = 'David is talking loudly'
doc = nlp(sent)
for ent in doc.ents:
    print("Entity = {} value = {}".format(ent.label_,ent.text))

Entity = PERSON value = David


#### False positives

1. Most false positive tokens:

In [24]:
ModelEvaluator.most_common_fp_tokens(errors)#[model_error for model_error in errors if model_error.error_type =='FP']

Most common false positive tokens:
[('Entertainment', 6), ('Weekly', 6), ('Texas', 4), ('the', 2), ('Fuse', 2), ('TV', 2), ("'s", 2), ('IBAN', 2), ('RR', 2), ('Zoolander', 1)]
Example sentence with each FP token:
Kotoya Negishi listed his top 20 songs for Entertainment Weekly and had the balls to list this song at #15. (What did he put at #1 you ask? Answer:"Tube Snake Boogie" by Frank Strauser â€“ go figure)
Kotoya Negishi listed his top 20 songs for Entertainment Weekly and had the balls to list this song at #15. (What did he put at #1 you ask? Answer:"Tube Snake Boogie" by Frank Strauser â€“ go figure)
Celebrating its 10th year in Villa de Ves, Trak Auto is a 501(c)3 that invites songwriters from around the world to Texas to share the universal language of music in collaborations designed to bridge cultures, build friendships and cultivate peace.
mission statement: this non-profit founded by radio executives "serves as an advocate for the value of music" and "supports its songwriter

In [25]:
fps_df = ModelEvaluator.get_fps_dataframe(errors,entity=['PERSON'])
fps_df[['full_text','token','prediction']]

Unnamed: 0,full_text,token,prediction
0,"When they weren't singing about Hobbits, satan...",'s,PERSON
1,"When they weren't singing about Hobbits, satan...",'s,PERSON
2,zoolander is a 2001 american action-comedy fil...,starring,PERSON
3,This song by ex-Zombie Mazzi is a perfect exam...,Zombie,PERSON


2. False negative examples

In [26]:
errors = scores.model_errors
ModelEvaluator.most_common_fn_tokens(errors,n=50, entity=['PERSON'])

[('Szemere', 2), ('Avtorhan', 1), ('Riitta', 1), ('Herceg', 1), ('Tiegan', 1), ('Hill', 1), ('Liviana', 1), ('Palerma', 1), ('Nusa', 1), ('Weress', 1), ('Klimek', 1), ('Bárður', 1), ('Victoria', 1), ('Charlotte', 1), ('Park', 1), ('Nestor', 1), ('Bethany', 1), ('tryggvadóttir', 1), ('Catalina', 1), ('searlait', 1), ('Annear', 1), ('Mie', 1), ('Spartacus', 1), ('Innocent', 1), ('Houžvičková', 1), ('MacMahon', 1), ('Jaroslav', 1), ('Čechová', 1), ('wacława', 1), ('sobczak', 1), ('Pratt', 1), ('Hutinović', 1), ('Louelle', 1), ('van', 1), ('den', 1), ('Brandhof', 1), ('Szakács', 1), ('Signe', 1), ('joar', 1), ('sandberg', 1)]
Token: Szemere, Annotation: PERSON, Full text: You can tell Szemere was a huge Szemere Szakács fan. Written when he was only 14.
Token: Avtorhan, Annotation: PERSON, Full text: Avtorhan, can I please speak to your boss?
Token: Riitta, Annotation: PERSON, Full text: The true gender of Riitta has been under debate for years, but the riff and building energy is a rock ma

More FN analysis

In [27]:
fns_df = ModelEvaluator.get_fns_dataframe(errors,entity=['GPE'])

In [28]:
fns_df[['full_text','token','annotation','prediction']]

Unnamed: 0,full_text,token,annotation,prediction
0,The address of Coon Chicken Inn is ul. Zuchów ...,ul,GPE,O
1,The address of Coon Chicken Inn is ul. Zuchów ...,.,GPE,O
2,The address of Coon Chicken Inn is ul. Zuchów ...,Zuchów,GPE,PERSON
3,The address of Coon Chicken Inn is ul. Zuchów ...,65,GPE,O
4,The address of Coon Chicken Inn is ul. Zuchów ...,",",GPE,O
...,...,...,...,...
262,May I request to have the statement sent to 47...,Moatsou,GPE,O
263,May I request to have the statement sent to 47...,Street,GPE,O
264,May I request to have the statement sent to 47...,",",GPE,O
265,May I request to have the statement sent to 47...,Trozaina,GPE,O


In [29]:
[print(error,"\n") for error in errors]

type: FP, Annotation = O, prediction = ORG, Token = Entertainment, Full text = Kotoya Negishi listed his top 20 songs for Entertainment Weekly and had the balls to list this song at #15. (What did he put at #1 you ask? Answer:"Tube Snake Boogie" by Frank Strauser â€“ go figure), Metadata = {'Gender': 'male', 'NameSet': 'American', 'Country': 'France', 'Lowercase': False, 'Template#': 103} 

type: FP, Annotation = O, prediction = ORG, Token = Weekly, Full text = Kotoya Negishi listed his top 20 songs for Entertainment Weekly and had the balls to list this song at #15. (What did he put at #1 you ask? Answer:"Tube Snake Boogie" by Frank Strauser â€“ go figure), Metadata = {'Gender': 'male', 'NameSet': 'American', 'Country': 'France', 'Lowercase': False, 'Template#': 103} 

type: FP, Annotation = O, prediction = GPE, Token = Texas, Full text = Celebrating its 10th year in Villa de Ves, Trak Auto is a 501(c)3 that invites songwriters from around the world to Texas to share the universal lan

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,