In [None]:
!pip install ../../CognitiveServices/API-TextAnalytics-NER.CloudServices/models/supplemental/en_ta_lg-2.2.5.tar.gz

In [26]:
# Enable the autoreload extension in mode 2 so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [50]:
import en_ta_lg
import copy
from typing import Dict, List
import spacy
from recon import Dataset
from recon.constants import NONE
from recon.types import Example
from recon.stats import ner_stats
from recon.insights import ents_by_label, get_label_disparities, top_prediction_errors
from recon.recognizer import SpacyEntityRecognizer

In [3]:
# Load the NER dataset snapshot (2020-02-24) from disk into a recon Dataset.
ds = Dataset.from_disk('../../CognitiveServices/API-TextAnalytics-NER.CloudServices/data/2020-02-24/')

In [4]:
# Run ner_stats over the dataset; with serialize=True the stats are printed as
# JSON (see the recorded output) rather than returned.
ds.apply(ner_stats, serialize=True)

{
    "n_examples":130000,
    "n_examples_no_entities":7,
    "ents_per_type":{
        "AGE":2975,
        "LOCATION":52207,
        "PERSONTYPE":68950,
        "SET":1493,
        "GPE":82970,
        "DATERANGE":41853,
        "EVENT":20008,
        "NUMBER":18465,
        "ORGANIZATION":56205,
        "PERSON":92446,
        "PRODUCT":17997,
        "DATE":13169,
        "PERCENTAGE":3688,
        "DURATION":8537,
        "ADDRESS":2914,
        "CURRENCY":4336,
        "TIME":734,
        "TIMERANGE":650,
        "DATETIMERANGE":644,
        "NUM_RANGE":317
    }
}
{
    "n_examples":10000,
    "n_examples_no_entities":0,
    "ents_per_type":{
        "EVENT":1553,
        "PERSON":7044,
        "GPE":6531,
        "LOCATION":4047,
        "PERSONTYPE":5230,
        "ORGANIZATION":4316,
        "NUMBER":1401,
        "PRODUCT":1333,
        "DATERANGE":3200,
        "TIME":59,
        "DATE":1054,
        "DURATION":618,
        "AGE":232,
        "TIMERANGE":65,
        "CURRENC

{'train': None, 'dev': None, 'test': None, 'all': None}

In [68]:
def top_label_disparities(data, use_lower=True):
    """Rank pairs of labels by how many entity texts they have in common.

    Args:
        data: Examples to analyse; passed straight through to `ents_by_label`.
        use_lower: Forwarded to `ents_by_label` as `use_lower`.

    Returns:
        A list of dicts with keys 'label1', 'label2' (the pair, sorted
        alphabetically) and 'n_disparities' (number of distinct entries that
        appear under both labels), ordered by 'n_disparities' descending.
        Pairs with no overlap are omitted.
    """
    annotations = ents_by_label(data, use_lower=use_lower)

    # Build each label's set of entries once instead of on every comparison.
    entries_by_label = {label: set(entries) for label, entries in annotations.items()}
    labels = list(entries_by_label)

    label_disparities = []
    # Visit every unordered pair exactly once; (a, b) and (b, a) give the same overlap.
    for position, label1 in enumerate(labels):
        for label2 in labels[position + 1:]:
            n_disparities = len(entries_by_label[label1] & entries_by_label[label2])
            if n_disparities > 0:
                first_label, second_label = sorted((label1, label2))
                label_disparities.append({
                    'label1': first_label,
                    'label2': second_label,
                    'n_disparities': n_disparities
                })

    return sorted(label_disparities, key=lambda row: row['n_disparities'], reverse=True)

In [69]:
# Rank label pairs over `ds.all` by the number of entity texts they share.
top_label_disparities(ds.all)

[{'label1': 'GPE', 'label2': 'LOC', 'n_disparities': 2803},
 {'label1': 'GPE', 'label2': 'PERSON', 'n_disparities': 1652},
 {'label1': 'LOC', 'label2': 'ORG', 'n_disparities': 1195},
 {'label1': 'ORG', 'label2': 'PERSON', 'n_disparities': 1089},
 {'label1': 'GPE', 'label2': 'ORG', 'n_disparities': 1005},
 {'label1': 'LOC', 'label2': 'PERSON', 'n_disparities': 619},
 {'label1': 'PERSON', 'label2': 'PERSONTYPE', 'n_disparities': 600},
 {'label1': 'ORG', 'label2': 'PERSONTYPE', 'n_disparities': 564},
 {'label1': 'ORG', 'label2': 'PRODUCT', 'n_disparities': 546},
 {'label1': 'DATE', 'label2': 'DATERANGE', 'n_disparities': 481},
 {'label1': 'DATERANGE', 'label2': 'DURATION', 'n_disparities': 388},
 {'label1': 'PERSON', 'label2': 'PRODUCT', 'n_disparities': 366},
 {'label1': 'GPE', 'label2': 'PERSONTYPE', 'n_disparities': 354},
 {'label1': 'LOC', 'label2': 'PRODUCT', 'n_disparities': 325},
 {'label1': 'EVENT', 'label2': 'ORG', 'n_disparities': 288},
 {'label1': 'DATERANGE', 'label2': 'NUMBER

In [70]:
# Drill into one pair: the entity texts annotated as both DATERANGE and GPE.
get_label_disparities(ds.all, 'DATERANGE', 'GPE')

{'1150',
 '1402',
 '1582',
 '1815',
 '1825',
 '1861',
 '1863',
 '1882',
 '1899',
 '1903',
 '1904',
 '1925',
 '1929',
 '1949',
 '1955',
 '1960',
 '1966',
 '1972',
 '1987',
 '1991',
 '1992',
 '1993',
 '1996',
 '2006',
 '41',
 'achaemenid empire',
 'and',
 'april',
 'august',
 'australia',
 'byzantine',
 'edo',
 'europe',
 'guilford',
 'holiday',
 'indus valley civilisation',
 'mexico',
 'nara',
 'new york',
 'ottoman empire',
 'puranas',
 'roman empire',
 'shanghai',
 'song',
 'sydney',
 'twentieth century',
 'united states',
 'weeks'}

In [8]:
def rename_labels(data: List[Example], label_map: Dict[str, str]):
    """Return a deep copy of `data` with span labels rewritten via `label_map`.

    Labels that are not keys of `label_map` are kept as they are. The input
    examples are not modified.
    """
    renamed_examples = copy.deepcopy(data)
    every_span = (span for example in renamed_examples for span in example.spans)
    for span in every_span:
        current_label = span.label
        span.label = label_map.get(current_label, current_label)
    return renamed_examples
            
# Long label names in the dataset mapped to the short names used from here on.
label_map = {
    'PERSONNAME': 'PERSON',
    'ORGANIZATION': 'ORG',
    'LOCATION': 'LOC',
    'PERCENTAGE': 'PERCENT',
    'CURRENCY': 'MONEY'
}
# Apply the renaming to the dataset in place. This previously referenced
# `map_labels`, which is not defined in this notebook; the helper is `rename_labels`.
ds.apply_(rename_labels, label_map)

In [9]:
# Load the installed en_ta_lg model and wrap it in recon's SpacyEntityRecognizer
# so it can be passed to the prediction-error helpers.
nlp = en_ta_lg.load()
recognizer = SpacyEntityRecognizer(nlp)

In [71]:
# Collect the recognizer's prediction errors on the test split.
# NOTE(review): the recorded outputs show 1027 results and a first error whose
# true_label is NUMBER, so `n=1000` / `labels=["DATERANGE"]` may not cap or
# filter the way the call suggests — confirm against top_prediction_errors.
top_errors = top_prediction_errors(recognizer, ds.test, n=1000, labels=["DATERANGE"], verbose=True)

In [72]:
# How many error entries came back.
len(top_errors)

1027

In [73]:
# Inspect the first error entry (text, true/predicted label, count, examples).
top_errors[0]

PredictionError(text='one', true_label='NUMBER', pred_label='__NONE__', count=21, examples=[Example(text='He\'s gotten letters from people commenting on the site, such as one from a local college professor who was teaching in Moscow, who wrote that "it was nice and reassuring to be able to pull up the Channel 8 logo, like a bridge back home."', spans=[TextSpanLabel(text='one', start=64, end=67, label='NUMBER'), TextSpanLabel(text='college professor', start=81, end=98, label='PERSONTYPE'), TextSpanLabel(text='Moscow', start=119, end=125, label='GPE'), TextSpanLabel(text='Channel 8', start=196, end=205, label='ORG'), TextSpanLabel(text='bridge', start=219, end=225, label='LOC')], meta={'source': 'Cognitive Services Training Set'}), Example(text='Berry writes and designs the newsletter herself, and each issue "is usually built around one article of current interest to working mothers," she said.', spans=[TextSpanLabel(text='Berry', start=0, end=5, label='PERSON'), TextSpanLabel(text='one'

In [64]:
# Partition the errors in one pass: entries predicted as NONE go to `fn`,
# every other entry goes to `fp`.
fn, fp = [], []
for error in top_errors:
    bucket = fn if error.pred_label == NONE else fp
    bucket.append(error)
len(fn), len(fp)

(805, 222)

In [62]:
# Print every example attached to the third `fp` entry, followed by any of the
# example's spans whose text and label equal that entry's text and pred_label.
# NOTE(review): the recorded output shows no span lines and no separator rows,
# so it may predate this version of the cell — confirm by re-running.
selected_error = fp[2]
for example in selected_error.examples:
    print("EXAMPLE: ", example.text)
    matching_spans = [
        span for span in example.spans
        if span.text == selected_error.text and span.label == selected_error.pred_label
    ]
    for span in matching_spans:
        print(span)

    print('=' * 100)
    print()

EXAMPLE:  Even while K-C was offering Huggies for Him and Huggies for Her, it had a Huggies white thick product on the market that it promoted for nighttime use.

EXAMPLE:  Even while K-C was offering Huggies for Him and Huggies for Her, it had a Huggies white thick product on the market that it promoted for nighttime use.

EXAMPLE:  P&G submitted laboratory and consumer tests and K-C provided its in-house research plus the results of external tests while the ASA commissioned independent expert advice.

EXAMPLE:  With the earlier acquisition of Zisoft Bobi a.s., a Czech Republic diaper manufacturer with manufacturing in Jaromer, Czech Republic, K-C also markets "Bobi" economy diapers in that country and in Slovakia and Huggies economy diapers in Russia, Romania, Croatia, Slovenia and the Baltic States.

EXAMPLE:  K-C's nonwovens business unit--formerly known as theNonwoven Fabrics business--changed its name last year to Kimberly Clark Partnership Products. The new name is a reflection 