In [1]:
from recon import Dataset
from recon.types import Example, Span
from recon.operations.core import op_registry

from recon.operations.tokenization import add_tokens

In [2]:
# Two hand-built examples in which the same surface text ("Dallas") carries
# different labels. The span offsets line up with Python slices of `text`:
# text[19:25] in the first sentence and text[0:6] in the second are "Dallas".
person_example = Example(
    text="My friend is named Dallas.",
    spans=[
        Span(text="Dallas", start=19, end=25, label="PERSON"),
    ],
)
gpe_example = Example(
    text="Dallas is a city in Texas.",
    spans=[
        Span(text="Dallas", start=0, end=6, label="GPE"),
    ],
)

In [3]:
# Bare expression as the last line of the cell: the notebook displays the
# object's repr.
person_example

Example: "My friend is named Dallas.", 1 span.

In [4]:
# print() shows the str() form of the example.
# NOTE(review): the recorded outputs differ ("1 span." for the repr in the
# previous cell, "1 spans." here), so Example presumably pluralizes
# differently in __repr__ and __str__ — confirm in recon.types.
print(person_example)

Example: "My friend is named Dallas.", 1 spans.


In [5]:
# Wrap both examples in a Dataset named "DallasExamples", then run the
# operation identified by the string "recon.add_tokens.v1" over it.
# NOTE(review): the trailing underscore on apply_ suggests the dataset is
# modified in place rather than a new one being returned — confirm.
ds = Dataset("DallasExamples", [person_example, gpe_example])
ds.apply_("recon.add_tokens.v1")

=> Applying operation 'recon.add_tokens.v1' to dataset 'DallasExamples'


  0%|                                                                                                  | 0/2 [00:00<?, ?it/s]

[38;5;4mℹ      => Running preprocessor recon.spacy.v1[0m



  0%|                                                                                                  | 0/2 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 11.21it/s][A

[38;5;2m✔ Completed operation 'recon.add_tokens.v1'[0m





In [6]:
# Print the dataset summary: its name plus example/annotation counts,
# including a per-label breakdown.
print(ds)

Dataset
Name: DallasExamples
Stats: {
    "n_examples": 2,
    "n_examples_no_entities": 0,
    "n_annotations": 2,
    "n_annotations_per_type": {
        "PERSON": 1,
        "GPE": 1
    }
}


In [7]:
# Call show() on each of the two examples stored in the dataset.
# NOTE(review): no output was captured for this cell in this export;
# show() presumably renders a rich (non-text) view — confirm.
ds.data[0].show()
ds.data[1].show()

In [8]:
# Rebind person_example to the first example stored in the dataset (this
# runs after the add_tokens operation was applied), then print its text
# with the annotated span highlighted inline next to its label.
person_example = ds.data[0]
person_example.pretty_print()

My friend is named [38;5;16;48;5;222m Dallas [0m[38;5;16;48;5;141m PERSON [0m.


In [9]:
from recon import Corpus
# Load a corpus from a directory relative to the notebook; later cells read
# its `all`, `test` and `train` attributes.
# NOTE(review): the second argument is presumably the corpus name — confirm
# against Corpus.from_disk.
corpus = Corpus.from_disk("./data/conll2003", "conll2003")

In [7]:
# Scan every example in the corpus for spans whose text is "china"
# (ignoring case). Each such span's label is printed; the whole example is
# printed as well whenever that label is anything other than "LOC".
for example in corpus.all:
    for span in example.spans:
        if span.text.lower() != "china":
            continue

        print(span.label)

        if span.label == "LOC":
            continue
        print(example)

LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
PER
Example: "SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT . ", 2 spans.
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC
LOC


In [10]:
from recon.insights import get_label_disparities


# Library version of get_label_disparities (imported from recon.insights in
# this cell): displays the set of span texts found under both the PER and
# the LOC label in corpus.test.
get_label_disparities(corpus.test, "PER", "LOC")

{'china', 'santiago'}

In [11]:
from typing import List, DefaultDict, Set
from collections import defaultdict

def get_ents_by_label(
    data: List[Example], case_sensitive: bool = False
) -> DefaultDict[str, DefaultDict[str, Set[Example]]]:
    """Index annotated span texts by label, keeping the examples they occur in.

    Every span of every example is recorded under its label and its text, so
    ``result[label][text]`` is the set of examples containing at least one
    span with that text and that label.

    TODO: Return richer per-annotation information: map each label to
    AnnotationCount-like objects holding the annotation text, the total number
    of times it is mentioned (what entity_coverage computes) and the examples
    it appears in. A variant returning only the set of texts per label may
    need to be kept for compatibility.

    Args:
        data (List[Example]): List of examples. Examples must be hashable
            because they are stored in sets.
        case_sensitive (bool, optional): Consider case of text for each
            annotation. When False (the default) the span text is lowercased
            before being used as a key.

    Returns:
        DefaultDict[str, DefaultDict[str, Set[Example]]]: Nested defaultdict
            mapping label -> span text -> set of examples. Because both
            levels are defaultdicts, looking up a missing label or text
            returns (and inserts) an empty container instead of raising
            KeyError.
    """
    annotations: DefaultDict[str, DefaultDict[str, Set[Example]]] = defaultdict(
        lambda: defaultdict(set)
    )

    for example in data:
        for span in example.spans:
            span_text = span.text if case_sensitive else span.text.lower()
            annotations[span.label][span_text].add(example)

    return annotations


def get_label_disparities(
    data: List[Example], label1: str, label2: str, case_sensitive: bool = False
) -> DefaultDict[str, List[Example]]:
    """Find span texts annotated with both labels and collect their examples.

    A span text is a "disparity" when it is annotated as label1 in at least
    one place and as label2 in at least one place (possibly in the same
    example).

    Args:
        data (List[Example]): Input List of examples
        label1 (str): First label to compare
        label2 (str): Second label to compare
        case_sensitive (bool, optional): Consider case of text for each
            annotation

    Returns:
        DefaultDict[str, List[Example]]: Maps every span text found under
            both label1 and label2 to the examples that contain it: first the
            examples where it is labelled label1, then the examples where it
            is labelled label2 that were not already listed, so no example
            appears twice for the same text. Being a defaultdict(list),
            looking up a text that is not a disparity yields an empty list.
    """
    annotations = get_ents_by_label(data, case_sensitive=case_sensitive)
    examples_by_text1 = annotations[label1]
    examples_by_text2 = annotations[label2]

    output: DefaultDict[str, List[Example]] = defaultdict(list)
    # Walk label1's texts in insertion order (rather than a set intersection)
    # so the key order of the result is the same on every run.
    for text, label1_examples in examples_by_text1.items():
        # `in` does not trigger the defaultdict factory, so this check does
        # not add empty entries to the index.
        if text not in examples_by_text2:
            continue
        output[text].extend(label1_examples)
        # An example can carry the same text under both labels; list it once.
        output[text].extend(
            example
            for example in examples_by_text2[text]
            if example not in label1_examples
        )

    return output
            

In [12]:
# Build the label -> span text -> examples index for corpus.test with the
# notebook-local helper (case-insensitive by default).
ebl = get_ents_by_label(corpus.test)

In [13]:
# Examples in corpus.test where "china" (lowercased key) is labelled PER.
ebl["PER"]["china"]

{Example: "SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT . ", 2 spans.}

In [18]:
# Uses the notebook-local get_label_disparities (it shadows the one imported
# from recon.insights), which maps each disputed text to its examples.
# Pretty-print every corpus.test example where "china" is labelled PER or
# LOC, with a 100-dash rule after each one.
for e in get_label_disparities(corpus.test, "PER", "LOC")["china"]:
    e.pretty_print()
    print("-" * 100)

SOCCER - [38;5;16;48;5;222m JAPAN [0m[38;5;16;48;5;141m LOC [0m GET LUCKY WIN , [38;5;16;48;5;222m CHINA [0m[38;5;16;48;5;141m PER [0m IN SURPRISE DEFEAT . 
----------------------------------------------------------------------------------------------------
[38;5;16;48;5;222m China [0m[38;5;16;48;5;141m LOC [0m 's [38;5;16;48;5;222m State Council [0m[38;5;16;48;5;141m ORG [0m , or cabinet , has given a port in the southern province of [38;5;16;48;5;222m Hainan [0m[38;5;16;48;5;141m LOC [0m permission to open to foreign vessels , the [38;5;16;48;5;222m Xinhua [0m[38;5;16;48;5;141m ORG [0m news agency said on Friday . 
----------------------------------------------------------------------------------------------------
Mana 50,000 tonnes soybeans USG / [38;5;16;48;5;222m China [0m[38;5;16;48;5;141m LOC [0m 10-15/12 $ 23.50 10,000 / 4,000 GeePee . 
----------------------------------------------------------------------------------------------------
[38;5;16;48;5

In [23]:
from recon.insights import top_label_disparities


# Display the LabelDisparity records computed for corpus.test.
# NOTE(review): the recorded output lists every label pair in both orders
# and with an empty `examples` list; the PER/LOC count of 2 matches the
# two-element set returned earlier by recon.insights.get_label_disparities,
# so `count` is presumably the number of shared span texts — confirm.
top_label_disparities(corpus.test)

[LabelDisparity(label1='LOC', label2='ORG', count=61, examples=[]),
 LabelDisparity(label1='ORG', label2='LOC', count=61, examples=[]),
 LabelDisparity(label1='LOC', label2='PER', count=2, examples=[]),
 LabelDisparity(label1='LOC', label2='MISC', count=2, examples=[]),
 LabelDisparity(label1='PER', label2='LOC', count=2, examples=[]),
 LabelDisparity(label1='PER', label2='ORG', count=2, examples=[]),
 LabelDisparity(label1='MISC', label2='LOC', count=2, examples=[]),
 LabelDisparity(label1='ORG', label2='PER', count=2, examples=[]),
 LabelDisparity(label1='PER', label2='MISC', count=1, examples=[]),
 LabelDisparity(label1='MISC', label2='PER', count=1, examples=[]),
 LabelDisparity(label1='MISC', label2='ORG', count=1, examples=[]),
 LabelDisparity(label1='ORG', label2='MISC', count=1, examples=[])]

In [32]:
# Pretty-print every corpus.test example where "utah" is labelled LOC or
# ORG (notebook-local get_label_disparities), with a blank line after each.
for e in get_label_disparities(corpus.test, "LOC", "ORG")["utah"]:
    e.pretty_print()
    print()

[38;5;16;48;5;222m MINNESOTA [0m[38;5;16;48;5;141m ORG [0m AT [38;5;16;48;5;222m UTAH [0m[38;5;16;48;5;141m LOC [0m 

[38;5;16;48;5;222m UTAH [0m[38;5;16;48;5;141m ORG [0m AT [38;5;16;48;5;222m DENVER [0m[38;5;16;48;5;141m LOC [0m 

[38;5;16;48;5;222m UTAH [0m[38;5;16;48;5;141m ORG [0m 15 2 .882 1/2 

[38;5;16;48;5;222m UTAH [0m[38;5;16;48;5;141m ORG [0m 106 [38;5;16;48;5;222m Minnesota [0m[38;5;16;48;5;141m ORG [0m 95 

[38;5;16;48;5;222m UTAH [0m[38;5;16;48;5;141m ORG [0m 14 2 .875 1 



In [38]:
# Show only the span texts labelled both LOC and ORG in corpus.train, i.e.
# the keys of the mapping returned by the notebook-local helper.
get_label_disparities(corpus.train, "LOC", "ORG").keys()

dict_keys(['lausanne', 'wimbledon', 'st helens', 'auckland', 'brisbane', 'antwerp', 'zurich', 'southampton', 'essex', 'leicester', 'milan', 'nice', 'manchester united', 'wall street', 'doetinchem', 'bremen', 'montreal', 'taibe', 'america', 'charleroi', 'heerenveen', 'genoa', 'verona', 'cologne', 'new york', 'seattle', 'hamburg', 'lincoln', 'detroit', 'minnesota', 'monaco', 'colorado', 'sierra', 'preston', 'bristol', 'chesterfield', 'texas', 'philadelphia', 'akron', 'london', 'cardiff', 'griqualand west', 'lisbon', 'tel aviv', 'chicago', 'boston', 'california', 'jordan', 'kansas city', 'sydney', 'sao paulo', 'porto', 'leeds', 'ulsan', 'atlanta', 'cambridge', 'pa', 'york', 'birmingham', 'san diego', 'toronto', 'milwaukee', 'canberra', 'los angeles', 'paramount', 'florida', 'pittsburgh', 'paris', 'ajaccio', 'stuttgart', 'houston', 'st louis', 'nottingham forest', 'inverness', 'oxford', 'san francisco', 'south', 'bordeaux', 'barcelona', 'colchester', 'portsmouth', 'northampton', 'hampshire

In [45]:
# Pretty-print every corpus.train example where "cleveland" is labelled LOC
# or ORG (notebook-local get_label_disparities), with a blank line after
# each.
for e in get_label_disparities(corpus.train, "LOC", "ORG")["cleveland"]:
    e.pretty_print()
    print()

[38;5;16;48;5;222m MILWAUKEE [0m[38;5;16;48;5;141m ORG [0m AT [38;5;16;48;5;222m CLEVELAND [0m[38;5;16;48;5;141m LOC [0m 

[38;5;16;48;5;222m CLEVELAND [0m[38;5;16;48;5;141m LOC [0m 1996-08-22 

[38;5;16;48;5;222m CLEVELAND [0m[38;5;16;48;5;141m LOC [0m 1996-08-26 

[38;5;16;48;5;222m CLEVELAND [0m[38;5;16;48;5;141m ORG [0m 8 [38;5;16;48;5;222m Milwaukee [0m[38;5;16;48;5;141m ORG [0m 5 

[38;5;16;48;5;222m Milwaukee [0m[38;5;16;48;5;141m ORG [0m 6 [38;5;16;48;5;222m CLEVELAND [0m[38;5;16;48;5;141m ORG [0m 5 ( 11 innings ) 

[38;5;16;48;5;222m Cleveland [0m[38;5;16;48;5;141m ORG [0m lost for just the second time in six games . 

[38;5;16;48;5;222m CLEVELAND [0m[38;5;16;48;5;141m ORG [0m 76 51 .598 - 

In [38;5;16;48;5;222m Cleveland [0m[38;5;16;48;5;141m ORG [0m , [38;5;16;48;5;222m Kevin Seitzer [0m[38;5;16;48;5;141m PER [0m 's two-out single in the top of the 10th brought home [38;5;16;48;5;222m David Hulse [0m[38;5;16;48;5;141m PER 