In [None]:
!pip install datasets

In [2]:
from datasets import load_dataset

In [3]:
conll2003 = load_dataset("conll2003")

Downloading builder script:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading and preparing dataset conll2003/conll2003 (download: 959.94 KiB, generated: 9.78 MiB, post-processed: Unknown size, total: 10.72 MiB) to /Users/kabirkhan/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee...


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14042 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3251 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3454 [00:00<?, ? examples/s]

Dataset conll2003 downloaded and prepared to /Users/kabirkhan/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
import spacy
from spacy.tokens import Doc, Span as SpacySpan
from spacy.training.iob_utils import offsets_from_biluo_tags, spans_from_biluo_tags

In [9]:
nlp = spacy.blank("es")

In [10]:
conll_labels = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC"]

In [11]:
def tags_to_entities(tags):
    entities = []
    prev_tag = "O"
    start = None
    for i, tag in enumerate(tags):
        print(tag)
        if tag is None:
            continue
        if tag.startswith("O"):
            # TODO: We shouldn't be getting these malformed inputs. Fix this.
            if start is not None:
                start = None
            continue
#         elif tag == "-":
#             continue
#         elif tag.startswith("I"):
#             if start is None:
#                 raise ValueError(Errors.E067.format(tags=tags[:i + 1]))
#             continue
        if tag.startswith("B"):
            start = i
            prev_tag = tag
        elif prev_tag != "O" and prev_tag == tag:
            continue
        elif prev_tag != "O" and tag == "O":
            entities.append((tag[2:], start, i))
            start = None
#         elif tag.startswith("L"):
#             entities.append((tag[2:], start, i))
#             start = None
        else:
            raise ValueError(Errors.E068.format(tag=tag))
        prev_tag = tag
    return entities


In [12]:
def tags_to_entities(tags):
    entities = []
    start = None
    prev_tag = "O"
    for i, tag in enumerate(tags):
        if tag is None:
            continue
        if tag.startswith("I"):
            if start is None:
                if i == 0:
                    start = 0
                else:
                    raise
            continue
        elif tag.startswith("B"):
            start = i
        elif (prev_tag.startswith("I") or prev_tag.startswith("B")) and tag != "I":
            entities.append((prev_tag[2:], start, i - 1))
            start = None
        else:
            start = None
        prev_tag = tag
        
    if start:
        entities.append((tag[2:], start, i))
    return entities

In [13]:
def spans_from_bio_tags(doc, tags):
    """Encode per-token tags following the BILUO scheme into Span object, e.g.
    to overwrite the doc.ents.
    doc (Doc): The document that the BILUO tags refer to.
    entities (iterable): A sequence of BILUO tags with each tag describing one
        token. Each tags string will be of the form of either "", "O" or
        "{action}-{label}", where action is one of "B", "I", "L", "U".
    RETURNS (list): A sequence of Span objects.
    """
    token_offsets = tags_to_entities(tags)
    spans = []
    for label, start_idx, end_idx in token_offsets:
        span = SpacySpan(doc, start_idx, end_idx + 1, label=label)
        spans.append(span)
    return spans

def offsets_from_bio_tags(doc, tags):
    """Encode per-token tags following the BILUO scheme into entity offsets.
    doc (Doc): The document that the BILUO tags refer to.
    entities (iterable): A sequence of BILUO tags with each tag describing one
        token. Each tags string will be of the form of either "", "O" or
        "{action}-{label}", where action is one of "B", "I", "L", "U".
    RETURNS (list): A sequence of `(start, end, label)` triples. `start` and
        `end` will be character-offset integers denoting the slice into the
        original string.
    """
    spans = spans_from_bio_tags(doc, tags)
    return [(span.start_char, span.end_char, span.label_) for span in spans]


In [14]:
from recon.types import Example, Span

In [15]:
conll2003.keys()

dict_keys(['train', 'validation', 'test'])

In [16]:
def make_recon_examples(dataset, labels=None, labels_property = "ner_tags"):

    examples = []

    for i, e in enumerate(dataset):
        doc = Doc(nlp.vocab, words=e["tokens"], spaces=[True] * len(e["tokens"]))
        
        if labels:
            tags = [labels[tag_n] for tag_n in e[labels_property]]
        else:
            tags = e[labels_property]
        try:
            offsets = offsets_from_bio_tags(doc, tags)
        except Exception as e:
            print(e)
            print(f"ERROR AT INDEX {i}")
            print(tags)


        spans = [Span(text=doc.text[e[0]:e[1]], start=e[0], end=e[1], label=e[2]) for e in offsets]
        examples.append(Example(text=doc.text, spans=spans))

    return examples

In [17]:
train = make_recon_examples(conll2003["train"], conll_labels)
dev = make_recon_examples(conll2003["validation"], conll_labels)
test = make_recon_examples(conll2003["test"], conll_labels)

In [32]:
from recon.corpus import Corpus
from recon.dataset import Dataset

In [33]:
conll2003_corpus = Corpus(Dataset("train", train), Dataset("dev", dev), Dataset("test", test))

In [34]:
from recon import get_ner_stats, get_entity_coverage

In [35]:
ec = get_entity_coverage(conll2003_corpus.all)

ec[:10]

[EntityCoverage(text='u.s.', label='LOC', count=154, examples=[]),
 EntityCoverage(text='germany', label='LOC', count=97, examples=[]),
 EntityCoverage(text='london', label='LOC', count=82, examples=[]),
 EntityCoverage(text='australia', label='LOC', count=80, examples=[]),
 EntityCoverage(text='france', label='LOC', count=80, examples=[]),
 EntityCoverage(text='russia', label='LOC', count=79, examples=[]),
 EntityCoverage(text='world cup', label='MISC', count=77, examples=[]),
 EntityCoverage(text='italy', label='LOC', count=64, examples=[]),
 EntityCoverage(text='china', label='LOC', count=58, examples=[]),
 EntityCoverage(text='england', label='LOC', count=57, examples=[])]

In [24]:
per_ec = [e for e in ec if e.label == "PER"]

per_ec[:10]

[EntityCoverage(text='clinton', label='PER', count=23, examples=[]),
 EntityCoverage(text='yeltsin', label='PER', count=19, examples=[]),
 EntityCoverage(text='wang', label='PER', count=19, examples=[]),
 EntityCoverage(text='lebed', label='PER', count=17, examples=[]),
 EntityCoverage(text='arafat', label='PER', count=14, examples=[]),
 EntityCoverage(text='suu kyi', label='PER', count=14, examples=[]),
 EntityCoverage(text='edberg', label='PER', count=13, examples=[]),
 EntityCoverage(text='albright', label='PER', count=13, examples=[]),
 EntityCoverage(text='lara', label='PER', count=12, examples=[]),
 EntityCoverage(text='dole', label='PER', count=11, examples=[])]

In [25]:
len(per_ec)

2223

In [26]:
for name, stats in conll2003_corpus.apply(get_ner_stats, serialize=True).items():
    print(name)
    print(stats)

train
{
    "n_examples":3251,
    "n_examples_no_entities":702,
    "n_annotations":5842,
    "n_annotations_per_type":{
        "LOC":1826,
        "PER":1816,
        "ORG":1332,
        "MISC":868
    },
    "examples_with_type":null
}
dev
{
    "n_examples":3454,
    "n_examples_no_entities":738,
    "n_annotations":5553,
    "n_annotations_per_type":{
        "LOC":1651,
        "ORG":1644,
        "PER":1591,
        "MISC":667
    },
    "examples_with_type":null
}
test
{
    "n_examples":0,
    "n_examples_no_entities":0,
    "n_annotations":0,
    "n_annotations_per_type":{

    },
    "examples_with_type":null
}
all
{
    "n_examples":6705,
    "n_examples_no_entities":1440,
    "n_annotations":11395,
    "n_annotations_per_type":{
        "LOC":3477,
        "PER":3407,
        "ORG":2976,
        "MISC":1535
    },
    "examples_with_type":null
}


In [27]:
from recon.insights import get_label_disparities

In [29]:
get_label_disparities(conll2003_corpus.test, "PER", "LOC")

set()

In [30]:
conll2003_corpus.test

[]