In [105]:
import collections

import datasets

from stanza.models.constituency.tree_reader import read_trees
from stanza.models.constituency.parse_tree import Tree

In [2]:
# Load the "english_v4" configuration of the coref-data/conll2012_indiscrim dataset.
# The recorded output below shows train, validation and test splits being generated.
conll_data = datasets.load_dataset("coref-data/conll2012_indiscrim", "english_v4")

Downloading readme:   0%|          | 0.00/4.95k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/23.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.18M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2802 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/343 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/348 [00:00<?, ? examples/s]

In [115]:
# Display the feature schema of the train split (sentences, tokens, parse trees, coref chains).
conll_data["train"].features

{'sentences': [{'id': Value(dtype='int64', id=None),
   'misc': {'parse_tree': Value(dtype='string', id=None)},
   'speaker': Value(dtype='string', id=None),
   'text': Value(dtype='string', id=None),
   'tokens': [{'deprel': Value(dtype='string', id=None),
     'head': Value(dtype='int64', id=None),
     'id': Value(dtype='int64', id=None),
     'text': Value(dtype='string', id=None),
     'upos': Value(dtype='string', id=None),
     'xpos': Value(dtype='string', id=None)}]}],
 'id': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None),
 'coref_chains': Sequence(feature=Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), length=-1, id=None), length=-1, id=None),
 'genre': Value(dtype='string', id=None),
 'meta_data': {'comment': Value(dtype='string', id=None)}}

In [125]:
def get_all_noun_phrases(node: Tree):
    """Yield every subtree labelled "NP" or "NML", in pre-order.

    The node itself is yielded first when its label matches, then the
    children of any non-leaf node are searched recursively from left to right.

    Args:
        node: Root of the (sub)tree to search.

    Yields:
        Each matching node, parents before their descendants.
    """
    if node.label in ("NP", "NML"):
        yield node

    # Leaves have nothing below them to search.
    if node.is_leaf():
        return

    for subtree in node.children:
        yield from get_all_noun_phrases(subtree)


def get_all_noun_phrase_indices(tree):
    """Return the NP/NML subtrees of a copy of ``tree`` whose words are indices.

    The words of ``tree`` are replaced by the strings "0", "1", ...,
    one per position up to ``len(tree)``, so the leaves of each returned
    noun phrase identify the positions it covers.

    Args:
        tree: Parse tree supporting ``len()`` and ``replace_words``.

    Returns:
        Whatever ``get_all_noun_phrases`` produces for the re-labelled tree.
    """
    position_labels = (str(position) for position in range(len(tree)))
    indexed_tree = tree.replace_words(position_labels)
    return get_all_noun_phrases(indexed_tree)


def get_np_mentions(sent_i, sentence):
    """Yield one ``[sent_i, first, last]`` span per NP/NML node of a sentence.

    ``first`` and ``last`` are the indices of the first and last leaf covered
    by the noun phrase (both inclusive), taken from the index-labelled tree.

    Args:
        sent_i: Index of the sentence; copied into every yielded span.
        sentence: Mapping with a "tokens" list and a bracketed parse string
            under ``sentence["misc"]["parse_tree"]``.

    Yields:
        ``[sent_i, first, last]`` lists. Nothing is yielded when the parse
        string is empty or missing.

    Raises:
        AssertionError: If the parse string does not hold exactly one tree,
            or the tree length differs from the number of tokens.
    """
    tokens = sentence["tokens"]
    bracketed_parse = sentence["misc"]["parse_tree"]

    # Sentences without a parse contribute no noun-phrase mentions.
    if not bracketed_parse:
        return

    parsed = read_trees(bracketed_parse)
    assert len(parsed) == 1
    tree = parsed[0]

    # The leaf indices below are only meaningful if tree and tokens line up.
    assert len(tree) == len(tokens)

    for np_node in get_all_noun_phrase_indices(tree):
        leaf_indices = np_node.leaf_labels()
        first = int(leaf_indices[0])
        last = int(leaf_indices[-1])
        yield [sent_i, first, last]


# Exact xpos tags whose tokens are always emitted as single-token candidate mentions.
_MENTION_XPOS_TAGS = frozenset({"PRP", "PRP$", "WP", "WP$", "WDT", "WRB"})
# Tag prefixes treated the same way (V* and N* — presumably the verb and noun tag families).
_MENTION_XPOS_PREFIXES = ("V", "N")


def add_singletons(example):
    """Collect candidate mention spans for one document.

    For every sentence, the spans produced by ``get_np_mentions`` are added
    first, followed by a single-token span ``[sent_i, i, i]`` for each token
    whose "xpos" tag is in ``_MENTION_XPOS_TAGS`` or starts with one of
    ``_MENTION_XPOS_PREFIXES``.

    Args:
        example: Mapping with a "sentences" list; each sentence has a
            "tokens" list whose items carry an "xpos" tag.

    Returns:
        ``{"singleton_mentions": [[sent_i, start, end], ...]}`` in the order
        described above. Spans are not de-duplicated.
    """
    singleton_mentions = []
    for sent_i, sentence in enumerate(example["sentences"]):
        singleton_mentions.extend(get_np_mentions(sent_i, sentence))

        for i, token in enumerate(sentence["tokens"]):
            # A missing or empty tag simply never matches; indexing tag[0]
            # directly would raise on "" (IndexError) or None (TypeError).
            xpos = token["xpos"] or ""
            if xpos in _MENTION_XPOS_TAGS or xpos.startswith(_MENTION_XPOS_PREFIXES):
                singleton_mentions.append([sent_i, i, i])

    return {"singleton_mentions": singleton_mentions}



In [126]:
# Run add_singletons over every example of every split; the result carries a
# "singleton_mentions" field per example (read back in the evaluation cell).
singleton_data = conll_data.map(add_singletons)

Map:   0%|          | 0/2802 [00:00<?, ? examples/s]

Map:   0%|          | 0/343 [00:00<?, ? examples/s]

Map:   0%|          | 0/348 [00:00<?, ? examples/s]

In [128]:
# Calculate recall and precision of the candidate mentions against the gold
# coreference mentions. For each split this prints the split name, then:
#   recalled  total_coref  total_singleton  recall  precision
# where recall    = recalled / total_coref      (what percent of coref are in singleton?)
# and   precision = recalled / total_singleton  (what percent of singleton are in coref?)

# A candidate ending at token e is joined with a same-sentence candidate that
# starts at e + 1 (directly adjacent) or e + 2 (exactly one token in between).
MERGE_START_OFFSETS = (1, 2)


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


def _with_merged_spans(mentions):
    """Return ``mentions`` plus the spans formed by joining nearby mentions.

    Args:
        mentions: Set of ``(sent_i, start, end)`` tuples.

    Returns:
        A new set holding every input mention and, for each pair of input
        mentions in the same sentence where the second starts 1 or 2 tokens
        after the first ends, the span ``(sent_i, first.start, second.end)``.
        Merged spans are not merged again.
    """
    by_sentence = collections.defaultdict(list)
    for mention in mentions:
        by_sentence[mention[0]].append(mention)

    expanded = set(mentions)
    for same_sent_ments in by_sentence.values():
        for left in same_sent_ments:
            for right in same_sent_ments:
                if right[1] - left[2] in MERGE_START_OFFSETS:
                    expanded.add((left[0], left[1], right[2]))
    return expanded


for split in ["train", "validation", "test"]:
    recalled = 0
    total_coref = 0
    total_singleton = 0

    for example in singleton_data[split]:
        # Gold mentions: every span of every coreference chain in the document.
        coref_mentions = {tuple(m) for chain in example["coref_chains"] for m in chain}

        # Candidate mentions, de-duplicated and extended with merged spans.
        singleton_mentions = _with_merged_spans({tuple(m) for m in example["singleton_mentions"]})

        total_coref += len(coref_mentions)
        total_singleton += len(singleton_mentions)
        recalled += len(singleton_mentions & coref_mentions)

    print(split)
    # Ratios are guarded so that an empty split prints nan instead of raising ZeroDivisionError.
    print(recalled, total_coref, total_singleton, _safe_ratio(recalled, total_coref), _safe_ratio(recalled, total_singleton))

        

train
155225 155558 1865019 0.997859319353553 0.08322971508601253
validation
19083 19155 233835 0.9962411902897416 0.08160882673680159
test
19709 19764 242523 0.997217162517709 0.08126651905180127


In [None]:
# Calculate singleton mentions using the parse tree.

# All noun phrases with distinct headwords are extracted from previously treebanked data
# Whenever head-sharing NPs are nested, the largest logical span is used in co-reference (see 2.4.1).
## Head-sharing NPs are two (or more) extracted entities, the shorter one(s) contained within the span of the longer,
## sharing the same content word as their headword.
## In such cases, the longest logical span should be used in co-reference with other mentions. 

# Possessive proper nouns (Fred's) are extracted from the treebanked data;
# however, possessive pronouns (his) must be manually extracted by the annotator and added to the list of mentions

# Proper noun PreMods can be co-referenced to existing noun phrases and/or other proper PreMods,
# and should be manually extracted by the annotator and added to the list of mentions.
# Non-proper and adjectival premodifiers are not eligible for co-reference (see 2.3).
## Premodifiers must be proper nouns
## Pre-modifying dates and monetary amounts are also eligible for co-reference
## Acronymic premodifiers should be co-referenced unless they refer to nationality

# Only the single-word head of the verb phrase is included in the span,
# even in cases where the entire verb phrase is the logical co-referent.

# Appositives: only the whole span is linked for IDENT

# Partitives: all and both can corefer in nesting

# in, at, to, from cannot be metonyms

