In [3]:
import datasets

In [6]:
dataset = datasets.load_dataset("coref-data/winogrande_coref")

2

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'genre', 'text', 'sentences', 'coref_chains', 'meta_data'],
        num_rows: 36034
    })
    validation: Dataset({
        features: ['id', 'genre', 'text', 'sentences', 'coref_chains', 'meta_data'],
        num_rows: 1117
    })
})

Filter out examples where there isn't a mention that is a pronoun (or "they" occurs multiple times)

In [22]:
from functools import partial

def mention_to_str(sentences, mention):
    sent, start, end = mention
    tokens = sentences[sent]["tokens"][start:end + 1]
    words = map(lambda x: x["text"], tokens)
    return " ".join(words).lower()

def is_pronominal(ex):
    sentences = ex["sentences"]
    coref_chains = ex["coref_chains"]

    assert len(coref_chains) == 2
    coref_chains.sort(key=lambda x: len(x))
    assert len(coref_chains[0]) == 1 and len(coref_chains[1]) == 2

    long_chain = coref_chains[1]
    mention_to_str_partial = partial(mention_to_str, sentences)
    mentions = map(mention_to_str_partial, long_chain)

    return any(x in mentions for x in ["he", "she", "they"]) and ex["text"].lower().count("they") < 2

pronominal = dataset.filter(is_pronominal)

Filter: 100%|██████████| 36034/36034 [00:06<00:00, 5301.93 examples/s]
Filter: 100%|██████████| 1117/1117 [00:00<00:00, 5902.02 examples/s]


In [25]:
pronominal

DatasetDict({
    train: Dataset({
        features: ['id', 'genre', 'text', 'sentences', 'coref_chains', 'meta_data'],
        num_rows: 8710
    })
    validation: Dataset({
        features: ['id', 'genre', 'text', 'sentences', 'coref_chains', 'meta_data'],
        num_rows: 249
    })
})

In [31]:
winogrande_pronominal = pronominal["train"].train_test_split(test_size=len(pronominal["validation"]))

In [32]:
winogrande_pronominal["validation"] = winogrande_pronominal["test"]
winogrande_pronominal["test"] = pronominal["validation"]

DatasetDict({
    train: Dataset({
        features: ['id', 'genre', 'text', 'sentences', 'coref_chains', 'meta_data'],
        num_rows: 8461
    })
    test: Dataset({
        features: ['id', 'genre', 'text', 'sentences', 'coref_chains', 'meta_data'],
        num_rows: 249
    })
})

In [33]:
winogrande_pronominal.push_to_hub("coref-data/pronominal_winogrande")

Creating parquet from Arrow format: 100%|██████████| 9/9 [00:00<00:00, 20.49ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.84s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 29.37ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.02it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/coref-data/pronominal_winogrande/commit/0ee08c38dadf648f3b12fd441dd25660e377863b', commit_message='Upload dataset', commit_description='', oid='0ee08c38dadf648f3b12fd441dd25660e377863b', pr_url=None, pr_revision=None, pr_num=None)