First, filter examples to single antecedent, single distractor.



For each example, convert to a minimal format including:
    1. speaker information
    2. tokens and any parse information
    3. pronoun token
    4. pronoun index
    5. antecedent tokens
    6. antecedent indices
    7. distractor tokens
    8. distractor indices
    9. prompt examples

In [10]:
# Configurations of the source dataset to process, in processing order.
# The same name is reused as the configuration name when pushing the result.
config_names = [
    "conll2012_indiscrim_english_v4",
    "gum_indiscrim_ontogum",
    "arrau_indiscrim_default",
    "gap_indiscrim_default",
    "davis_pdp_indiscrim_default",
    "preco_indiscrim_default",
    "litbank_indiscrim_split_0",
    "gum_indiscrim_original",
    "phrase_detectives_indiscrim_default",
    "mmc_indiscrim_mmc_en",
    "davis_wsc_indiscrim_wsc273",
    "superglue_wsc_indiscrim_default",
    "dpr_indiscrim_default",
    "knowref_60k_indiscrim_default",
    "pronominal_winogrande_default",
]

# Configurations for which the train split is skipped: the processing loop
# loads only their "validation" and "test" splits.
ignore_train_split = [
    "conll2012_indiscrim_english_v4",
    "preco_indiscrim_default",
    "phrase_detectives_indiscrim_default",
    "mmc_indiscrim_mmc_en",
    "knowref_60k_indiscrim_default",
]


In [18]:
import random

def shuffle_distractors(ex):
    """Shuffle the ``distractors`` list of an example in place.

    The shuffle draws from the module-level ``random`` generator, so the
    resulting order depends on the global seed state at call time.

    Args:
        ex: Mapping with a mutable sequence under the ``"distractors"`` key.

    Returns:
        The same ``ex`` object, with its ``"distractors"`` entry reordered.
    """
    distractors = ex["distractors"]
    random.shuffle(distractors)
    return ex

In [19]:
import datasets

# Repository the examples are read from, and repository the filtered copy
# is pushed to (one configuration per entry of ``config_names``).
dataset_name = "coref-data/pronominal_coreference_resolution"
new_dataset_name = "coref-data/pcr_single_antecedent"

for config_name in config_names:
    print("Processing: ", config_name)

    if config_name not in ignore_train_split:
        # Load every split the configuration provides.
        dataset = datasets.load_dataset(dataset_name, config_name)
    else:
        # Skip the train split: load only the two evaluation splits.
        dataset = datasets.DatasetDict({
            split_name: datasets.load_dataset(dataset_name, config_name, split=split_name)
            for split_name in ("validation", "test")
        })

    # Keep only the examples that have exactly one antecedent.
    dataset = dataset.filter(lambda example: len(example["antecedents"]) == 1)

    # Reset the global RNG for every configuration so the shuffle order does
    # not depend on which configurations were processed before this one.
    random.seed(0)
    dataset = dataset.map(shuffle_distractors)

    print(dataset)
    dataset.push_to_hub(new_dataset_name, config_name)

Processing:  conll2012_indiscrim_english_v4


Map: 100%|██████████| 1536/1536 [00:00<00:00, 3887.14 examples/s]
Map: 100%|██████████| 1642/1642 [00:00<00:00, 3589.50 examples/s]


DatasetDict({
    validation: Dataset({
        features: ['sentences', 'id', 'text', 'coref_chains', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 1536
    })
    test: Dataset({
        features: ['sentences', 'id', 'text', 'coref_chains', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 1642
    })
})


Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 11.15ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.30s/it]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 10.37ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.20s/it]
README.md: 100%|██████████| 32.3k/32.3k [00:00<00:00, 8.24MB/s]


Processing:  gum_indiscrim_ontogum


Map: 100%|██████████| 1587/1587 [00:00<00:00, 2098.46 examples/s]
Map: 100%|██████████| 272/272 [00:00<00:00, 2192.95 examples/s]
Map: 100%|██████████| 236/236 [00:00<00:00, 2293.55 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentences', 'id', 'text', 'coref_chains', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 1587
    })
    validation: Dataset({
        features: ['sentences', 'id', 'text', 'coref_chains', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 272
    })
    test: Dataset({
        features: ['sentences', 'id', 'text', 'coref_chains', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 236
    })
})


Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00,  4.64ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.94s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 12.03ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.18s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 13.36ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.36s/it]
README.md: 100%|██████████| 32.3k/32.3k [00:00<00:00, 25.7MB/s]


Processing:  arrau_indiscrim_default


Map: 100%|██████████| 1981/1981 [00:01<00:00, 1776.12 examples/s]
Map: 100%|██████████| 179/179 [00:00<00:00, 1280.19 examples/s]
Map: 100%|██████████| 411/411 [00:00<00:00, 2081.51 examples/s]


DatasetDict({
    train: Dataset({
        features: ['split', 'id', 'text', 'sentences', 'coref_chains', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 1981
    })
    validation: Dataset({
        features: ['split', 'id', 'text', 'sentences', 'coref_chains', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 179
    })
    test: Dataset({
        features: ['split', 'id', 'text', 'sentences', 'coref_chains', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 411
    })
})


Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00,  3.30ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.61s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 11.26ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.35s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  8.00ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.50s/it]
README.md: 100%|██████████| 32.3k/32.3k [00:00<00:00, 4.23MB/s]


Processing:  gap_indiscrim_default


Map: 100%|██████████| 829/829 [00:00<00:00, 8115.92 examples/s]
Map: 100%|██████████| 203/203 [00:00<00:00, 7648.06 examples/s]
Map: 100%|██████████| 832/832 [00:00<00:00, 8682.93 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'sentences', 'coref_chains', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 829
    })
    validation: Dataset({
        features: ['id', 'text', 'sentences', 'coref_chains', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 203
    })
    test: Dataset({
        features: ['id', 'text', 'sentences', 'coref_chains', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 832
    })
})


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 20.65ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.43s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 43.29ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.00it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 18.28ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.40s/it]
README.md: 100%|██████████| 32.3k/32.3k [00:00<00:00, 29.4MB/s]


Processing:  davis_pdp_indiscrim_default


Map: 100%|██████████| 33/33 [00:00<00:00, 2157.13 examples/s]


DatasetDict({
    test: Dataset({
        features: ['text', 'id', 'sentences', 'coref_chains', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 33
    })
})


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 181.81ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.04it/s]
README.md: 100%|██████████| 32.3k/32.3k [00:00<00:00, 28.5MB/s]


Processing:  preco_indiscrim_default


Map: 100%|██████████| 2167/2167 [00:00<00:00, 8049.51 examples/s]
Map: 100%|██████████| 2248/2248 [00:00<00:00, 8747.63 examples/s]


DatasetDict({
    validation: Dataset({
        features: ['id', 'sentences', 'text', 'coref_chains', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 2167
    })
    test: Dataset({
        features: ['id', 'sentences', 'text', 'coref_chains', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 2248
    })
})


Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 38.46ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.24s/it]
Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 39.58ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.11s/it]
README.md: 100%|██████████| 32.3k/32.3k [00:00<00:00, 36.4MB/s]


Processing:  litbank_indiscrim_split_0


Map: 100%|██████████| 2158/2158 [00:04<00:00, 454.83 examples/s]
Map: 100%|██████████| 305/305 [00:00<00:00, 448.73 examples/s]
Map: 100%|██████████| 254/254 [00:00<00:00, 377.72 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentences', 'coref_chains', 'id', 'text', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 2158
    })
    validation: Dataset({
        features: ['sentences', 'coref_chains', 'id', 'text', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 305
    })
    test: Dataset({
        features: ['sentences', 'coref_chains', 'id', 'text', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 254
    })
})


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.10s/ba]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.42s/ba]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.21s/ba]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.14s/ba]
Uploading the dataset shards: 100%|██████████| 4/4 [00:22<00:00,  5.53s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  1.38ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.33s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  1.40ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.79s/it]
README.md: 100%|██████████| 32.3k/32.3k [00:00<00:00, 59.8MB/s]


Processing:  gum_indiscrim_original


Map: 100%|██████████| 1632/1632 [00:00<00:00, 1819.70 examples/s]
Map: 100%|██████████| 261/261 [00:00<00:00, 2042.87 examples/s]
Map: 100%|██████████| 238/238 [00:00<00:00, 2318.27 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentences', 'id', 'text', 'coref_chains', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 1632
    })
    validation: Dataset({
        features: ['sentences', 'id', 'text', 'coref_chains', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 261
    })
    test: Dataset({
        features: ['sentences', 'id', 'text', 'coref_chains', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 238
    })
})


Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00,  2.71ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.40s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  7.62ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.31s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  7.54ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.30s/it]
README.md: 100%|██████████| 32.3k/32.3k [00:00<00:00, 39.0MB/s]


Processing:  phrase_detectives_indiscrim_default


Map: 100%|██████████| 909/909 [00:00<00:00, 1358.10 examples/s]
Map: 100%|██████████| 338/338 [00:00<00:00, 3741.72 examples/s]


DatasetDict({
    validation: Dataset({
        features: ['sentences', 'coref_chains', 'id', 'text', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 909
    })
    test: Dataset({
        features: ['sentences', 'coref_chains', 'id', 'text', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 338
    })
})


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  3.37ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.95s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 24.34ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.23s/it]
README.md: 100%|██████████| 32.3k/32.3k [00:00<00:00, 40.5MB/s]


Processing:  mmc_indiscrim_mmc_en


Map: 100%|██████████| 342/342 [00:00<00:00, 3112.35 examples/s]
Map: 100%|██████████| 309/309 [00:00<00:00, 4757.66 examples/s]


DatasetDict({
    validation: Dataset({
        features: ['sentences', 'coref_chains', 'id', 'text', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 342
    })
    test: Dataset({
        features: ['sentences', 'coref_chains', 'id', 'text', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 309
    })
})


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 14.23ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.24s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 20.72ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.14s/it]
README.md: 100%|██████████| 32.3k/32.3k [00:00<00:00, 21.4MB/s]


Processing:  davis_wsc_indiscrim_wsc273


Map: 100%|██████████| 180/180 [00:00<00:00, 5168.09 examples/s]


DatasetDict({
    test: Dataset({
        features: ['text', 'id', 'sentences', 'coref_chains', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 180
    })
})


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 134.90ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.15s/it]
README.md: 100%|██████████| 32.3k/32.3k [00:00<00:00, 12.2MB/s]


Processing:  superglue_wsc_indiscrim_default


Map: 100%|██████████| 146/146 [00:00<00:00, 4389.23 examples/s]


DatasetDict({
    test: Dataset({
        features: ['text', 'id', 'sentences', 'coref_chains', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 146
    })
})


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 114.64ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.04it/s]
README.md: 100%|██████████| 32.3k/32.3k [00:00<00:00, 29.7MB/s]


Processing:  dpr_indiscrim_default


Map: 100%|██████████| 1318/1318 [00:00<00:00, 10090.63 examples/s]
Map: 100%|██████████| 558/558 [00:00<00:00, 11668.10 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'sentences', 'coref_chains', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 1318
    })
    test: Dataset({
        features: ['id', 'text', 'sentences', 'coref_chains', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 558
    })
})


Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 64.74ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.25s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 70.10ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.18s/it]
README.md: 100%|██████████| 32.3k/32.3k [00:00<00:00, 38.2MB/s]


Processing:  knowref_60k_indiscrim_default


Map: 100%|██████████| 21240/21240 [00:01<00:00, 12584.92 examples/s]
Map: 100%|██████████| 3061/3061 [00:00<00:00, 14325.64 examples/s]


DatasetDict({
    validation: Dataset({
        features: ['id', 'text', 'sentences', 'coref_chains', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 21240
    })
    test: Dataset({
        features: ['id', 'text', 'sentences', 'coref_chains', 'genre', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 3061
    })
})


Creating parquet from Arrow format: 100%|██████████| 22/22 [00:00<00:00, 191.63ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.10s/it]
Creating parquet from Arrow format: 100%|██████████| 4/4 [00:00<00:00, 135.50ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.13s/it]
README.md: 100%|██████████| 32.3k/32.3k [00:00<00:00, 11.5MB/s]


Processing:  pronominal_winogrande_default


Map: 100%|██████████| 7142/7142 [00:00<00:00, 10813.94 examples/s]
Map: 100%|██████████| 209/209 [00:00<00:00, 9709.36 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'genre', 'text', 'sentences', 'coref_chains', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 7142
    })
    test: Dataset({
        features: ['id', 'genre', 'text', 'sentences', 'coref_chains', 'meta_data', 'cluster_index', 'pronoun', 'antecedents', 'distractors', 'local_context_start', 'local_context_end', 'local_context'],
        num_rows: 209
    })
})


Creating parquet from Arrow format: 100%|██████████| 8/8 [00:00<00:00, 78.15ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.49s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 106.02ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.08s/it]
README.md: 100%|██████████| 32.3k/32.3k [00:00<00:00, 32.5MB/s]
