In [10]:
import datasets

from datasets.utils.logging import disable_progress_bar
# Turn off the datasets library's progress bars (presumably to keep the
# printed statistics below free of progress-bar noise).
disable_progress_bar()

In [11]:
# Datasets to summarize. Each entry is either a bare dataset path (loaded
# without a config name) or a (path, config name) tuple; the reporting loop
# unpacks tuples and passes both elements to datasets.load_dataset.
DATASETS = [
    ("coref-data/conll2012_indiscrim", "english_v4"),
    "coref-data/preco_indiscrim",
    "coref-data/phrase_detectives_indiscrim",
    ("coref-data/litbank_indiscrim", "split_0"),
    ("coref-data/gum_indiscrim", "ontogum"),
    "coref-data/arrau_indiscrim",
    ("coref-data/mmc_indiscrim", "mmc_en"),
]

In [12]:
def get_stats(example):
    """Count sentences, tokens, chains and mentions for one example.

    Args:
        example: Mapping with a "sentences" entry (a sequence of mappings,
            each holding a "tokens" sequence) and a "coref_chains" entry
            (a sequence of chains, where each chain is a sized collection
            of mentions).

    Returns:
        A dict of integer counts with the keys "num_sentences",
        "num_tokens", "num_chains", "num_ments", "num_singletons",
        "num_nonsingleton_chains" and "num_nonsingleton_ments".
        A chain of length 1 is a singleton; chains longer than 1 are
        non-singletons. Empty chains count toward "num_chains" only.
    """
    sentences = example["sentences"]
    coref_chains = example["coref_chains"]

    # Total token count across every sentence of the document.
    token_total = 0
    for sentence in sentences:
        token_total += len(sentence["tokens"])

    # Measure every chain once and derive all chain/mention counts from it.
    chain_sizes = [len(chain) for chain in coref_chains]
    multi_mention_sizes = [size for size in chain_sizes if size > 1]

    return {
        "num_sentences": len(sentences),
        "num_tokens": token_total,
        "num_chains": len(coref_chains),
        "num_ments": sum(chain_sizes),
        "num_singletons": chain_sizes.count(1),
        "num_nonsingleton_chains": len(multi_mention_sizes),
        "num_nonsingleton_ments": sum(multi_mention_sizes),
    }



In [13]:
from statistics import mean


# (printed label, get_stats key) pairs, in the order they are reported for
# every split. Driving the report from this table replaces seven copy-pasted
# print/mean stanzas, so adding a statistic is a one-line change.
STAT_LABELS = [
    ("Avg num sentences:", "num_sentences"),
    ("Avg num tokens:", "num_tokens"),
    ("Avg num chains:", "num_chains"),
    ("Avg num mentions:", "num_ments"),
    ("Avg num singletons:", "num_singletons"),
    ("Avg num non-singleton chains:", "num_nonsingleton_chains"),
    ("Avg num non-singleton mentions:", "num_nonsingleton_ments"),
]

# Load every configured dataset and print per-split averages of the
# per-example statistics computed by get_stats.
for dataset_name in DATASETS:
    # Entries are either a bare dataset path or a (path, config name) tuple.
    if isinstance(dataset_name, tuple):
        name, config = dataset_name
        dataset = datasets.load_dataset(name, config)
    else:
        dataset = datasets.load_dataset(dataset_name)

    print("\n" + "="*10)
    print(dataset_name)
    for split, data in dataset.items():
        print(split)
        # len(data) is the row count; it avoids materializing a whole column
        # and does not require the dataset to have an "id" column.
        num_examples = len(data)
        print("Total examples:", num_examples)

        if num_examples == 0:
            # statistics.mean raises StatisticsError on empty input, so an
            # empty split is reported by its count alone.
            print("="*10)
            continue

        # Replace every original column with the per-example statistics.
        stats = data.map(get_stats, remove_columns=data.column_names, num_proc=4)

        for label, key in STAT_LABELS:
            print(label, mean([x[key] for x in stats]))

        print("="*10)


('coref-data/conll2012_indiscrim', 'english_v4')
train
Total examples: 2802
Avg num sentences: 26.833333333333332
Avg num tokens: 463.7087794432548
Avg num chains: 12.542112776588151
Avg num mentions: 55.51713062098501
Avg num singletons: 0
Avg num non-singleton chains: 12.542112776588151
Avg num non-singleton mentions: 55.51713062098501
validation
Total examples: 343
Avg num sentences: 27.997084548104958
Avg num tokens: 475.52186588921285
Avg num chains: 13.253644314868804
Avg num mentions: 55.84839650145773
Avg num singletons: 0
Avg num non-singleton chains: 13.253644314868804
Avg num non-singleton mentions: 55.84839650145773
test
Total examples: 348
Avg num sentences: 27.238505747126435
Avg num tokens: 487.29597701149424
Avg num chains: 13.022988505747126
Avg num mentions: 56.793103448275865
Avg num singletons: 0
Avg num non-singleton chains: 13.022988505747126
Avg num non-singleton mentions: 56.793103448275865

coref-data/preco_indiscrim
train
Total examples: 36120
Avg num sentenc