* https://github.com/google-research/xtreme
* https://huggingface.co/datasets/xtreme

In [2]:
from datasets import get_dataset_config_names

# List every configuration (subset) name published under the "xtreme" dataset
# and report how many there are.
xtreme_subsets = get_dataset_config_names("xtreme")
print(f"len : {len(xtreme_subsets)}")

len : 183


In [3]:
# Keep only the configurations whose name starts with "PAN" (the PAN-X
# subsets) and preview the first three.
panx_subsets = [
    config_name
    for config_name in xtreme_subsets
    if config_name.startswith("PAN")
]
panx_subsets[:3]

['PAN-X.af', 'PAN-X.ar', 'PAN-X.bg']

In [4]:
from collections import defaultdict
from datasets import load_dataset, DatasetDict

# Languages to load and, position by position, the fraction of every split
# that is kept for that language.
langs = ["es", "ko", "en", "ja"]
fracs = [0.629, 0.229, 0.084, 0.059]

# Missing languages get an empty DatasetDict on first access.
panx_ch = defaultdict(DatasetDict)

for lang, frac in zip(langs, fracs):
    ds = load_dataset("xtreme", name=f"PAN-X.{lang}")

    for split, split_ds in ds.items():
        # Truncate (not round) the target size, then take that many rows
        # from a seeded shuffle so the sample is reproducible.
        n_keep = int(frac * split_ds.num_rows)
        shuffled = split_ds.shuffle(seed=0)
        panx_ch[lang][split] = shuffled.select(range(n_keep))

Downloading data: 100%|██████████| 744k/744k [00:02<00:00, 312kB/s]
Downloading data: 100%|██████████| 372k/372k [00:03<00:00, 113kB/s]
Downloading data: 100%|██████████| 373k/373k [00:01<00:00, 198kB/s]
Generating train split: 100%|██████████| 20000/20000 [00:00<00:00, 585342.93 examples/s]
Generating validation split: 100%|██████████| 10000/10000 [00:00<00:00, 1954657.47 examples/s]
Generating test split: 100%|██████████| 10000/10000 [00:00<00:00, 2103567.88 examples/s]
Downloading data: 100%|██████████| 1.27M/1.27M [00:02<00:00, 620kB/s]
Downloading data: 100%|██████████| 632k/632k [00:02<00:00, 307kB/s]
Downloading data: 100%|██████████| 636k/636k [00:01<00:00, 320kB/s]
Generating train split: 100%|██████████| 20000/20000 [00:00<00:00, 2118602.85 examples/s]
Generating validation split: 100%|██████████| 10000/10000 [00:00<00:00, 1579061.82 examples/s]
Generating test split: 100%|██████████| 10000/10000 [00:00<00:00, 1963901.30 examples/s]
Downloading data: 100%|██████████| 942k/942

In [5]:
import pandas as pd

# One column per language, a single row holding the size of its train split.
train_sizes = {code: [panx_ch[code]["train"].num_rows] for code in langs}
pd.DataFrame(train_sizes, index=["Number of training examples"])

Unnamed: 0,es,ko,en,ja
Number of training examples,12580,4580,1680,1180


In [7]:
# Show every field of the first Spanish training example, one per line.
element = panx_ch["es"]["train"][0]
for key in element:
    value = element[key]
    print(f"{key}: {value}")

tokens: ['Liga', 'de', 'la', 'Justicia', 'Europa']
ner_tags: [3, 4, 4, 4, 4]
langs: ['es', 'es', 'es', 'es', 'es']


In [8]:
# Print the feature definition of each column in the Spanish train split.
es_train_features = panx_ch["es"]["train"].features
for key in es_train_features:
    value = es_train_features[key]
    print(f"{key} : {value}")

tokens : Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ner_tags : Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)
langs : Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)


In [9]:
# Pull out the inner feature of the "ner_tags" column (the label definition
# applied to each token) and display it.
tags = panx_ch["es"]["train"].features["ner_tags"].feature
print(tags)

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)


In [10]:
def create_tag_names(batch):
    """Attach human-readable NER label names to one example.

    Despite the parameter name, ``batch`` is treated as a single example:
    every entry of ``batch["ner_tags"]`` is handed to ``tags.int2str`` as one
    integer id.

    Args:
        batch: Mapping with a ``"ner_tags"`` list of integer class ids.

    Returns:
        dict with a single key, ``"ner_tags_str"``, holding the label names
        in the same order as the ids.
    """
    # `tags` is the module-level label definition created in an earlier cell.
    label_names = list(map(tags.int2str, batch["ner_tags"]))
    return {"ner_tags_str": label_names}

# Apply create_tag_names to every split of the Spanish corpus, adding the
# "ner_tags_str" column. map() is called without batched=True, so the
# function is invoked on one example at a time.
panx_es = panx_ch["es"].map(create_tag_names)

Map: 100%|██████████| 12580/12580 [00:00<00:00, 20272.73 examples/s]
Map: 100%|██████████| 6290/6290 [00:00<00:00, 21173.73 examples/s]
Map: 100%|██████████| 6290/6290 [00:00<00:00, 21059.61 examples/s]


In [11]:
# Line up the tokens of the first Spanish training example with their
# string labels: one row for tokens, one row for tags.
es_example = panx_es["train"][0]
pd.DataFrame(
    [es_example["tokens"], es_example["ner_tags_str"]],
    index=["Tokens", "Tags"],
)

Unnamed: 0,0,1,2,3,4
Tokens,Liga,de,la,Justicia,Europa
Tags,B-ORG,I-ORG,I-ORG,I-ORG,I-ORG


In [15]:
from collections import Counter

# split name -> Counter of entity types (the part of the tag after "B-").
split2freqs = defaultdict(Counter)

for split, dataset in panx_es.items():
    # Only tags starting with "B" are counted, so each entity is counted
    # once no matter how many continuation tags follow it.
    entity_starts = (
        label
        for sentence_labels in dataset["ner_tags_str"]
        for label in sentence_labels
        if label.startswith("B")
    )
    for tag in entity_starts:
        tag_type = tag.split("-")[1]
        split2freqs[split][tag_type] += 1

# Rows are splits, columns are entity types.
pd.DataFrame.from_dict(split2freqs, orient="index")

Unnamed: 0,ORG,PER,LOC
train,4581,5264,5797
validation,2415,2482,2856
test,2250,2525,2906
