In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
%%HTML
<style type="text/css">
/* Give rendered DataFrame tables black cell borders and black text. */
table.dataframe td,
table.dataframe th {
    border: 1px black solid !important;
    color: black !important;
}
</style>

In [28]:
from collections import defaultdict, Counter
from datasets import get_dataset_config_names
from datasets import load_dataset, DatasetDict

import pandas as pd

In [4]:
# # XTREME (Cross-lingual TRansfer Evaluation of Multilingual Encoders) is a multilingual benchmark.
# xtreme_subsets = get_dataset_config_names('xtreme')
# print(f'XTREME has {len(xtreme_subsets)} configurations')
#
# # Use the PAN-X subsets of the XTREME benchmark (their configuration names start with 'PAN').
# PAN_subsets = [subset for subset in xtreme_subsets if subset.startswith('PAN')]

In [5]:
# Each language code paired with the fraction of its PAN-X splits to keep.
lang_fractions = {"de": 0.629, "fr": 0.229, "it": 0.084, "en": 0.059}
langs = list(lang_fractions)
fracs = list(lang_fractions.values())

# Accessing a language key that does not exist yet creates an empty DatasetDict for it.
panx_ch = defaultdict(DatasetDict)

In [6]:
for lang, frac in zip(langs, fracs):
    # Load the monolingual PAN-X corpus for this language.
    ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
    for split in ds:
        # Shuffle with a fixed seed, then keep only the leading `frac` share of the rows.
        n_keep = int(frac * ds[split].num_rows)
        panx_ch[lang][split] = ds[split].shuffle(seed=0).select(range(n_keep))

In [7]:
# One-row table: number of training examples kept for each language.
pd.DataFrame(
    data={code: [panx_ch[code]["train"].num_rows] for code in langs},
    index=["Number of training Examples"],
)

Unnamed: 0,de,fr,it,en
Number of training Examples,12580,4580,1680,1180


In [8]:
# Print the first German training example, one field per line.
element = panx_ch["de"]["train"][0]
for field in element:
    print(f"{field}: {element[field]}")

tokens: ['2.000', 'Einwohnern', 'an', 'der', 'Danziger', 'Bucht', 'in', 'der', 'polnischen', 'Woiwodschaft', 'Pommern', '.']
ner_tags: [0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0]
langs: ['de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de']


In [9]:
# Show the ClassLabel feature that maps each integer NER tag id to its label name.
panx_ch['de']['train'].features['ner_tags'].feature

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)

In [10]:
# Preview the first three German training rows as a pandas DataFrame.
# with_format() returns a formatted view of the split, so the shared dataset's
# own output format is never changed and nothing has to be reset afterwards
# (the old set_format/reset_format pair left the dataset in pandas format if
# the slice in between raised or the cell was interrupted).
panx_ch['de']['train'].with_format('pandas')[:3]

Unnamed: 0,tokens,ner_tags,langs
0,"[2.000, Einwohnern, an, der, Danziger, Bucht, ...","[0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0]","[de, de, de, de, de, de, de, de, de, de, de, de]"
1,"[Sie, geht, hinter, Walluf, nahtlos, in, die, ...","[0, 0, 0, 3, 0, 0, 0, 3, 4, 0, 0]","[de, de, de, de, de, de, de, de, de, de, de]"
2,"[Dirigenten, von, Weltruf, wie, Wilhelm, Furtw...","[0, 0, 0, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, ...","[de, de, de, de, de, de, de, de, de, de, de, d..."


In [11]:
# Print the schema of the German training split, one column per line.
de_train_features = panx_ch["de"]["train"].features
for column, feature in de_train_features.items():
    print(f"{column}: {feature}")

tokens: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ner_tags: Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)
langs: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)


In [12]:
# ClassLabel feature used to translate integer tag ids into label names.
tags = panx_ch["de"]["train"].features["ner_tags"].feature


def tags_to_names(batch):
    """Build a `ner_tags_str` column with the label name of every tag id.

    Reads `batch['ner_tags']` (an iterable of integer tag ids) and converts
    each id with the module-level `tags` feature.

    NOTE(review): map() below is called without `batched=True`, so despite
    its name `batch` is presumably a single example -- confirm.
    """
    tag_names = [tags.int2str(tag_id) for tag_id in batch["ner_tags"]]
    return {"ner_tags_str": tag_names}


# Add the readable tag column to every split of the German corpus.
panx_de = panx_ch["de"].map(tags_to_names)

In [13]:
# Preview the first three training rows, including the new ner_tags_str column,
# as a pandas DataFrame. with_format() returns a formatted view of the split, so
# panx_de itself keeps its default output format and no reset is needed (the old
# set_format/reset_format pair left panx_de in pandas format if the slice raised).
panx_de['train'].with_format('pandas')[:3]

Unnamed: 0,tokens,ner_tags,langs,ner_tags_str
0,"[2.000, Einwohnern, an, der, Danziger, Bucht, ...","[0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0]","[de, de, de, de, de, de, de, de, de, de, de, de]","[O, O, O, O, B-LOC, I-LOC, O, O, B-LOC, B-LOC,..."
1,"[Sie, geht, hinter, Walluf, nahtlos, in, die, ...","[0, 0, 0, 3, 0, 0, 0, 3, 4, 0, 0]","[de, de, de, de, de, de, de, de, de, de, de]","[O, O, O, B-ORG, O, O, O, B-ORG, I-ORG, O, O]"
2,"[Dirigenten, von, Weltruf, wie, Wilhelm, Furtw...","[0, 0, 0, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, ...","[de, de, de, de, de, de, de, de, de, de, de, d...","[O, O, O, O, B-PER, I-PER, O, B-PER, I-PER, O,..."


In [18]:
# Lay the first German example out as two aligned rows: tokens above their tags.
de_example = panx_de["train"][0]
pd.DataFrame(
    data=[de_example[column] for column in ("tokens", "ner_tags_str")],
    index=["Tokens", "Tags"],
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Tokens,2.000,Einwohnern,an,der,Danziger,Bucht,in,der,polnischen,Woiwodschaft,Pommern,.
Tags,O,O,O,O,B-LOC,I-LOC,O,O,B-LOC,B-LOC,I-LOC,O


In [20]:
# Show the (split name, Dataset) pairs held by panx_de.
panx_de.items()

dict_items([('train', Dataset({
    features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
    num_rows: 12580
})), ('validation', Dataset({
    features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
    num_rows: 6290
})), ('test', Dataset({
    features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
    num_rows: 6290
}))])

In [21]:
# Show the DatasetDict summary: each split with its columns and row count.
panx_de

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
        num_rows: 12580
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
        num_rows: 6290
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
        num_rows: 6290
    })
})

In [39]:
# Count entities per split and entity type. Only B-* tags are counted, so a
# multi-token entity (B-XXX followed by I-XXX tags) contributes exactly once.
split2freqs = defaultdict(Counter)
for split, dataset in panx_de.items():
    for tag_seq in dataset["ner_tags_str"]:
        for tag in tag_seq:
            if not tag.startswith("B"):
                continue
            # "B-LOC" -> "LOC"
            split2freqs[split][tag.split("-")[1]] += 1
pd.DataFrame.from_dict(split2freqs, orient="index")

Unnamed: 0,LOC,ORG,PER
train,6186,5366,5810
validation,3172,2683,2893
test,3180,2573,3071
