In [2]:
!pip install datasets
!pip install transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
Collec

In [3]:
from datasets import get_dataset_split_names, get_dataset_config_names, \
    load_dataset
from transformers import AutoTokenizer
from typing import List
from tqdm import tqdm
from statistics import pstdev, mean

In [4]:
class XNLIAnalyzer:
    """Measure tokenized premise/hypothesis lengths of the XNLI dataset.

    For every language config and every split, each example's premise and
    hypothesis are encoded together as one sequence pair, and the max, min,
    mean and population standard deviation of the resulting token counts
    are reported.
    """

    def __init__(self):
        self.dataset_name = 'xnli'
        # Loaded datasets keyed by language code; filled lazily by
        # get_max_length() and reused across checkpoints.
        self.dataset = dict()
        # Input columns and target column; not read by any method of this class.
        self.features = [['premise', 'hypothesis'], 'label']
        self.languages = self.get_languages()
        self.split_names = get_dataset_split_names(self.dataset_name, 'all_languages')

    def get_languages(self):
        """Return the dataset config names that are exactly two characters long."""
        config_names = get_dataset_config_names(self.dataset_name)
        return [name for name in config_names if len(name) == 2]

    @staticmethod
    def _length_stats(lengths) -> dict:
        """Summarise a list of token counts as max/min/mean/stdev.

        An empty list yields ``None`` for every statistic instead of raising
        from ``max()`` / ``mean()``.
        """
        if not lengths:
            return {'max': None, 'min': None, 'mean': None, 'stdev': None}
        return {
            'max': max(lengths),
            'min': min(lengths),
            'mean': mean(lengths),
            'stdev': pstdev(lengths),
        }

    def count_max_token(self, lang: str, tokenizer: AutoTokenizer) -> dict:
        """Compute token-length statistics for every split of one language.

        Args:
            lang: Language code; ``self.dataset[lang]`` must already be loaded.
            tokenizer: Object exposing ``encode(text, text_pair)`` that returns
                the token ids of the encoded pair.

        Returns:
            Mapping ``split -> {'max', 'min', 'mean', 'stdev'}``.
        """
        tokens_length = dict()
        for split in self.split_names:
            # Pass the pair as (text, text_pair) rather than one tuple in the
            # `text` slot, and skip return_tensors: only the number of ids is
            # needed, so no tensor has to be built per example.
            lengths = [
                len(tokenizer.encode(line['premise'], line['hypothesis']))
                for line in tqdm(self.dataset[lang][split])
            ]
            stats = self._length_stats(lengths)
            tokens_length[split] = stats
            print(f"{lang}-{split}: {stats['max']} - {stats['min']} - {stats['mean']} - {stats['stdev']}")
        return tokens_length

    def get_max_length(self, checkpoint: str) -> dict:
        """Run the length analysis for all languages with one tokenizer.

        Args:
            checkpoint: Name or path handed to ``AutoTokenizer.from_pretrained``.

        Returns:
            ``{'tokenizer': checkpoint, <lang>: <per-split statistics>, ...}``.
        """
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        results = {'tokenizer': checkpoint}
        for lang in self.languages:
            # Load each language only once; later checkpoints reuse it.
            if lang not in self.dataset:
                self.dataset[lang] = load_dataset(self.dataset_name, lang)
            results[lang] = self.count_max_token(lang, tokenizer=tokenizer)
        return results

In [5]:
# Collect XNLI token-length statistics once per checkpoint's tokenizer.
# NOTE: in the recorded run both checkpoints printed identical statistics.
checkpoints = ['bigscience/bloomz-560m', 'bigscience/bloomz-3b']
xnli = XNLIAnalyzer()
res = []
for check in checkpoints:
    stats = xnli.get_max_length(check)
    res.append(stats)
    print(check)
    print(stats)

Downloading builder script:   0%|          | 0.00/8.78k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/36.6k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/18.3k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading and preparing dataset xnli/ar to /root/.cache/huggingface/datasets/xnli/ar/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/466M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/17.9M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Dataset xnli downloaded and prepared to /root/.cache/huggingface/datasets/xnli/ar/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [02:04<00:00, 3145.39it/s]


ar-train: 364 - 2 - 38.96246263069707 - 19.790556012438046


100%|██████████| 5010/5010 [00:02<00:00, 2172.14it/s]


ar-test: 106 - 8 - 36.34970059880239 - 13.476595551357542


100%|██████████| 2490/2490 [00:01<00:00, 1563.33it/s]


ar-validation: 81 - 7 - 35.791967871485944 - 13.071386319139553
Downloading and preparing dataset xnli/bg to /root/.cache/huggingface/datasets/xnli/bg/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Dataset xnli downloaded and prepared to /root/.cache/huggingface/datasets/xnli/bg/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [02:35<00:00, 2533.48it/s]


bg-train: 781 - 4 - 81.7038797867085 - 44.48071982281658


100%|██████████| 5010/5010 [00:01<00:00, 2795.99it/s]


bg-test: 194 - 14 - 81.79041916167665 - 30.848014329091853


100%|██████████| 2490/2490 [00:00<00:00, 2794.14it/s]


bg-validation: 226 - 15 - 80.92329317269076 - 29.801258355928205
Downloading and preparing dataset xnli/de to /root/.cache/huggingface/datasets/xnli/de/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Dataset xnli downloaded and prepared to /root/.cache/huggingface/datasets/xnli/de/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [02:24<00:00, 2723.71it/s]


de-train: 592 - 2 - 57.345771093602785 - 30.320349270668505


100%|██████████| 5010/5010 [00:01<00:00, 3110.55it/s]


de-test: 129 - 10 - 54.02075848303393 - 19.536014345575502


100%|██████████| 2490/2490 [00:00<00:00, 3083.10it/s]


de-validation: 114 - 13 - 53.75903614457831 - 18.89080248025719
Downloading and preparing dataset xnli/el to /root/.cache/huggingface/datasets/xnli/el/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Dataset xnli downloaded and prepared to /root/.cache/huggingface/datasets/xnli/el/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [02:44<00:00, 2383.76it/s]


el-train: 1182 - 2 - 122.14737892855142 - 66.7354068909114


100%|██████████| 5010/5010 [00:01<00:00, 2676.36it/s]


el-test: 294 - 18 - 117.19201596806387 - 44.810386236879815


100%|██████████| 2490/2490 [00:00<00:00, 2671.98it/s]


el-validation: 258 - 21 - 116.24096385542168 - 43.56449044791693
Downloading and preparing dataset xnli/en to /root/.cache/huggingface/datasets/xnli/en/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Dataset xnli downloaded and prepared to /root/.cache/huggingface/datasets/xnli/en/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [02:13<00:00, 2942.99it/s]


en-train: 427 - 2 - 37.306866275190856 - 19.01464986471911


100%|██████████| 5010/5010 [00:01<00:00, 3425.04it/s]


en-test: 82 - 10 - 34.41377245508982 - 11.713728146322497


100%|██████████| 2490/2490 [00:00<00:00, 3471.26it/s]


en-validation: 70 - 10 - 34.24578313253012 - 11.529699395986169
Downloading and preparing dataset xnli/es to /root/.cache/huggingface/datasets/xnli/es/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Dataset xnli downloaded and prepared to /root/.cache/huggingface/datasets/xnli/es/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [02:19<00:00, 2807.33it/s]


es-train: 377 - 2 - 39.40424800484846 - 20.23989709697351


100%|██████████| 5010/5010 [00:01<00:00, 3246.75it/s]


es-test: 87 - 9 - 37.342115768463074 - 13.435865352073634


100%|██████████| 2490/2490 [00:00<00:00, 2935.27it/s]


es-validation: 81 - 8 - 37.04056224899598 - 12.875350866086714
Downloading and preparing dataset xnli/fr to /root/.cache/huggingface/datasets/xnli/fr/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Dataset xnli downloaded and prepared to /root/.cache/huggingface/datasets/xnli/fr/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [02:24<00:00, 2722.21it/s]


fr-train: 453 - 3 - 42.1538367515317 - 21.623987379816427


100%|██████████| 5010/5010 [00:01<00:00, 3098.66it/s]


fr-test: 105 - 9 - 38.849900199600796 - 13.771641469791282


100%|██████████| 2490/2490 [00:00<00:00, 3162.88it/s]


fr-validation: 80 - 9 - 38.645381526104416 - 13.320875785088122
Downloading and preparing dataset xnli/hi to /root/.cache/huggingface/datasets/xnli/hi/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Dataset xnli downloaded and prepared to /root/.cache/huggingface/datasets/xnli/hi/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [02:59<00:00, 2188.01it/s]


hi-train: 571 - 3 - 61.529444718896265 - 33.729861381646714


100%|██████████| 5010/5010 [00:02<00:00, 1895.96it/s]


hi-test: 123 - 10 - 42.78562874251497 - 15.806715299415583


100%|██████████| 2490/2490 [00:00<00:00, 2550.59it/s]


hi-validation: 108 - 10 - 42.579518072289154 - 15.46184259635872
Downloading and preparing dataset xnli/ru to /root/.cache/huggingface/datasets/xnli/ru/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Dataset xnli downloaded and prepared to /root/.cache/huggingface/datasets/xnli/ru/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [02:43<00:00, 2403.57it/s]


ru-train: 798 - 4 - 81.23523689718921 - 44.125446423723155


100%|██████████| 5010/5010 [00:01<00:00, 2684.18it/s]


ru-test: 194 - 11 - 79.55788423153693 - 30.34823737371231


100%|██████████| 2490/2490 [00:00<00:00, 2712.36it/s]


ru-validation: 178 - 15 - 78.71807228915662 - 29.037491944942587
Downloading and preparing dataset xnli/sw to /root/.cache/huggingface/datasets/xnli/sw/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Dataset xnli downloaded and prepared to /root/.cache/huggingface/datasets/xnli/sw/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [02:13<00:00, 2950.12it/s]


sw-train: 374 - 2 - 38.81694516452679 - 20.04504993180702


100%|██████████| 5010/5010 [00:01<00:00, 3137.63it/s]


sw-test: 102 - 10 - 39.60518962075848 - 13.751393070928838


100%|██████████| 2490/2490 [00:00<00:00, 3222.34it/s]


sw-validation: 93 - 6 - 39.0578313253012 - 13.535995034053105
Downloading and preparing dataset xnli/th to /root/.cache/huggingface/datasets/xnli/th/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Dataset xnli downloaded and prepared to /root/.cache/huggingface/datasets/xnli/th/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [03:10<00:00, 2066.01it/s]


th-train: 1760 - 5 - 158.97181832534594 - 87.25909636058833


100%|██████████| 5010/5010 [00:02<00:00, 2465.98it/s]


th-test: 363 - 26 - 141.99061876247504 - 55.129845927522624


100%|██████████| 2490/2490 [00:00<00:00, 2526.22it/s]


th-validation: 314 - 33 - 140.85863453815261 - 52.351868814939536
Downloading and preparing dataset xnli/tr to /root/.cache/huggingface/datasets/xnli/tr/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Dataset xnli downloaded and prepared to /root/.cache/huggingface/datasets/xnli/tr/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [02:20<00:00, 2791.60it/s]


tr-train: 550 - 2 - 59.557674776293474 - 31.229906506048803


100%|██████████| 5010/5010 [00:02<00:00, 2494.67it/s]


tr-test: 156 - 10 - 62.49620758483034 - 23.071988477031056


100%|██████████| 2490/2490 [00:01<00:00, 2011.40it/s]


tr-validation: 135 - 14 - 61.31004016064257 - 21.843590119658817
Downloading and preparing dataset xnli/ur to /root/.cache/huggingface/datasets/xnli/ur/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Dataset xnli downloaded and prepared to /root/.cache/huggingface/datasets/xnli/ur/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [02:20<00:00, 2803.52it/s]


ur-train: 351 - 2 - 36.837141140101146 - 18.410681392702774


100%|██████████| 5010/5010 [00:01<00:00, 2858.92it/s]


ur-test: 111 - 9 - 44.00479041916168 - 16.072228965459388


100%|██████████| 2490/2490 [00:00<00:00, 2851.39it/s]


ur-validation: 118 - 10 - 43.7004016064257 - 15.727786012365518
Downloading and preparing dataset xnli/vi to /root/.cache/huggingface/datasets/xnli/vi/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Dataset xnli downloaded and prepared to /root/.cache/huggingface/datasets/xnli/vi/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [02:33<00:00, 2552.25it/s]


vi-train: 467 - 4 - 46.00200406415043 - 23.063587743769865


100%|██████████| 5010/5010 [00:02<00:00, 2436.88it/s]


vi-test: 108 - 9 - 42.73912175648702 - 14.87209628570509


100%|██████████| 2490/2490 [00:01<00:00, 1937.76it/s]


vi-validation: 107 - 11 - 42.62449799196787 - 14.82635645165467
Downloading and preparing dataset xnli/zh to /root/.cache/huggingface/datasets/xnli/zh/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Dataset xnli downloaded and prepared to /root/.cache/huggingface/datasets/xnli/zh/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [02:12<00:00, 2971.44it/s]


zh-train: 470 - 2 - 45.51596121231876 - 23.76952946260383


100%|██████████| 5010/5010 [00:01<00:00, 3763.40it/s]


zh-test: 77 - 8 - 28.10059880239521 - 10.471054087021606


100%|██████████| 2490/2490 [00:00<00:00, 3854.23it/s]


zh-validation: 69 - 5 - 27.842168674698794 - 10.364423477054066
bigscience/bloomz-560m
{'tokenizer': 'bigscience/bloomz-560m', 'ar': {'train': {'max': 364, 'min': 2, 'mean': 38.96246263069707, 'stdev': 19.790556012438046}, 'test': {'max': 106, 'min': 8, 'mean': 36.34970059880239, 'stdev': 13.476595551357542}, 'validation': {'max': 81, 'min': 7, 'mean': 35.791967871485944, 'stdev': 13.071386319139553}}, 'bg': {'train': {'max': 781, 'min': 4, 'mean': 81.7038797867085, 'stdev': 44.48071982281658}, 'test': {'max': 194, 'min': 14, 'mean': 81.79041916167665, 'stdev': 30.848014329091853}, 'validation': {'max': 226, 'min': 15, 'mean': 80.92329317269076, 'stdev': 29.801258355928205}}, 'de': {'train': {'max': 592, 'min': 2, 'mean': 57.345771093602785, 'stdev': 30.320349270668505}, 'test': {'max': 129, 'min': 10, 'mean': 54.02075848303393, 'stdev': 19.536014345575502}, 'validation': {'max': 114, 'min': 13, 'mean': 53.75903614457831, 'stdev': 18.89080248025719}}, 'el': {'train': {'max': 1182, 'min

Downloading (…)okenizer_config.json:   0%|          | 0.00/199 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]



  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [02:21<00:00, 2767.43it/s]


ar-train: 364 - 2 - 38.96246263069707 - 19.790556012438046


100%|██████████| 5010/5010 [00:01<00:00, 2577.41it/s]


ar-test: 106 - 8 - 36.34970059880239 - 13.476595551357542


100%|██████████| 2490/2490 [00:01<00:00, 2044.31it/s]


ar-validation: 81 - 7 - 35.791967871485944 - 13.071386319139553




  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [02:49<00:00, 2310.39it/s]


bg-train: 781 - 4 - 81.7038797867085 - 44.48071982281658


100%|██████████| 5010/5010 [00:01<00:00, 2554.50it/s]


bg-test: 194 - 14 - 81.79041916167665 - 30.848014329091853


100%|██████████| 2490/2490 [00:00<00:00, 2529.35it/s]


bg-validation: 226 - 15 - 80.92329317269076 - 29.801258355928205




  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [02:36<00:00, 2503.36it/s]


de-train: 592 - 2 - 57.345771093602785 - 30.320349270668505


100%|██████████| 5010/5010 [00:01<00:00, 2857.34it/s]


de-test: 129 - 10 - 54.02075848303393 - 19.536014345575502


100%|██████████| 2490/2490 [00:00<00:00, 2865.91it/s]


de-validation: 114 - 13 - 53.75903614457831 - 18.89080248025719




  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [03:05<00:00, 2120.52it/s]


el-train: 1182 - 2 - 122.14737892855142 - 66.7354068909114


100%|██████████| 5010/5010 [00:02<00:00, 1947.13it/s]


el-test: 294 - 18 - 117.19201596806387 - 44.810386236879815


100%|██████████| 2490/2490 [00:01<00:00, 2374.95it/s]


el-validation: 258 - 21 - 116.24096385542168 - 43.56449044791693




  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [02:23<00:00, 2738.30it/s]


en-train: 427 - 2 - 37.306866275190856 - 19.01464986471911


100%|██████████| 5010/5010 [00:01<00:00, 3140.03it/s]


en-test: 82 - 10 - 34.41377245508982 - 11.713728146322497


100%|██████████| 2490/2490 [00:00<00:00, 3119.40it/s]


en-validation: 70 - 10 - 34.24578313253012 - 11.529699395986169




  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [02:31<00:00, 2587.53it/s]


es-train: 377 - 2 - 39.40424800484846 - 20.23989709697351


100%|██████████| 5010/5010 [00:01<00:00, 3009.32it/s]


es-test: 87 - 9 - 37.342115768463074 - 13.435865352073634


100%|██████████| 2490/2490 [00:00<00:00, 2960.96it/s]


es-validation: 81 - 8 - 37.04056224899598 - 12.875350866086714




  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [02:35<00:00, 2523.34it/s]


fr-train: 453 - 3 - 42.1538367515317 - 21.623987379816427


100%|██████████| 5010/5010 [00:01<00:00, 2916.29it/s]


fr-test: 105 - 9 - 38.849900199600796 - 13.771641469791282


100%|██████████| 2490/2490 [00:00<00:00, 2491.67it/s]


fr-validation: 80 - 9 - 38.645381526104416 - 13.320875785088122




  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [03:15<00:00, 2008.04it/s]


hi-train: 571 - 3 - 61.529444718896265 - 33.729861381646714


100%|██████████| 5010/5010 [00:03<00:00, 1633.20it/s]


hi-test: 123 - 10 - 42.78562874251497 - 15.806715299415583


100%|██████████| 2490/2490 [00:01<00:00, 2359.34it/s]


hi-validation: 108 - 10 - 42.579518072289154 - 15.46184259635872




  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [02:52<00:00, 2277.43it/s]


ru-train: 798 - 4 - 81.23523689718921 - 44.125446423723155


100%|██████████| 5010/5010 [00:01<00:00, 2530.17it/s]


ru-test: 194 - 11 - 79.55788423153693 - 30.34823737371231


100%|██████████| 2490/2490 [00:01<00:00, 2468.32it/s]


ru-validation: 178 - 15 - 78.71807228915662 - 29.037491944942587




  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [02:21<00:00, 2774.59it/s]


sw-train: 374 - 2 - 38.81694516452679 - 20.04504993180702


100%|██████████| 5010/5010 [00:01<00:00, 3000.09it/s]


sw-test: 102 - 10 - 39.60518962075848 - 13.751393070928838


100%|██████████| 2490/2490 [00:01<00:00, 2303.89it/s]


sw-validation: 93 - 6 - 39.0578313253012 - 13.535995034053105




  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [03:23<00:00, 1933.93it/s]


th-train: 1760 - 5 - 158.97181832534594 - 87.25909636058833


100%|██████████| 5010/5010 [00:02<00:00, 2410.29it/s]


th-test: 363 - 26 - 141.99061876247504 - 55.129845927522624


100%|██████████| 2490/2490 [00:01<00:00, 2358.95it/s]


th-validation: 314 - 33 - 140.85863453815261 - 52.351868814939536




  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [02:27<00:00, 2663.34it/s]


tr-train: 550 - 2 - 59.557674776293474 - 31.229906506048803


100%|██████████| 5010/5010 [00:01<00:00, 2886.78it/s]


tr-test: 156 - 10 - 62.49620758483034 - 23.071988477031056


100%|██████████| 2490/2490 [00:00<00:00, 2972.94it/s]


tr-validation: 135 - 14 - 61.31004016064257 - 21.843590119658817




  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [02:28<00:00, 2644.74it/s]


ur-train: 351 - 2 - 36.837141140101146 - 18.410681392702774


100%|██████████| 5010/5010 [00:02<00:00, 1935.40it/s]


ur-test: 111 - 9 - 44.00479041916168 - 16.072228965459388


100%|██████████| 2490/2490 [00:00<00:00, 2718.60it/s]


ur-validation: 118 - 10 - 43.7004016064257 - 15.727786012365518




  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [02:48<00:00, 2336.73it/s]


vi-train: 467 - 4 - 46.00200406415043 - 23.063587743769865


100%|██████████| 5010/5010 [00:01<00:00, 2769.61it/s]


vi-test: 108 - 9 - 42.73912175648702 - 14.87209628570509


100%|██████████| 2490/2490 [00:00<00:00, 2640.38it/s]


vi-validation: 107 - 11 - 42.62449799196787 - 14.82635645165467




  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 392702/392702 [02:21<00:00, 2777.36it/s]


zh-train: 470 - 2 - 45.51596121231876 - 23.76952946260383


100%|██████████| 5010/5010 [00:01<00:00, 3581.86it/s]


zh-test: 77 - 8 - 28.10059880239521 - 10.471054087021606


100%|██████████| 2490/2490 [00:00<00:00, 3586.71it/s]


zh-validation: 69 - 5 - 27.842168674698794 - 10.364423477054066
bigscience/bloomz-3b
{'tokenizer': 'bigscience/bloomz-3b', 'ar': {'train': {'max': 364, 'min': 2, 'mean': 38.96246263069707, 'stdev': 19.790556012438046}, 'test': {'max': 106, 'min': 8, 'mean': 36.34970059880239, 'stdev': 13.476595551357542}, 'validation': {'max': 81, 'min': 7, 'mean': 35.791967871485944, 'stdev': 13.071386319139553}}, 'bg': {'train': {'max': 781, 'min': 4, 'mean': 81.7038797867085, 'stdev': 44.48071982281658}, 'test': {'max': 194, 'min': 14, 'mean': 81.79041916167665, 'stdev': 30.848014329091853}, 'validation': {'max': 226, 'min': 15, 'mean': 80.92329317269076, 'stdev': 29.801258355928205}}, 'de': {'train': {'max': 592, 'min': 2, 'mean': 57.345771093602785, 'stdev': 30.320349270668505}, 'test': {'max': 129, 'min': 10, 'mean': 54.02075848303393, 'stdev': 19.536014345575502}, 'validation': {'max': 114, 'min': 13, 'mean': 53.75903614457831, 'stdev': 18.89080248025719}}, 'el': {'train': {'max': 1182, 'min': 2