# AYA



In [2]:
from transformers import AutoTokenizer
from datasets import load_dataset
import pandas as pd
import re
import json

# Model whose tokenizer is evaluated in this cell
model_name = "CohereForAI/aya-expanse-8b"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Language codes to process (used to pick the wikimedia/wikipedia subset)
languages = [
    "en",            # English
    "de",            # German
    "ar",            # Arabic
    "ru",            # Russian
    "be",            # Belarusian
    "mn",            # Mongolian
    "zh-classical",  # Classical Chinese
]

# Splitting a text into sentences using a regex-based approach.
def split_into_sentences(text):
    """Return the non-empty, stripped sentences found in ``text``.

    A split happens at whitespace that follows '.', '?' or '!', unless the
    preceding characters match one of the two negative lookbehinds
    (patterns such as "e.g." or "Mr."). The pattern uses a capturing group,
    so ``re.split`` also yields the whitespace separators; they are dropped
    because they are empty after stripping.
    """
    sentence_endings = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)(\s|\n)'
    cleaned = []
    for piece in re.split(sentence_endings, text):
        piece = piece.strip()
        if piece:
            cleaned.append(piece)
    return cleaned

# Pre-Tokenization: Grapheme Pair Encoding
def apply_gpe(text):
    """Cut ``text`` into consecutive two-character chunks joined by spaces.

    Chunks are taken by string index (Unicode code points), so a combining
    mark counts as a character of its own. When the length is odd, the last
    chunk holds a single character.
    """
    chunks = []
    for start in range(0, len(text), 2):
        chunks.append(text[start:start + 2])
    return " ".join(chunks)

# Tokenization Function
def tokenize(texts, tokenizer):
    """Tokenize every text and record how many tokens each one produced.

    Args:
        texts: Iterable of strings.
        tokenizer: Object exposing ``tokenize(text)`` that returns a sized
            sequence of tokens.

    Returns:
        Dict with "tokenized_texts" (one token sequence per text) and
        "token_counts" (the length of each sequence, in the same order).
    """
    tokenized_texts = []
    token_counts = []
    for text in texts:
        tokens = tokenizer.tokenize(text)
        tokenized_texts.append(tokens)
        token_counts.append(len(tokens))
    return {
        "tokenized_texts": tokenized_texts,
        "token_counts": token_counts,
    }

# Metric Calculation Functions
# Compression Ratio = Number of Characters / Number of Tokens
def calculate_max_compression_ratio(texts, token_counts):
    """Return the largest characters-per-token ratio over the paired inputs.

    ``texts`` and ``token_counts`` are paired positionally; pairing stops at
    the shorter of the two. A text whose token count is not positive
    contributes a ratio of 0. Returns 0 when there are no pairs.
    """
    def ratio(text, n_tokens):
        if n_tokens > 0:
            return len(text) / n_tokens
        return 0

    return max(map(ratio, texts, token_counts), default=0)

# Min Tokenization Parity = Minimum Token Count / Maximum Token Count
def calculate_min_tokenization_parity(token_counts):
    """Return min(token_counts) / max(token_counts).

    Returns 0 when ``token_counts`` is empty or its maximum is not positive.
    """
    if not token_counts:
        return 0
    largest = max(token_counts)
    if largest > 0:
        return min(token_counts) / largest
    return 0

# Main Processing Function
# Process a single language: tokenize with and without GPE, calculate metrics, and return results.
def process_language(lang, tokenizer):
    """Compute tokenization metrics for one Wikipedia language subset.

    Loads the "train" split of ``wikimedia/wikipedia`` for the config
    ``20231101.<lang>``, splits the first 1000 documents into sentences,
    keeps the first 1000 sentences, and tokenizes them both as-is and after
    ``apply_gpe``. The first sentence, its two tokenizations and all metrics
    are printed.

    Args:
        lang: Language code appended to the ``20231101.`` config prefix.
        tokenizer: Object handed to ``tokenize`` (must expose
            ``tokenize(text)``).

    Returns:
        A dict with "Language" plus the four metric entries, or None when no
        sentences were found or any step raised an exception.
    """
    print(f"\nProcessing dataset for language: {lang}")
    try:
        # Loading the dataset and extracting the text.
        dataset_lang = load_dataset("wikimedia/wikipedia", f"20231101.{lang}", split="train")
        # Slice rows first, then pick the column: column-first access
        # (dataset_lang["text"][:1000]) can materialize the text of every
        # article before slicing, which is slow and memory-hungry on large
        # subsets.
        documents = dataset_lang[:1000]["text"]

        # Sentence splitting; stop once enough sentences are collected,
        # since only the first 1000 are kept anyway.
        sentences = []
        for doc in documents:
            sentences.extend(split_into_sentences(doc))
            if len(sentences) >= 1000:
                break
        sentences = sentences[:1000]

        if not sentences:
            # Report the real cause instead of an IndexError on sentences[0].
            print(f"Error processing {lang}: no sentences found")
            return None

        # Tokenizing without GPE
        tokenized_data_without_gpe = tokenize(sentences, tokenizer)

        # Tokenizing with GPE
        gpe_sentences = [apply_gpe(sentence) for sentence in sentences]
        tokenized_data_with_gpe = tokenize(gpe_sentences, tokenizer)

        # Displaying the first sentence and tokenized forms
        print(f"\nLanguage: {lang}")
        print(f"Original Sentence: {sentences[0]}")
        print(f"Tokenized Sentence (No GPE): {tokenized_data_without_gpe['tokenized_texts'][0]}")
        print(f"Tokenized Sentence (With GPE): {tokenized_data_with_gpe['tokenized_texts'][0]}")

        # Computing metrics. Note: the "With GPE" compression ratio counts the
        # characters of the space-joined GPE text, not of the original sentence.
        metrics = {
            "Max Compression (No GPE)": calculate_max_compression_ratio(sentences, tokenized_data_without_gpe["token_counts"]),
            "Min Parity (No GPE)": calculate_min_tokenization_parity(tokenized_data_without_gpe["token_counts"]),
            "Max Compression (With GPE)": calculate_max_compression_ratio(gpe_sentences, tokenized_data_with_gpe["token_counts"]),
            "Min Parity (With GPE)": calculate_min_tokenization_parity(tokenized_data_with_gpe["token_counts"]),
        }

        # Displaying metrics
        print("\nMetrics for language:", lang)
        for metric_name, value in metrics.items():
            print(f"  {metric_name}: {value}")

        # Storing results
        return {
            "Language": lang,
            **metrics,
        }

    except Exception as e:
        # Best-effort: a failure for one language is reported and skipped so
        # the remaining languages can still be processed by the caller.
        print(f"Error processing {lang}: {e}")
        return None

# Execution for all languages: keep only the runs that produced a result
results = []
for lang in languages:
    result = process_language(lang, tokenizer)
    if not result:
        # process_language returns None on failure; skip that language.
        continue
    results.append(result)

# Saving results to JSON
results_json = {"results": results}
with open("computations_aya.json", "w") as json_file:
    json_file.write(json.dumps(results_json, indent=4))
print("\nResults saved to computations_aya.json")



Processing dataset for language: en


Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/41 [00:00<?, ?it/s]


Language: en
Original Sentence: Anarchism is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions it claims maintain unnecessary coercion and hierarchy, typically including nation-states, and capitalism.
Tokenized Sentence (No GPE): ['An', 'arch', 'ism', 'Ġis', 'Ġa', 'Ġpolitical', 'Ġphilosophy', 'Ġand', 'Ġmovement', 'Ġthat', 'Ġis', 'Ġskeptical', 'Ġof', 'Ġall', 'Ġjust', 'ifications', 'Ġfor', 'Ġauthority', 'Ġand', 'Ġseeks', 'Ġto', 'Ġabolish', 'Ġthe', 'Ġinstitutions', 'Ġit', 'Ġclaims', 'Ġmaintain', 'Ġunnecessary', 'Ġcoercion', 'Ġand', 'Ġhierarchy', ',', 'Ġtypically', 'Ġincluding', 'Ġnation', '-', 'states', ',', 'Ġand', 'Ġcapitalism', '.']
Tokenized Sentence (With GPE): ['An', 'Ġar', 'Ġch', 'Ġis', 'Ġm', 'Ġ', 'Ġis', 'Ġ', 'Ġa', 'Ġ', 'Ġp', 'Ġol', 'Ġit', 'Ġic', 'Ġal', 'Ġ', 'Ġp', 'Ġhi', 'Ġlo', 'Ġso', 'Ġph', 'Ġy', 'Ġ', 'Ġan', 'Ġd', 'Ġ', 'Ġmo', 'Ġve', 'Ġme', 'Ġnt', 'Ġ', 'Ġt', 'Ġha', 'Ġt', 'Ġ', 'Ġis', 'Ġ', 'Ġs', 'Ġke', 'Ġp

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/20 [00:00<?, ?it/s]


Language: de
Original Sentence: Alan Smithee steht als Pseudonym für einen fiktiven Regisseur, der Filme verantwortet, bei denen der eigentliche Regisseur seinen Namen nicht mit dem Werk in Verbindung gebracht haben möchte.
Tokenized Sentence (No GPE): ['Alan', 'ĠSmit', 'hee', 'Ġsteht', 'Ġals', 'ĠPseud', 'onym', 'ĠfÃ¼r', 'Ġeinen', 'Ġfikt', 'iven', 'ĠRegisseur', ',', 'Ġder', 'ĠFilme', 'Ġverantwort', 'et', ',', 'Ġbei', 'Ġdenen', 'Ġder', 'Ġeigent', 'liche', 'ĠRegisseur', 'Ġseinen', 'ĠNamen', 'Ġnicht', 'Ġmit', 'Ġdem', 'ĠWerk', 'Ġin', 'ĠVerbindung', 'Ġgebracht', 'Ġhaben', 'ĠmÃ¶chte', '.']
Tokenized Sentence (With GPE): ['Al', 'Ġan', 'Ġ', 'ĠS', 'Ġmi', 'Ġth', 'Ġee', 'Ġ', 'Ġs', 'Ġte', 'Ġht', 'Ġ', 'Ġa', 'Ġls', 'Ġ', 'ĠP', 'Ġse', 'Ġud', 'Ġon', 'Ġy', 'm', 'Ġ', 'Ġf', 'ĠÃ¼r', 'Ġ', 'Ġe', 'Ġin', 'Ġen', 'Ġ', 'Ġf', 'Ġik', 'Ġti', 'Ġve', 'Ġn', 'Ġ', 'ĠRe', 'Ġgi', 'Ġss', 'Ġeu', 'Ġr', ',', 'Ġ', 'Ġd', 'Ġer', 'Ġ', 'ĠF', 'Ġil', 'Ġme', 'Ġ', 'Ġv', 'Ġer', 'Ġan', 'Ġtw', 'Ġor', 'Ġte', 'Ġt', ',', 'Ġ', 'Ġb', 'Ġei', '

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]


Language: ru
Original Sentence: Литва́ ( ), официальное название — Лито́вская Респу́блика () — государство, расположенное в Северной Европе.
Tokenized Sentence (No GPE): ['ÐĽ', 'Ð¸ÑĤÐ²Ð°', 'Ìģ', 'Ġ(', 'Ġ),', 'ĠÐ¾ÑĦÐ¸ÑĨÐ¸Ð°Ð»ÑĮ', 'Ð½Ð¾Ðµ', 'ĠÐ½Ð°Ð·Ð²Ð°Ð½Ð¸Ðµ', 'Âł', 'âĢĶ', 'ĠÐĽÐ¸ÑĤ', 'Ð¾', 'Ìģ', 'Ð²', 'ÑģÐºÐ°Ñı', 'ĠÐłÐµÑģÐ¿', 'Ñĥ', 'Ìģ', 'Ð±', 'Ð»Ð¸ÐºÐ°', 'Ġ()', 'Âł', 'âĢĶ', 'ĠÐ³Ð¾ÑģÑĥÐ´Ð°ÑĢÑģÑĤÐ²Ð¾', ',', 'ĠÑĢÐ°ÑģÐ¿Ð¾Ð»Ð¾Ð¶ÐµÐ½', 'Ð½Ð¾Ðµ', 'ĠÐ²', 'ĠÐ¡ÐµÐ²ÐµÑĢÐ½Ð¾Ð¹', 'ĠÐķÐ²ÑĢÐ¾Ð¿Ðµ', '.']
Tokenized Sentence (With GPE): ['ÐĽÐ¸', 'ĠÑĤÐ²', 'ĠÐ°', 'Ìģ', 'Ġ', 'Ġ(', 'Ġ', 'Ġ)', 'Ġ,', 'Ġ', 'ĠÐ¾ÑĦ', 'ĠÐ¸', 'ÑĨ', 'ĠÐ¸', 'Ð°', 'ĠÐ»ÑĮ', 'ĠÐ½Ð¾', 'ĠÐµ', 'Ġ', 'ĠÐ½Ð°', 'ĠÐ·Ð²', 'ĠÐ°Ð½', 'ĠÐ¸', 'Ðµ', 'Ġ', 'Âł', 'âĢĶ', 'Ġ', 'ĠÐĽ', 'ĠÐ¸ÑĤ', 'ĠÐ¾', 'Ìģ', 'ĠÐ²Ñģ', 'ĠÐºÐ°', 'ĠÑı', 'Ġ', 'ĠÐłÐµ', 'ĠÑģÐ¿', 'ĠÑĥ', 'Ìģ', 'ĠÐ±Ð»', 'ĠÐ¸', 'Ðº', 'ĠÐ°', 'Ġ', 'Ġ()', 'Ġ', 'Âł', 'âĢĶ', 'Ġ', 'ĠÐ³', 'ĠÐ¾Ñģ', 'ĠÑĥÐ´', 'ĠÐ°ÑĢ', 'ĠÑģÑĤ', 'ĠÐ²Ð¾', 'Ġ,', 'Ġ', 'ĠÑĢÐ°', 'ĠÑģÐ¿', 'ĠÐ¾', 'Ð»', 'ĠÐ¾Ð¶', 'ĠÐµÐ½', 'ĠÐ½Ð¾', 'ĠÐµ', 

# BLOOM

In [3]:
from transformers import AutoTokenizer
from datasets import load_dataset
import pandas as pd
import re
import json

# Model whose tokenizer is evaluated in this cell
model_name = "bigscience/bloom"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Language codes to process (used to pick the wikimedia/wikipedia subset)
languages = [
    "en",            # English
    "de",            # German
    "ar",            # Arabic
    "ru",            # Russian
    "be",            # Belarusian
    "mn",            # Mongolian
    "zh-classical",  # Classical Chinese
]

# Splitting a text into sentences using a regex-based approach.
def split_into_sentences(text):
    """Return the non-empty, stripped sentences found in ``text``.

    A split happens at whitespace that follows '.', '?' or '!', unless the
    preceding characters match one of the two negative lookbehinds
    (patterns such as "e.g." or "Mr."). The pattern uses a capturing group,
    so ``re.split`` also yields the whitespace separators; they are dropped
    because they are empty after stripping.
    """
    sentence_endings = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)(\s|\n)'
    cleaned = []
    for piece in re.split(sentence_endings, text):
        piece = piece.strip()
        if piece:
            cleaned.append(piece)
    return cleaned

# Pre-Tokenization: Grapheme Pair Encoding
def apply_gpe(text):
    """Cut ``text`` into consecutive two-character chunks joined by spaces.

    Chunks are taken by string index (Unicode code points), so a combining
    mark counts as a character of its own. When the length is odd, the last
    chunk holds a single character.
    """
    chunks = []
    for start in range(0, len(text), 2):
        chunks.append(text[start:start + 2])
    return " ".join(chunks)

# Tokenization Function
def tokenize(texts, tokenizer):
    """Tokenize every text and record how many tokens each one produced.

    Args:
        texts: Iterable of strings.
        tokenizer: Object exposing ``tokenize(text)`` that returns a sized
            sequence of tokens.

    Returns:
        Dict with "tokenized_texts" (one token sequence per text) and
        "token_counts" (the length of each sequence, in the same order).
    """
    tokenized_texts = []
    token_counts = []
    for text in texts:
        tokens = tokenizer.tokenize(text)
        tokenized_texts.append(tokens)
        token_counts.append(len(tokens))
    return {
        "tokenized_texts": tokenized_texts,
        "token_counts": token_counts,
    }

# Metric Calculation Functions
# Compression Ratio = Number of Characters / Number of Tokens
def calculate_max_compression_ratio(texts, token_counts):
    """Return the largest characters-per-token ratio over the paired inputs.

    ``texts`` and ``token_counts`` are paired positionally; pairing stops at
    the shorter of the two. A text whose token count is not positive
    contributes a ratio of 0. Returns 0 when there are no pairs.
    """
    def ratio(text, n_tokens):
        if n_tokens > 0:
            return len(text) / n_tokens
        return 0

    return max(map(ratio, texts, token_counts), default=0)


# Min Tokenization Parity = Minimum Token Count / Maximum Token Count
def calculate_min_tokenization_parity(token_counts):
    """Return min(token_counts) / max(token_counts).

    Returns 0 when ``token_counts`` is empty or its maximum is not positive.
    """
    if not token_counts:
        return 0
    largest = max(token_counts)
    if largest > 0:
        return min(token_counts) / largest
    return 0

# Main Processing Function
# Process a single language: tokenize with and without GPE, calculate metrics, and return results.
def process_language(lang, tokenizer):
    """Compute tokenization metrics for one Wikipedia language subset.

    Loads the "train" split of ``wikimedia/wikipedia`` for the config
    ``20231101.<lang>``, splits the first 1000 documents into sentences,
    keeps the first 1000 sentences, and tokenizes them both as-is and after
    ``apply_gpe``. The first sentence, its two tokenizations and all metrics
    are printed.

    Args:
        lang: Language code appended to the ``20231101.`` config prefix.
        tokenizer: Object handed to ``tokenize`` (must expose
            ``tokenize(text)``).

    Returns:
        A dict with "Language" plus the four metric entries, or None when no
        sentences were found or any step raised an exception.
    """
    print(f"\nProcessing dataset for language: {lang}")
    try:
        # Loading the dataset and extracting the text.
        dataset_lang = load_dataset("wikimedia/wikipedia", f"20231101.{lang}", split="train")
        # Slice rows first, then pick the column: column-first access
        # (dataset_lang["text"][:1000]) can materialize the text of every
        # article before slicing, which is slow and memory-hungry on large
        # subsets.
        documents = dataset_lang[:1000]["text"]

        # Sentence splitting; stop once enough sentences are collected,
        # since only the first 1000 are kept anyway.
        sentences = []
        for doc in documents:
            sentences.extend(split_into_sentences(doc))
            if len(sentences) >= 1000:
                break
        sentences = sentences[:1000]

        if not sentences:
            # Report the real cause instead of an IndexError on sentences[0].
            print(f"Error processing {lang}: no sentences found")
            return None

        # Tokenizing without GPE
        tokenized_data_without_gpe = tokenize(sentences, tokenizer)

        # Tokenizing with GPE
        gpe_sentences = [apply_gpe(sentence) for sentence in sentences]
        tokenized_data_with_gpe = tokenize(gpe_sentences, tokenizer)

        # Displaying the first sentence and tokenized forms
        print(f"\nLanguage: {lang}")
        print(f"Original Sentence: {sentences[0]}")
        print(f"Tokenized Sentence (No GPE): {tokenized_data_without_gpe['tokenized_texts'][0]}")
        print(f"Tokenized Sentence (With GPE): {tokenized_data_with_gpe['tokenized_texts'][0]}")

        # Computing metrics. Note: the "With GPE" compression ratio counts the
        # characters of the space-joined GPE text, not of the original sentence.
        metrics = {
            "Max Compression (No GPE)": calculate_max_compression_ratio(sentences, tokenized_data_without_gpe["token_counts"]),
            "Min Parity (No GPE)": calculate_min_tokenization_parity(tokenized_data_without_gpe["token_counts"]),
            "Max Compression (With GPE)": calculate_max_compression_ratio(gpe_sentences, tokenized_data_with_gpe["token_counts"]),
            "Min Parity (With GPE)": calculate_min_tokenization_parity(tokenized_data_with_gpe["token_counts"]),
        }

        # Displaying metrics
        print("\nMetrics for language:", lang)
        for metric_name, value in metrics.items():
            print(f"  {metric_name}: {value}")

        # Storing results
        return {
            "Language": lang,
            **metrics,
        }

    except Exception as e:
        # Best-effort: a failure for one language is reported and skipped so
        # the remaining languages can still be processed by the caller.
        print(f"Error processing {lang}: {e}")
        return None

# Execution for all languages: keep only the runs that produced a result
results = []
for lang in languages:
    result = process_language(lang, tokenizer)
    if not result:
        # process_language returns None on failure; skip that language.
        continue
    results.append(result)

# Saving results to JSON
results_json = {"results": results}
with open("computations_bloom.json", "w") as json_file:
    json_file.write(json.dumps(results_json, indent=4))
print("\nResults saved to computations_bloom.json")


tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]


Processing dataset for language: en


Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/41 [00:00<?, ?it/s]


Language: en
Original Sentence: Anarchism is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions it claims maintain unnecessary coercion and hierarchy, typically including nation-states, and capitalism.
Tokenized Sentence (No GPE): ['An', 'arch', 'ism', 'Ġis', 'Ġa', 'Ġpolitical', 'Ġphilosophy', 'Ġand', 'Ġmovement', 'Ġthat', 'Ġis', 'Ġsk', 'ep', 'tical', 'Ġof', 'Ġall', 'Ġjust', 'ifications', 'Ġfor', 'Ġauthority', 'Ġand', 'Ġseeks', 'Ġto', 'Ġabol', 'ish', 'Ġthe', 'Ġinstitutions', 'Ġit', 'Ġclaims', 'Ġmaintain', 'Ġunnecessary', 'Ġcoerc', 'ion', 'Ġand', 'Ġhierarchy', ',', 'Ġtypically', 'Ġincluding', 'Ġnation', '-st', 'ates', ',', 'Ġand', 'Ġcapital', 'ism', '.']
Tokenized Sentence (With GPE): ['An', 'Ġar', 'Ġch', 'Ġis', 'Ġm', 'Ġ', 'Ġis', 'Ġ', 'Ġa', 'Ġ', 'Ġp', 'Ġol', 'Ġit', 'Ġic', 'Ġal', 'Ġ', 'Ġp', 'Ġhi', 'Ġlo', 'Ġso', 'Ġph', 'Ġy', 'Ġ', 'Ġan', 'Ġd', 'Ġ', 'Ġmo', 'Ġve', 'Ġme', 'Ġnt', 'Ġ', 'Ġt', 'Ġha', 'Ġt', 'Ġ', 'Ġis', '

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/20 [00:00<?, ?it/s]


Language: de
Original Sentence: Alan Smithee steht als Pseudonym für einen fiktiven Regisseur, der Filme verantwortet, bei denen der eigentliche Regisseur seinen Namen nicht mit dem Werk in Verbindung gebracht haben möchte.
Tokenized Sentence (No GPE): ['Alan', 'ĠS', 'mit', 'hee', 'Ġste', 'ht', 'Ġals', 'ĠPseud', 'onym', 'ĠfÃ¼r', 'Ġeinen', 'Ġfik', 't', 'iven', 'ĠReg', 'isseur', ',', 'Ġder', 'ĠFil', 'me', 'Ġver', 'ant', 'wort', 'et', ',', 'Ġbei', 'Ġdenen', 'Ġder', 'Ġe', 'igent', 'liche', 'ĠReg', 'isseur', 'Ġse', 'inen', 'ĠNam', 'en', 'Ġnicht', 'Ġmit', 'Ġdem', 'ĠW', 'erk', 'Ġin', 'ĠVerb', 'indung', 'Ġgeb', 'r', 'acht', 'Ġhaben', 'ĠmÃ¶', 'ch', 'te', '.']
Tokenized Sentence (With GPE): ['Al', 'Ġan', 'Ġ', 'ĠS', 'Ġmi', 'Ġth', 'Ġee', 'Ġ', 'Ġs', 'Ġte', 'Ġht', 'Ġ', 'Ġa', 'Ġls', 'Ġ', 'ĠP', 'Ġse', 'Ġud', 'Ġon', 'Ġy', 'm', 'Ġ', 'Ġf', 'ĠÃ¼', 'r', 'Ġ', 'Ġe', 'Ġin', 'Ġen', 'Ġ', 'Ġf', 'Ġik', 'Ġti', 'Ġve', 'Ġn', 'Ġ', 'ĠRe', 'Ġgi', 'Ġss', 'Ġeu', 'Ġr', ',Ġ', 'Ġd', 'Ġer', 'Ġ', 'ĠF', 'Ġil', 'Ġme', 'Ġ', 'Ġv

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]


Language: ru
Original Sentence: Литва́ ( ), официальное название — Лито́вская Респу́блика () — государство, расположенное в Северной Европе.
Tokenized Sentence (No GPE): ['ÐĽ', 'Ð¸ÑĤ', 'Ð²Ð°Ìģ', 'Ġ(', 'Ġ),', 'ĠÐ¾', 'ÑĦÐ¸', 'ÑĨÐ¸', 'Ð°Ð»ÑĮ', 'Ð½Ð¾Ðµ', 'ĠÐ½Ð°', 'Ð·', 'Ð²Ð°', 'Ð½Ð¸Ðµ', 'Âł', 'âĢĶ', 'ĠÐĽ', 'Ð¸ÑĤ', 'Ð¾Ìģ', 'Ð²', 'ÑģÐºÐ°Ñı', 'ĠÐłÐµÑģÐ¿', 'ÑĥÌģ', 'Ð±', 'Ð»Ð¸', 'ÐºÐ°', 'Ġ()', 'Âł', 'âĢĶ', 'ĠÐ³', 'Ð¾Ñģ', 'Ñĥ', 'Ð´Ð°ÑĢ', 'ÑģÑĤÐ²Ð¾', ',', 'ĠÑĢÐ°', 'ÑģÐ¿', 'Ð¾Ð»', 'Ð¾Ð¶', 'ÐµÐ½', 'Ð½Ð¾Ðµ', 'ĠÐ²', 'ĠÐ¡', 'ÐµÐ²', 'ÐµÑĢ', 'Ð½Ð¾Ð¹', 'ĠÐķ', 'Ð²', 'ÑĢ', 'Ð¾Ð¿', 'Ðµ', '.']
Tokenized Sentence (With GPE): ['ÐĽ', 'Ð¸', 'ĠÑĤ', 'Ð²', 'ĠÐ°', 'Ìģ', 'ĠĠ(', 'ĠĠ', ')Ġ,', 'Ġ', 'ĠÐ¾', 'ÑĦ', 'ĠÐ¸', 'ÑĨ', 'ĠÐ¸', 'Ð°', 'ĠÐ»', 'ÑĮ', 'ĠÐ½Ð¾', 'ĠÐµ', 'Ġ', 'ĠÐ½Ð°', 'ĠÐ·', 'Ð²', 'ĠÐ°', 'Ð½', 'ĠÐ¸', 'Ðµ', 'ĠÂł', 'âĢĶ', 'Ġ', 'ĠÐĽ', 'ĠÐ¸', 'ÑĤ', 'ĠÐ¾', 'Ìģ', 'ĠÐ²', 'Ñģ', 'ĠÐºÐ°', 'ĠÑı', 'Ġ', 'ĠÐł', 'Ðµ', 'ĠÑģÐ¿', 'ĠÑĥ', 'Ìģ', 'ĠÐ±', 'Ð»', 'ĠÐ¸', 'Ðº', 'ĠÐ°', 'ĠĠ', '()', 'ĠÂł', 'âĢĶ', 'Ġ', 'ĠÐ³', 'ĠÐ¾Ñģ', 'ĠÑĥ

# EMMA500

In [4]:
from transformers import AutoTokenizer
from datasets import load_dataset
import pandas as pd
import re
import json

# Model whose tokenizer is evaluated in this cell
model_name = "MaLA-LM/emma-500-llama2-7b"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Language codes to process (used to pick the wikimedia/wikipedia subset)
languages = [
    "en",            # English
    "de",            # German
    "ar",            # Arabic
    "ru",            # Russian
    "be",            # Belarusian
    "mn",            # Mongolian
    "zh-classical",  # Classical Chinese
]

# Splitting a text into sentences using a regex-based approach.
def split_into_sentences(text):
    """Return the non-empty, stripped sentences found in ``text``.

    A split happens at whitespace that follows '.', '?' or '!', unless the
    preceding characters match one of the two negative lookbehinds
    (patterns such as "e.g." or "Mr."). The pattern uses a capturing group,
    so ``re.split`` also yields the whitespace separators; they are dropped
    because they are empty after stripping.
    """
    sentence_endings = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)(\s|\n)'
    cleaned = []
    for piece in re.split(sentence_endings, text):
        piece = piece.strip()
        if piece:
            cleaned.append(piece)
    return cleaned

# Pre-Tokenization: Grapheme Pair Encoding
def apply_gpe(text):
    """Cut ``text`` into consecutive two-character chunks joined by spaces.

    Chunks are taken by string index (Unicode code points), so a combining
    mark counts as a character of its own. When the length is odd, the last
    chunk holds a single character.
    """
    chunks = []
    for start in range(0, len(text), 2):
        chunks.append(text[start:start + 2])
    return " ".join(chunks)

# Tokenization Function
def tokenize(texts, tokenizer):
    """Tokenize every text and record how many tokens each one produced.

    Args:
        texts: Iterable of strings.
        tokenizer: Object exposing ``tokenize(text)`` that returns a sized
            sequence of tokens.

    Returns:
        Dict with "tokenized_texts" (one token sequence per text) and
        "token_counts" (the length of each sequence, in the same order).
    """
    tokenized_texts = []
    token_counts = []
    for text in texts:
        tokens = tokenizer.tokenize(text)
        tokenized_texts.append(tokens)
        token_counts.append(len(tokens))
    return {
        "tokenized_texts": tokenized_texts,
        "token_counts": token_counts,
    }

# Metric Calculation Functions
# Compression Ratio = Number of Characters / Number of Tokens
def calculate_max_compression_ratio(texts, token_counts):
    """Return the largest characters-per-token ratio over the paired inputs.

    ``texts`` and ``token_counts`` are paired positionally; pairing stops at
    the shorter of the two. A text whose token count is not positive
    contributes a ratio of 0. Returns 0 when there are no pairs.
    """
    def ratio(text, n_tokens):
        if n_tokens > 0:
            return len(text) / n_tokens
        return 0

    return max(map(ratio, texts, token_counts), default=0)

# Min Tokenization Parity = Minimum Token Count / Maximum Token Count
def calculate_min_tokenization_parity(token_counts):
    """Return min(token_counts) / max(token_counts).

    Returns 0 when ``token_counts`` is empty or its maximum is not positive.
    """
    if not token_counts:
        return 0
    largest = max(token_counts)
    if largest > 0:
        return min(token_counts) / largest
    return 0

# Main Processing Function
# Process a single language: tokenize with and without GPE, calculate metrics, and return results.
def process_language(lang, tokenizer):
    """Compute tokenization metrics for one Wikipedia language subset.

    Loads the "train" split of ``wikimedia/wikipedia`` for the config
    ``20231101.<lang>``, splits the first 1000 documents into sentences,
    keeps the first 1000 sentences, and tokenizes them both as-is and after
    ``apply_gpe``. The first sentence, its two tokenizations and all metrics
    are printed.

    Args:
        lang: Language code appended to the ``20231101.`` config prefix.
        tokenizer: Object handed to ``tokenize`` (must expose
            ``tokenize(text)``).

    Returns:
        A dict with "Language" plus the four metric entries, or None when no
        sentences were found or any step raised an exception.
    """
    print(f"\nProcessing dataset for language: {lang}")
    try:
        # Loading the dataset and extracting the text.
        dataset_lang = load_dataset("wikimedia/wikipedia", f"20231101.{lang}", split="train")
        # Slice rows first, then pick the column: column-first access
        # (dataset_lang["text"][:1000]) can materialize the text of every
        # article before slicing, which is slow and memory-hungry on large
        # subsets.
        documents = dataset_lang[:1000]["text"]

        # Sentence splitting; stop once enough sentences are collected,
        # since only the first 1000 are kept anyway.
        sentences = []
        for doc in documents:
            sentences.extend(split_into_sentences(doc))
            if len(sentences) >= 1000:
                break
        sentences = sentences[:1000]

        if not sentences:
            # Report the real cause instead of an IndexError on sentences[0].
            print(f"Error processing {lang}: no sentences found")
            return None

        # Tokenizing without GPE
        tokenized_data_without_gpe = tokenize(sentences, tokenizer)

        # Tokenizing with GPE
        gpe_sentences = [apply_gpe(sentence) for sentence in sentences]
        tokenized_data_with_gpe = tokenize(gpe_sentences, tokenizer)

        # Displaying the first sentence and tokenized forms
        print(f"\nLanguage: {lang}")
        print(f"Original Sentence: {sentences[0]}")
        print(f"Tokenized Sentence (No GPE): {tokenized_data_without_gpe['tokenized_texts'][0]}")
        print(f"Tokenized Sentence (With GPE): {tokenized_data_with_gpe['tokenized_texts'][0]}")

        # Computing metrics. Note: the "With GPE" compression ratio counts the
        # characters of the space-joined GPE text, not of the original sentence.
        metrics = {
            "Max Compression (No GPE)": calculate_max_compression_ratio(sentences, tokenized_data_without_gpe["token_counts"]),
            "Min Parity (No GPE)": calculate_min_tokenization_parity(tokenized_data_without_gpe["token_counts"]),
            "Max Compression (With GPE)": calculate_max_compression_ratio(gpe_sentences, tokenized_data_with_gpe["token_counts"]),
            "Min Parity (With GPE)": calculate_min_tokenization_parity(tokenized_data_with_gpe["token_counts"]),
        }

        # Displaying metrics
        print("\nMetrics for language:", lang)
        for metric_name, value in metrics.items():
            print(f"  {metric_name}: {value}")

        # Storing results
        return {
            "Language": lang,
            **metrics,
        }

    except Exception as e:
        # Best-effort: a failure for one language is reported and skipped so
        # the remaining languages can still be processed by the caller.
        print(f"Error processing {lang}: {e}")
        return None

# Execution for all languages: keep only the runs that produced a result
results = []
for lang in languages:
    result = process_language(lang, tokenizer)
    if not result:
        # process_language returns None on failure; skip that language.
        continue
    results.append(result)

# Saving results to JSON
results_json = {"results": results}
with open("computations_emma500.json", "w") as json_file:
    json_file.write(json.dumps(results_json, indent=4))
print("\nResults saved to computations_emma500.json")


tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.



Processing dataset for language: en


Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/41 [00:00<?, ?it/s]


Language: en
Original Sentence: Anarchism is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions it claims maintain unnecessary coercion and hierarchy, typically including nation-states, and capitalism.
Tokenized Sentence (No GPE): ['▁An', 'arch', 'ism', '▁is', '▁a', '▁political', '▁philosophy', '▁and', '▁movement', '▁that', '▁is', '▁ske', 'pt', 'ical', '▁of', '▁all', '▁just', 'ifications', '▁for', '▁authority', '▁and', '▁see', 'ks', '▁to', '▁abol', 'ish', '▁the', '▁institutions', '▁it', '▁claims', '▁maintain', '▁unnecessary', '▁co', 'erc', 'ion', '▁and', '▁hierarchy', ',', '▁typically', '▁including', '▁nation', '-', 'states', ',', '▁and', '▁capital', 'ism', '.']
Tokenized Sentence (With GPE): ['▁An', '▁ar', '▁ch', '▁is', '▁m', '▁', '▁is', '▁', '▁a', '▁', '▁p', '▁ol', '▁it', '▁ic', '▁al', '▁', '▁p', '▁hi', '▁lo', '▁so', '▁ph', '▁y', '▁', '▁an', '▁d', '▁', '▁mo', '▁ve', '▁me', '▁n', 't', '▁', '▁t', '▁ha', '▁t',

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/20 [00:00<?, ?it/s]


Language: de
Original Sentence: Alan Smithee steht als Pseudonym für einen fiktiven Regisseur, der Filme verantwortet, bei denen der eigentliche Regisseur seinen Namen nicht mit dem Werk in Verbindung gebracht haben möchte.
Tokenized Sentence (No GPE): ['▁Alan', '▁S', 'mit', 'he', 'e', '▁steht', '▁als', '▁Pseud', 'onym', '▁für', '▁einen', '▁f', 'ikt', 'iven', '▁Reg', 'isseur', ',', '▁der', '▁Fil', 'me', '▁ver', 'ant', 'wort', 'et', ',', '▁bei', '▁denen', '▁der', '▁eig', 'ent', 'liche', '▁Reg', 'isseur', '▁seinen', '▁Namen', '▁nicht', '▁mit', '▁dem', '▁Werk', '▁in', '▁Ver', 'bindung', '▁geb', 'racht', '▁haben', '▁m', 'ö', 'chte', '.']
Tokenized Sentence (With GPE): ['▁Al', '▁an', '▁', '▁S', '▁mi', '▁th', '▁e', 'e', '▁', '▁s', '▁te', '▁h', 't', '▁', '▁a', '▁ls', '▁', '▁P', '▁se', '▁ud', '▁on', '▁y', 'm', '▁', '▁f', '▁', 'ür', '▁', '▁e', '▁in', '▁en', '▁', '▁f', '▁ik', '▁ti', '▁ve', '▁n', '▁', '▁Re', '▁gi', '▁ss', '▁eu', '▁r', ',', '▁', '▁d', '▁er', '▁', '▁F', '▁il', '▁me', '▁', '▁v', '▁

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]


Language: ru
Original Sentence: Литва́ ( ), официальное название — Лито́вская Респу́блика () — государство, расположенное в Северной Европе.
Tokenized Sentence (No GPE): ['▁Ли', 'тва', '́', '▁(', '▁),', '▁официаль', 'ное', '▁название', '\xa0', '—', '▁Ли', 'то', '́', 'в', 'ская', '▁Рес', 'пу', '́', 'бли', 'ка', '▁()', '\xa0', '—', '▁государ', 'ство', ',', '▁расположен', 'ное', '▁в', '▁Север', 'ной', '▁Евро', 'пе', '.']
Tokenized Sentence (With GPE): ['▁Ли', '▁т', 'в', '▁а', '́', '▁', '▁(', '▁', '▁)', '▁,', '▁', '▁о', 'ф', '▁и', 'ц', '▁и', 'а', '▁', 'ль', '▁но', '▁е', '▁', '▁на', '▁зв', '▁ан', '▁и', 'е', '▁\xa0', '—', '▁', '▁Л', '▁и', 'т', '▁о', '́', '▁вс', '▁ка', '▁я', '▁', '▁Ре', '▁сп', '▁у', '́', '▁б', 'л', '▁и', 'к', '▁а', '▁', '▁()', '▁\xa0', '—', '▁', '▁г', '▁ос', '▁у', 'д', '▁ар', '▁ст', '▁во', '▁,', '▁', '▁ра', '▁сп', '▁о', 'л', '▁о', 'ж', '▁ен', '▁но', '▁е', '▁', '▁в', '▁', '▁Се', '▁ве', '▁р', 'н', '▁о', 'й', '▁', '▁Е', '▁в', 'р', '▁оп', '▁е', '.']

Metrics for language: ru
  M

# mBERT

In [5]:
from transformers import AutoTokenizer
from datasets import load_dataset
import pandas as pd
import re
import json

# Model whose tokenizer is evaluated in this cell
model_name = "google-bert/bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Language codes to process (used to pick the wikimedia/wikipedia subset)
languages = [
    "en",            # English
    "de",            # German
    "ar",            # Arabic
    "ru",            # Russian
    "be",            # Belarusian
    "mn",            # Mongolian
    "zh-classical",  # Classical Chinese
]

# Splitting a text into sentences using a regex-based approach.
def split_into_sentences(text):
    """Return the non-empty, stripped sentences found in ``text``.

    A split happens at whitespace that follows '.', '?' or '!', unless the
    preceding characters match one of the two negative lookbehinds
    (patterns such as "e.g." or "Mr."). The pattern uses a capturing group,
    so ``re.split`` also yields the whitespace separators; they are dropped
    because they are empty after stripping.
    """
    sentence_endings = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)(\s|\n)'
    cleaned = []
    for piece in re.split(sentence_endings, text):
        piece = piece.strip()
        if piece:
            cleaned.append(piece)
    return cleaned

# Pre-Tokenization: Grapheme Pair Encoding
def apply_gpe(text):
    """Cut ``text`` into consecutive two-character chunks joined by spaces.

    Chunks are taken by string index (Unicode code points), so a combining
    mark counts as a character of its own. When the length is odd, the last
    chunk holds a single character.
    """
    chunks = []
    for start in range(0, len(text), 2):
        chunks.append(text[start:start + 2])
    return " ".join(chunks)

# Tokenization Function
def tokenize(texts, tokenizer):
    """Tokenize every text and record how many tokens each one produced.

    Args:
        texts: Iterable of strings.
        tokenizer: Object exposing ``tokenize(text)`` that returns a sized
            sequence of tokens.

    Returns:
        Dict with "tokenized_texts" (one token sequence per text) and
        "token_counts" (the length of each sequence, in the same order).
    """
    tokenized_texts = []
    token_counts = []
    for text in texts:
        tokens = tokenizer.tokenize(text)
        tokenized_texts.append(tokens)
        token_counts.append(len(tokens))
    return {
        "tokenized_texts": tokenized_texts,
        "token_counts": token_counts,
    }

# Metric Calculation Functions
# Compression Ratio = Number of Characters / Number of Tokens
def calculate_max_compression_ratio(texts, token_counts):
    """Return the largest characters-per-token ratio over the paired inputs.

    ``texts`` and ``token_counts`` are paired positionally; pairing stops at
    the shorter of the two. A text whose token count is not positive
    contributes a ratio of 0. Returns 0 when there are no pairs.
    """
    def ratio(text, n_tokens):
        if n_tokens > 0:
            return len(text) / n_tokens
        return 0

    return max(map(ratio, texts, token_counts), default=0)


# Min Tokenization Parity = Minimum Token Count / Maximum Token Count
def calculate_min_tokenization_parity(token_counts):
    """Return min(token_counts) / max(token_counts).

    Returns 0 when ``token_counts`` is empty or its maximum is not positive.
    """
    if not token_counts:
        return 0
    largest = max(token_counts)
    if largest > 0:
        return min(token_counts) / largest
    return 0

# Main Processing Function
# Processing a single language: tokenize with and without GPE, calculate metrics, and return results.
def process_language(lang, tokenizer):
    """Compute tokenization metrics for one Wikipedia language subset.

    Loads the "train" split of ``wikimedia/wikipedia`` for the config
    ``20231101.<lang>``, splits the first 1000 documents into sentences,
    keeps the first 1000 sentences, and tokenizes them both as-is and after
    ``apply_gpe``. The first sentence, its two tokenizations and all metrics
    are printed.

    Args:
        lang: Language code appended to the ``20231101.`` config prefix.
        tokenizer: Object handed to ``tokenize`` (must expose
            ``tokenize(text)``).

    Returns:
        A dict with "Language" plus the four metric entries, or None when no
        sentences were found or any step raised an exception.
    """
    print(f"\nProcessing dataset for language: {lang}")
    try:
        # Loading the dataset and extracting the text.
        dataset_lang = load_dataset("wikimedia/wikipedia", f"20231101.{lang}", split="train")
        # Slice rows first, then pick the column: column-first access
        # (dataset_lang["text"][:1000]) can materialize the text of every
        # article before slicing, which is slow and memory-hungry on large
        # subsets.
        documents = dataset_lang[:1000]["text"]

        # Sentence splitting; stop once enough sentences are collected,
        # since only the first 1000 are kept anyway.
        sentences = []
        for doc in documents:
            sentences.extend(split_into_sentences(doc))
            if len(sentences) >= 1000:
                break
        sentences = sentences[:1000]

        if not sentences:
            # Report the real cause instead of an IndexError on sentences[0].
            print(f"Error processing {lang}: no sentences found")
            return None

        # Tokenizing without GPE
        tokenized_data_without_gpe = tokenize(sentences, tokenizer)

        # Tokenizing with GPE
        gpe_sentences = [apply_gpe(sentence) for sentence in sentences]
        tokenized_data_with_gpe = tokenize(gpe_sentences, tokenizer)

        # Displaying the first sentence and tokenized forms
        print(f"\nLanguage: {lang}")
        print(f"Original Sentence: {sentences[0]}")
        print(f"Tokenized Sentence (No GPE): {tokenized_data_without_gpe['tokenized_texts'][0]}")
        print(f"Tokenized Sentence (With GPE): {tokenized_data_with_gpe['tokenized_texts'][0]}")

        # Computing metrics. Note: the "With GPE" compression ratio counts the
        # characters of the space-joined GPE text, not of the original sentence.
        metrics = {
            "Max Compression (No GPE)": calculate_max_compression_ratio(sentences, tokenized_data_without_gpe["token_counts"]),
            "Min Parity (No GPE)": calculate_min_tokenization_parity(tokenized_data_without_gpe["token_counts"]),
            "Max Compression (With GPE)": calculate_max_compression_ratio(gpe_sentences, tokenized_data_with_gpe["token_counts"]),
            "Min Parity (With GPE)": calculate_min_tokenization_parity(tokenized_data_with_gpe["token_counts"]),
        }

        # Displaying metrics
        print("\nMetrics for language:", lang)
        for metric_name, value in metrics.items():
            print(f"  {metric_name}: {value}")

        # Storing results
        return {
            "Language": lang,
            **metrics,
        }

    except Exception as e:
        # Best-effort: a failure for one language is reported and skipped so
        # the remaining languages can still be processed by the caller.
        print(f"Error processing {lang}: {e}")
        return None

# Execution for all languages: keep only the runs that produced a result
results = []
for lang in languages:
    result = process_language(lang, tokenizer)
    if not result:
        # process_language returns None on failure; skip that language.
        continue
    results.append(result)

# Saving results to JSON
results_json = {"results": results}
with open("computations_mbert.json", "w") as json_file:
    json_file.write(json.dumps(results_json, indent=4))
print("\nResults saved to computations_mbert.json")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]


Processing dataset for language: en


Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/41 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (601 > 512). Running this sequence through the model will result in indexing errors



Language: en
Original Sentence: Anarchism is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions it claims maintain unnecessary coercion and hierarchy, typically including nation-states, and capitalism.
Tokenized Sentence (No GPE): ['Ana', '##rch', '##ism', 'is', 'a', 'political', 'philosophy', 'and', 'movement', 'that', 'is', 'sk', '##ept', '##ical', 'of', 'all', 'just', '##ification', '##s', 'for', 'authority', 'and', 'seeks', 'to', 'ab', '##olis', '##h', 'the', 'institutions', 'it', 'claims', 'maintain', 'un', '##nec', '##essa', '##ry', 'co', '##er', '##cion', 'and', 'hierarchy', ',', 'typically', 'including', 'nation', '-', 'states', ',', 'and', 'capital', '##ism', '.']
Tokenized Sentence (With GPE): ['An', 'ar', 'ch', 'is', 'm', 'is', 'a', 'p', 'ol', 'it', 'i', '##c', 'al', 'p', 'hi', 'lo', 'so', 'ph', 'y', 'an', 'd', 'mo', 've', 'me', 'nt', 't', 'ha', 't', 'is', 's', 'ke', 'pt', 'i', '##c', 'al', 'o', 'f

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/20 [00:00<?, ?it/s]


Language: de
Original Sentence: Alan Smithee steht als Pseudonym für einen fiktiven Regisseur, der Filme verantwortet, bei denen der eigentliche Regisseur seinen Namen nicht mit dem Werk in Verbindung gebracht haben möchte.
Tokenized Sentence (No GPE): ['Alan', 'Smith', '##ee', 'steht', 'als', 'Pseudonym', 'für', 'einen', 'fik', '##tiven', 'Regisseur', ',', 'der', 'Filme', 'vera', '##nt', '##wort', '##et', ',', 'bei', 'denen', 'der', 'eigentliche', 'Regisseur', 'seinen', 'Namen', 'nicht', 'mit', 'dem', 'Werk', 'in', 'Verbindung', 'gebracht', 'haben', 'möchte', '.']
Tokenized Sentence (With GPE): ['Al', 'an', 'S', 'mi', 'th', 'ee', 's', 'te', 'h', '##t', 'a', 'ls', 'P', 'se', 'ud', 'on', 'ym', 'f', 'ü', '##r', 'e', 'in', 'en', 'f', 'ik', 'ti', 've', 'n', 'Re', 'gi', 'ss', 'eu', 'r', ',', 'd', 'er', 'F', 'il', 'me', 'v', 'er', 'an', 't', '##w', 'or', 'te', 't', ',', 'b', 'ei', 'd', 'en', 'en', 'd', 'er', 'e', 'ig', 'en', 't', '##l', 'i', '##c', 'he', 'R', 'e', '##g', 'is', 'se', 'ur', '

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]


Language: ru
Original Sentence: Литва́ ( ), официальное название — Лито́вская Респу́блика () — государство, расположенное в Северной Европе.
Tokenized Sentence (No GPE): ['Литва', '##́', '(', ')', ',', 'официально', '##е', 'название', '[UNK]', 'Ли', '##то', '##́', '##вская', 'Р', '##ес', '##пу', '##́', '##бл', '##ика', '(', ')', '[UNK]', 'государство', ',', 'расположен', '##ное', 'в', 'Северной', 'Европе', '.']
Tokenized Sentence (With GPE): ['Ли', 'т', '##в', 'а', '##́', '(', ')', ',', 'о', '##ф', 'и', '##ц', 'и', '##а', 'л', '##ь', 'но', 'е', 'на', 'зв', 'ан', 'ие', '[UNK]', 'Л', 'и', '##т', 'о', '##́', 'в', '##с', 'ка', 'я', 'Р', '##е', 'с', '##п', 'у', '##́', 'б', '##л', 'и', '##к', 'а', '(', ')', '[UNK]', 'г', 'ос', 'у', '##д', 'ар', 'ст', 'во', ',', 'р', '##а', 'с', '##п', 'ол', 'о', '##ж', 'е', '##н', 'но', 'е', 'в', 'Се', 'в', '##е', 'р', '##н', 'ой', 'Е', 'в', '##р', 'о', '##п', 'е', '.']

Metrics for language: ru
  Max Compression (No GPE): 6.315789473684211
  Min Parity (No

# MGPT

In [6]:
from transformers import AutoTokenizer
from datasets import load_dataset
import pandas as pd
import re
import json

# Tokenizer under test: the one published with the mGPT checkpoint
model_name = "ai-forever/mGPT"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wikipedia language codes to evaluate
languages = [
    "en",            # English
    "de",            # German
    "ar",            # Arabic
    "ru",            # Russian
    "be",            # Belarusian
    "mn",            # Mongolian
    "zh-classical",  # Classical Chinese
]

# Splitting a text into sentences using a regex-based approach.
def split_into_sentences(text):
    """Split ``text`` into sentences at whitespace that follows '.', '?' or '!'.

    The two negative lookbehinds suppress a split after a ``x.y`` + one more
    character sequence (e.g. "e.g.") and after a capitalised two-letter
    abbreviation such as "Dr.". Each piece is stripped, and pieces that are
    empty after stripping (including the captured whitespace separators that
    ``re.split`` returns) are discarded.
    """
    boundary = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)(\s|\n)')
    cleaned = []
    for piece in boundary.split(text):
        piece = piece.strip()
        if piece:
            cleaned.append(piece)
    return cleaned

# Pre-tokenization: Grapheme Pair Encoding
def apply_gpe(text):
    """Return ``text`` cut into consecutive two-character chunks joined by spaces.

    Chunks are taken by string index, so an odd-length input ends with a
    single-character chunk and an empty input yields an empty string.
    """
    width = 2
    starts = range(0, len(text), width)
    return " ".join(text[s:s + width] for s in starts)

# Tokenization Function
def tokenize(texts, tokenizer):
    """Tokenize every text and record how many tokens each one produced.

    Args:
        texts: Iterable of strings to tokenize.
        tokenizer: Object exposing a ``tokenize(text)`` method.

    Returns:
        A dict with "tokenized_texts" (one token list per input text) and
        "token_counts" (the length of each of those lists), in input order.
    """
    tokenized_texts = []
    token_counts = []
    for text in texts:
        pieces = tokenizer.tokenize(text)
        tokenized_texts.append(pieces)
        token_counts.append(len(pieces))
    return {
        "tokenized_texts": tokenized_texts,
        "token_counts": token_counts,
    }

# Metric Calculation Functions
# Compression Ratio = Number of Characters / Number of Tokens
def calculate_max_compression_ratio(texts, token_counts):
    """Return the largest characters-per-token ratio over the paired inputs.

    A text whose token count is not positive contributes a ratio of 0.
    Returns 0 when there are no (text, count) pairs at all.
    """
    def ratio(text, count):
        if count > 0:
            return len(text) / count
        return 0

    # map() over two iterables stops at the shorter one, exactly like zip()
    return max(map(ratio, texts, token_counts), default=0)

# Min Tokenization Parity = Minimum Token Count / Maximum Token Count
def calculate_min_tokenization_parity(token_counts):
    """Return the smallest token count divided by the largest one.

    Returns 0 for an empty input and also when the largest count is not
    positive, which avoids a division by zero.
    """
    if not token_counts:
        return 0
    smallest, largest = min(token_counts), max(token_counts)
    if largest > 0:
        return smallest / largest
    return 0

# Main Processing Function
# Processing a single language: tokenize with and without GPE, calculate metrics, and return results.
def process_language(lang, tokenizer):
    """Compute tokenization metrics for one Wikipedia language edition.

    Loads the ``20231101.{lang}`` configuration of ``wikimedia/wikipedia``,
    takes the text of the first 1000 articles, splits it into sentences and
    keeps the first 1000 of them. Each sentence is tokenized twice, once
    as-is and once after ``apply_gpe``, and the compression and parity
    metrics are computed for both variants. Progress, the first sentence with
    both tokenizations, and the metrics are printed along the way.

    Args:
        lang: Language code selecting the Wikipedia configuration.
        tokenizer: Object exposing ``tokenize(text)``; handed to ``tokenize``.

    Returns:
        A dict with "Language" plus the four metric values, or ``None`` when
        anything fails or no sentence could be extracted (a message is
        printed in both cases).
    """
    print(f"\nProcessing dataset for language: {lang}")
    try:
        # Loading the dataset and extract the text
        dataset_lang = load_dataset("wikimedia/wikipedia", f"20231101.{lang}", split="train")
        # Slice the rows first and only then pick the column, so that only
        # 1000 articles are read instead of the whole "text" column.
        documents = dataset_lang[:1000]["text"]

        # Sentence splitting; stop as soon as enough sentences are collected
        sentences = []
        for doc in documents:
            sentences.extend(split_into_sentences(doc))
            if len(sentences) >= 1000:
                break
        sentences = sentences[:1000]

        if not sentences:
            # Report this explicitly instead of failing later on sentences[0]
            print(f"Error processing {lang}: no sentences extracted")
            return None

        # Tokenizing without GPE
        tokenized_data_without_gpe = tokenize(sentences, tokenizer)

        # Tokenizing with GPE
        gpe_sentences = [apply_gpe(sentence) for sentence in sentences]
        tokenized_data_with_gpe = tokenize(gpe_sentences, tokenizer)

        # Displaying the first sentence and tokenized forms
        print(f"\nLanguage: {lang}")
        print(f"Original Sentence: {sentences[0]}")
        print(f"Tokenized Sentence (No GPE): {tokenized_data_without_gpe['tokenized_texts'][0]}")
        print(f"Tokenized Sentence (With GPE): {tokenized_data_with_gpe['tokenized_texts'][0]}")

        # Computing metrics
        # NOTE(review): the "With GPE" compression ratio is computed on
        # gpe_sentences, so its character count includes the spaces inserted
        # by apply_gpe - confirm this is the intended definition.
        metrics = {
            "Max Compression (No GPE)": calculate_max_compression_ratio(sentences, tokenized_data_without_gpe["token_counts"]),
            "Min Parity (No GPE)": calculate_min_tokenization_parity(tokenized_data_without_gpe["token_counts"]),
            "Max Compression (With GPE)": calculate_max_compression_ratio(gpe_sentences, tokenized_data_with_gpe["token_counts"]),
            "Min Parity (With GPE)": calculate_min_tokenization_parity(tokenized_data_with_gpe["token_counts"]),
        }

        # Displaying metrics
        print("\nMetrics for language:", lang)
        for metric_name, value in metrics.items():
            print(f"  {metric_name}: {value}")

        # Store results
        return {
            "Language": lang,
            **metrics,
        }

    except Exception as e:
        # Best-effort boundary: one failing language must not abort the run
        print(f"Error processing {lang}: {e}")
        return None

# Run every configured language and keep only the runs that produced a result
results = []
for lang in languages:
    result = process_language(lang, tokenizer)
    if not result:
        # process_language returns None on failure; skip that language
        continue
    results.append(result)

# Persist the collected per-language metrics as JSON
results_json = {"results": results}
with open("computations_mgpt.json", "w") as json_file:
    json_file.write(json.dumps(results_json, indent=4))
print("\nResults saved to computations_mgpt.json")


tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.89M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/606 [00:00<?, ?B/s]


Processing dataset for language: en


Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/41 [00:00<?, ?it/s]


Language: en
Original Sentence: Anarchism is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions it claims maintain unnecessary coercion and hierarchy, typically including nation-states, and capitalism.
Tokenized Sentence (No GPE): ['An', 'arch', 'ism', 'Ġis', 'Ġa', 'Ġpolitical', 'Ġphilosophy', 'Ġand', 'Ġmovement', 'Ġthat', 'Ġis', 'Ġskept', 'ical', 'Ġof', 'Ġall', 'Ġjust', 'ifications', 'Ġfor', 'Ġauthority', 'Ġand', 'Ġseeks', 'Ġto', 'Ġabol', 'ish', 'Ġthe', 'Ġinstitutions', 'Ġit', 'Ġclaims', 'Ġmaintain', 'Ġun', 'nec', 'ess', 'ary', 'Ġco', 'erc', 'ion', 'Ġand', 'Ġhier', 'archy', ',', 'Ġtypically', 'Ġincluding', 'Ġnation', '-', 'st', 'ates', ',', 'Ġand', 'Ġcapit', 'alism', '.']
Tokenized Sentence (With GPE): ['An', 'Ġar', 'Ġch', 'Ġis', 'Ġm', 'Ġ', 'Ġis', 'Ġ', 'Ġa', 'Ġ', 'Ġp', 'Ġol', 'Ġit', 'Ġic', 'Ġal', 'Ġ', 'Ġp', 'Ġhi', 'Ġlo', 'Ġso', 'Ġph', 'Ġy', 'Ġ', 'Ġan', 'Ġd', 'Ġ', 'Ġmo', 'Ġve', 'Ġme', 'Ġn', 't', 'Ġ', 'Ġt', 'Ġ

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/20 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3408 > 2048). Running this sequence through the model will result in indexing errors



Language: de
Original Sentence: Alan Smithee steht als Pseudonym für einen fiktiven Regisseur, der Filme verantwortet, bei denen der eigentliche Regisseur seinen Namen nicht mit dem Werk in Verbindung gebracht haben möchte.
Tokenized Sentence (No GPE): ['Alan', 'ĠSm', 'ithe', 'e', 'Ġsteht', 'Ġals', 'ĠPseudonym', 'ĠfÃ¼r', 'Ġeinen', 'Ġfikt', 'iven', 'ĠRegisseur', ',', 'Ġder', 'ĠFilme', 'Ġverantwort', 'et', ',', 'Ġbei', 'Ġdenen', 'Ġder', 'Ġeigentliche', 'ĠRegisseur', 'Ġseinen', 'ĠNamen', 'Ġnicht', 'Ġmit', 'Ġdem', 'ĠWerk', 'Ġin', 'ĠVerbindung', 'Ġgebracht', 'Ġhaben', 'ĠmÃ¶chte', '.']
Tokenized Sentence (With GPE): ['Al', 'Ġan', 'Ġ', 'ĠS', 'Ġmi', 'Ġth', 'Ġe', 'e', 'Ġ', 'Ġs', 'Ġte', 'Ġh', 't', 'Ġ', 'Ġa', 'Ġl', 's', 'Ġ', 'ĠP', 'Ġse', 'Ġud', 'Ġon', 'Ġy', 'm', 'Ġ', 'Ġf', 'ĠÃ¼r', 'Ġ', 'Ġe', 'Ġin', 'Ġen', 'Ġ', 'Ġf', 'Ġik', 'Ġti', 'Ġve', 'Ġn', 'Ġ', 'ĠRe', 'Ġgi', 'Ġs', 's', 'Ġeu', 'Ġr', ',', 'Ġ', 'Ġd', 'Ġer', 'Ġ', 'ĠF', 'Ġil', 'Ġme', 'Ġ', 'Ġv', 'Ġer', 'Ġan', 'Ġtw', 'Ġor', 'Ġte', 'Ġt', ',', 'Ġ', 'Ġ

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]


Language: ru
Original Sentence: Литва́ ( ), официальное название — Лито́вская Респу́блика () — государство, расположенное в Северной Европе.
Tokenized Sentence (No GPE): ['ÐĽÐ¸ÑĤ', 'Ð²Ð°', 'Ìģ', 'Ġ(', 'Ġ),', 'ĠÐ¾ÑĦÐ¸ÑĨÐ¸Ð°Ð»ÑĮ', 'Ð½Ð¾Ðµ', 'ĠÐ½Ð°Ð·Ð²Ð°Ð½Ð¸Ðµ', 'Âł', 'âĢĶ', 'ĠÐĽ', 'Ð¸ÑĤÐ¾', 'Ìģ', 'Ð²', 'ÑģÐºÐ°Ñı', 'ĠÐł', 'ÐµÑģÐ¿', 'Ñĥ', 'Ìģ', 'Ð±', 'Ð»Ð¸ÐºÐ°', 'Ġ()', 'Âł', 'âĢĶ', 'ĠÐ³Ð¾ÑģÑĥÐ´Ð°ÑĢÑģÑĤÐ²Ð¾', ',', 'ĠÑĢÐ°ÑģÐ¿Ð¾Ð»Ð¾Ð¶ÐµÐ½', 'Ð½Ð¾Ðµ', 'ĠÐ²', 'ĠÐ¡ÐµÐ²ÐµÑĢÐ½Ð¾Ð¹', 'ĠÐķÐ²ÑĢÐ¾Ð¿Ðµ', '.']
Tokenized Sentence (With GPE): ['ÐĽ', 'Ð¸', 'ĠÑĤ', 'Ð²', 'ĠÐ°', 'Ìģ', 'Ġ', 'Ġ(', 'Ġ', 'Ġ)', 'Ġ,', 'Ġ', 'ĠÐ¾ÑĦ', 'ĠÐ¸', 'ÑĨ', 'ĠÐ¸', 'Ð°', 'ĠÐ»ÑĮ', 'ĠÐ½Ð¾', 'ĠÐµ', 'Ġ', 'ĠÐ½Ð°', 'ĠÐ·Ð²', 'ĠÐ°Ð½', 'ĠÐ¸', 'Ðµ', 'Ġ', 'Âł', 'âĢĶ', 'Ġ', 'ĠÐĽ', 'ĠÐ¸ÑĤ', 'ĠÐ¾', 'Ìģ', 'ĠÐ²Ñģ', 'ĠÐºÐ°', 'ĠÑı', 'Ġ', 'ĠÐłÐµ', 'ĠÑģÐ¿', 'ĠÑĥ', 'Ìģ', 'ĠÐ±Ð»', 'ĠÐ¸', 'Ðº', 'ĠÐ°', 'Ġ', 'Ġ()', 'Ġ', 'Âł', 'âĢĶ', 'Ġ', 'ĠÐ³', 'ĠÐ¾Ñģ', 'ĠÑĥÐ´', 'ĠÐ°ÑĢ', 'ĠÑģÑĤ', 'ĠÐ²Ð¾', 'Ġ,', 'Ġ', 'ĠÑĢÐ°', 'ĠÑģÐ¿', 'ĠÐ¾', 'Ð»', 'ĠÐ¾Ð¶', 'ĠÐµ', 'Ð½', 

# MT5

In [7]:
from transformers import AutoTokenizer
from datasets import load_dataset
import pandas as pd
import re
import json

# Tokenizer under test: the one published with the mT5 checkpoint
model_name = "google/mt5-small" # base
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wikipedia language codes to evaluate
languages = [
    "en",            # English
    "de",            # German
    "ar",            # Arabic
    "ru",            # Russian
    "be",            # Belarusian
    "mn",            # Mongolian
    "zh-classical",  # Classical Chinese
]

# Splitting a text into sentences using a regex-based approach.
def split_into_sentences(text):
    """Split ``text`` into sentences at whitespace that follows '.', '?' or '!'.

    The two negative lookbehinds suppress a split after a ``x.y`` + one more
    character sequence (e.g. "e.g.") and after a capitalised two-letter
    abbreviation such as "Dr.". Each piece is stripped, and pieces that are
    empty after stripping (including the captured whitespace separators that
    ``re.split`` returns) are discarded.
    """
    boundary = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)(\s|\n)')
    cleaned = []
    for piece in boundary.split(text):
        piece = piece.strip()
        if piece:
            cleaned.append(piece)
    return cleaned

# Pre-tokenization: Grapheme Pair Encoding
def apply_gpe(text):
    """Return ``text`` cut into consecutive two-character chunks joined by spaces.

    Chunks are taken by string index, so an odd-length input ends with a
    single-character chunk and an empty input yields an empty string.
    """
    width = 2
    starts = range(0, len(text), width)
    return " ".join(text[s:s + width] for s in starts)

# Tokenization Function
def tokenize(texts, tokenizer):
    """Tokenize every text and record how many tokens each one produced.

    Args:
        texts: Iterable of strings to tokenize.
        tokenizer: Object exposing a ``tokenize(text)`` method.

    Returns:
        A dict with "tokenized_texts" (one token list per input text) and
        "token_counts" (the length of each of those lists), in input order.
    """
    tokenized_texts = []
    token_counts = []
    for text in texts:
        pieces = tokenizer.tokenize(text)
        tokenized_texts.append(pieces)
        token_counts.append(len(pieces))
    return {
        "tokenized_texts": tokenized_texts,
        "token_counts": token_counts,
    }

# Metric Calculation Functions
# Compression Ratio = Number of Characters / Number of Tokens
def calculate_max_compression_ratio(texts, token_counts):
    """Return the largest characters-per-token ratio over the paired inputs.

    A text whose token count is not positive contributes a ratio of 0.
    Returns 0 when there are no (text, count) pairs at all.
    """
    def ratio(text, count):
        if count > 0:
            return len(text) / count
        return 0

    # map() over two iterables stops at the shorter one, exactly like zip()
    return max(map(ratio, texts, token_counts), default=0)

# Min Tokenization Parity = Minimum Token Count / Maximum Token Count
def calculate_min_tokenization_parity(token_counts):
    """Return the smallest token count divided by the largest one.

    Returns 0 for an empty input and also when the largest count is not
    positive, which avoids a division by zero.
    """
    if not token_counts:
        return 0
    smallest, largest = min(token_counts), max(token_counts)
    if largest > 0:
        return smallest / largest
    return 0

# Main Processing Function
# Processing a single language: tokenize with and without GPE, calculate metrics, and return results.
def process_language(lang, tokenizer):
    """Compute tokenization metrics for one Wikipedia language edition.

    Loads the ``20231101.{lang}`` configuration of ``wikimedia/wikipedia``,
    takes the text of the first 1000 articles, splits it into sentences and
    keeps the first 1000 of them. Each sentence is tokenized twice, once
    as-is and once after ``apply_gpe``, and the compression and parity
    metrics are computed for both variants. Progress, the first sentence with
    both tokenizations, and the metrics are printed along the way.

    Args:
        lang: Language code selecting the Wikipedia configuration.
        tokenizer: Object exposing ``tokenize(text)``; handed to ``tokenize``.

    Returns:
        A dict with "Language" plus the four metric values, or ``None`` when
        anything fails or no sentence could be extracted (a message is
        printed in both cases).
    """
    print(f"\nProcessing dataset for language: {lang}")
    try:
        # Loading the dataset and extract the text
        dataset_lang = load_dataset("wikimedia/wikipedia", f"20231101.{lang}", split="train")
        # Slice the rows first and only then pick the column, so that only
        # 1000 articles are read instead of the whole "text" column.
        documents = dataset_lang[:1000]["text"]

        # Sentence splitting; stop as soon as enough sentences are collected
        sentences = []
        for doc in documents:
            sentences.extend(split_into_sentences(doc))
            if len(sentences) >= 1000:
                break
        sentences = sentences[:1000]

        if not sentences:
            # Report this explicitly instead of failing later on sentences[0]
            print(f"Error processing {lang}: no sentences extracted")
            return None

        # Tokenizing without GPE
        tokenized_data_without_gpe = tokenize(sentences, tokenizer)

        # Tokenizing with GPE
        gpe_sentences = [apply_gpe(sentence) for sentence in sentences]
        tokenized_data_with_gpe = tokenize(gpe_sentences, tokenizer)

        # Displaying the first sentence and tokenized forms
        print(f"\nLanguage: {lang}")
        print(f"Original Sentence: {sentences[0]}")
        print(f"Tokenized Sentence (No GPE): {tokenized_data_without_gpe['tokenized_texts'][0]}")
        print(f"Tokenized Sentence (With GPE): {tokenized_data_with_gpe['tokenized_texts'][0]}")

        # Computing metrics
        # NOTE(review): the "With GPE" compression ratio is computed on
        # gpe_sentences, so its character count includes the spaces inserted
        # by apply_gpe - confirm this is the intended definition.
        metrics = {
            "Max Compression (No GPE)": calculate_max_compression_ratio(sentences, tokenized_data_without_gpe["token_counts"]),
            "Min Parity (No GPE)": calculate_min_tokenization_parity(tokenized_data_without_gpe["token_counts"]),
            "Max Compression (With GPE)": calculate_max_compression_ratio(gpe_sentences, tokenized_data_with_gpe["token_counts"]),
            "Min Parity (With GPE)": calculate_min_tokenization_parity(tokenized_data_with_gpe["token_counts"]),
        }

        # Displaying metrics
        print("\nMetrics for language:", lang)
        for metric_name, value in metrics.items():
            print(f"  {metric_name}: {value}")

        # Store results
        return {
            "Language": lang,
            **metrics,
        }

    except Exception as e:
        # Best-effort boundary: one failing language must not abort the run
        print(f"Error processing {lang}: {e}")
        return None

# Run every configured language and keep only the runs that produced a result
results = []
for lang in languages:
    result = process_language(lang, tokenizer)
    if not result:
        # process_language returns None on failure; skip that language
        continue
    results.append(result)

# Persist the collected per-language metrics as JSON
results_json = {"results": results}
with open("computations_mt5.json", "w") as json_file:
    json_file.write(json.dumps(results_json, indent=4))
print("\nResults saved to computations_mt5.json")


tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565



Processing dataset for language: en


Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/41 [00:00<?, ?it/s]


Language: en
Original Sentence: Anarchism is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions it claims maintain unnecessary coercion and hierarchy, typically including nation-states, and capitalism.
Tokenized Sentence (No GPE): ['▁An', 'arch', 'ism', '▁is', '▁', 'a', '▁', 'political', '▁', 'philosophy', '▁and', '▁movement', '▁that', '▁is', '▁', 'ske', 'ptical', '▁of', '▁all', '▁justifica', 'tions', '▁for', '▁', 'authority', '▁and', '▁se', 'eks', '▁to', '▁ab', 'olish', '▁the', '▁institutions', '▁it', '▁', 'claims', '▁', 'maintain', '▁un', 'necessary', '▁co', 'er', 'cion', '▁and', '▁hierarch', 'y', ',', '▁typ', 'ically', '▁', 'including', '▁nation', '-', 'states', ',', '▁and', '▁', 'capitalism', '.']
Tokenized Sentence (With GPE): ['▁An', '▁ar', '▁ch', '▁is', '▁m', '▁is', '▁', 'a', '▁p', '▁ol', '▁it', '▁', 'ic', '▁al', '▁p', '▁hi', '▁lo', '▁so', '▁ph', '▁', 'y', '▁an', '▁d', '▁mo', '▁ve', '▁me', '▁', 'nt', '

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/20 [00:00<?, ?it/s]


Language: de
Original Sentence: Alan Smithee steht als Pseudonym für einen fiktiven Regisseur, der Filme verantwortet, bei denen der eigentliche Regisseur seinen Namen nicht mit dem Werk in Verbindung gebracht haben möchte.
Tokenized Sentence (No GPE): ['▁Alan', '▁Smith', 'e', 'e', '▁', 'steht', '▁als', '▁Pseudo', 'nym', '▁für', '▁', 'einen', '▁fik', 'tive', 'n', '▁R', 'egisseur', ',', '▁der', '▁Filme', '▁ver', 'antworte', 't', ',', '▁bei', '▁den', 'en', '▁der', '▁eigen', 'tliche', '▁R', 'egisseur', '▁sein', 'en', '▁Namen', '▁nicht', '▁mit', '▁dem', '▁Werk', '▁in', '▁Verbindung', '▁', 'gebracht', '▁haben', '▁mö', 'chte', '.']
Tokenized Sentence (With GPE): ['▁Al', '▁an', '▁S', '▁mi', '▁th', '▁', 'e', 'e', '▁', 's', '▁te', '▁h', 't', '▁', 'a', '▁', 'l', 's', '▁P', '▁se', '▁ud', '▁on', '▁', 'ym', '▁f', '▁', 'ür', '▁', 'e', '▁in', '▁en', '▁f', '▁ik', '▁ti', '▁ve', '▁', 'n', '▁Re', '▁gi', '▁', 's', 's', '▁eu', '▁', 'r', ',', '▁d', '▁er', '▁F', '▁il', '▁me', '▁v', '▁er', '▁an', '▁tw', '▁or

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]


Language: ru
Original Sentence: Литва́ ( ), официальное название — Лито́вская Респу́блика () — государство, расположенное в Северной Европе.
Tokenized Sentence (No GPE): ['▁Лит', 'ва', '́', '▁(', '▁', '),', '▁офици', 'альное', '▁на', 'звание', '▁—', '▁Ли', 'то', '́', 'в', 'ская', '▁Рес', 'пу', '́', 'бли', 'ка', '▁(', ')', '▁—', '▁государств', 'о', ',', '▁', 'расположен', 'ное', '▁в', '▁Север', 'ной', '▁', 'Европе', '.']
Tokenized Sentence (With GPE): ['▁Ли', '▁тв', '▁', 'а', '́', '▁(', '▁', ')', '▁', ',', '▁оф', '▁', 'иц', '▁', 'и', 'а', '▁', 'ль', '▁но', '▁', 'е', '▁на', '▁зв', '▁', 'ан', '▁', 'ие', '▁—', '▁Л', '▁', 'ит', '▁', 'о', '́', '▁вс', '▁', 'ка', '▁', 'я', '▁Ре', '▁сп', '▁у', '́', '▁бл', '▁', 'ик', '▁', 'а', '▁(', ')', '▁—', '▁г', '▁ос', '▁', 'уд', '▁ар', '▁ст', '▁во', '▁', ',', '▁ра', '▁сп', '▁ол', '▁', 'ож', '▁', 'ен', '▁но', '▁', 'е', '▁в', '▁Се', '▁ве', '▁', 'р', 'н', '▁', 'ой', '▁Е', '▁вр', '▁оп', '▁', 'е', '.']

Metrics for language: ru
  Max Compression (No GPE): 5.5
 

# XGLM

In [8]:
from transformers import AutoTokenizer
from datasets import load_dataset
import pandas as pd
import re
import json

# Tokenizer under test: the one published with the XGLM checkpoint
model_name = "facebook/xglm-564M"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wikipedia language codes to evaluate
languages = [
    "en",            # English
    "de",            # German
    "ar",            # Arabic
    "ru",            # Russian
    "be",            # Belarusian
    "mn",            # Mongolian
    "zh-classical",  # Classical Chinese
]

# Splitting a text into sentences using a regex-based approach.
def split_into_sentences(text):
    """Split ``text`` into sentences at whitespace that follows '.', '?' or '!'.

    The two negative lookbehinds suppress a split after a ``x.y`` + one more
    character sequence (e.g. "e.g.") and after a capitalised two-letter
    abbreviation such as "Dr.". Each piece is stripped, and pieces that are
    empty after stripping (including the captured whitespace separators that
    ``re.split`` returns) are discarded.
    """
    boundary = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)(\s|\n)')
    cleaned = []
    for piece in boundary.split(text):
        piece = piece.strip()
        if piece:
            cleaned.append(piece)
    return cleaned

# Pre-tokenization: Grapheme Pair Encoding
def apply_gpe(text):
    """Return ``text`` cut into consecutive two-character chunks joined by spaces.

    Chunks are taken by string index, so an odd-length input ends with a
    single-character chunk and an empty input yields an empty string.
    """
    width = 2
    starts = range(0, len(text), width)
    return " ".join(text[s:s + width] for s in starts)

# Tokenization Function
def tokenize(texts, tokenizer):
    """Tokenize every text and record how many tokens each one produced.

    Args:
        texts: Iterable of strings to tokenize.
        tokenizer: Object exposing a ``tokenize(text)`` method.

    Returns:
        A dict with "tokenized_texts" (one token list per input text) and
        "token_counts" (the length of each of those lists), in input order.
    """
    tokenized_texts = []
    token_counts = []
    for text in texts:
        pieces = tokenizer.tokenize(text)
        tokenized_texts.append(pieces)
        token_counts.append(len(pieces))
    return {
        "tokenized_texts": tokenized_texts,
        "token_counts": token_counts,
    }

# Metric Calculation Functions
# Compression Ratio = Number of Characters / Number of Tokens
def calculate_max_compression_ratio(texts, token_counts):
    """Return the largest characters-per-token ratio over the paired inputs.

    A text whose token count is not positive contributes a ratio of 0.
    Returns 0 when there are no (text, count) pairs at all.
    """
    def ratio(text, count):
        if count > 0:
            return len(text) / count
        return 0

    # map() over two iterables stops at the shorter one, exactly like zip()
    return max(map(ratio, texts, token_counts), default=0)

# Min Tokenization Parity = Minimum Token Count / Maximum Token Count
def calculate_min_tokenization_parity(token_counts):
    """Return the smallest token count divided by the largest one.

    Returns 0 for an empty input and also when the largest count is not
    positive, which avoids a division by zero.
    """
    if not token_counts:
        return 0
    smallest, largest = min(token_counts), max(token_counts)
    if largest > 0:
        return smallest / largest
    return 0

# Main Processing Function
# Processing a single language: tokenize with and without GPE, calculate metrics, and return results.
def process_language(lang, tokenizer):
    """Compute tokenization metrics for one Wikipedia language edition.

    Loads the ``20231101.{lang}`` configuration of ``wikimedia/wikipedia``,
    takes the text of the first 1000 articles, splits it into sentences and
    keeps the first 1000 of them. Each sentence is tokenized twice, once
    as-is and once after ``apply_gpe``, and the compression and parity
    metrics are computed for both variants. Progress, the first sentence with
    both tokenizations, and the metrics are printed along the way.

    Args:
        lang: Language code selecting the Wikipedia configuration.
        tokenizer: Object exposing ``tokenize(text)``; handed to ``tokenize``.

    Returns:
        A dict with "Language" plus the four metric values, or ``None`` when
        anything fails or no sentence could be extracted (a message is
        printed in both cases).
    """
    print(f"\nProcessing dataset for language: {lang}")
    try:
        # Loading the dataset and extract the text
        dataset_lang = load_dataset("wikimedia/wikipedia", f"20231101.{lang}", split="train")
        # Slice the rows first and only then pick the column, so that only
        # 1000 articles are read instead of the whole "text" column.
        documents = dataset_lang[:1000]["text"]

        # Sentence splitting; stop as soon as enough sentences are collected
        sentences = []
        for doc in documents:
            sentences.extend(split_into_sentences(doc))
            if len(sentences) >= 1000:
                break
        sentences = sentences[:1000]

        if not sentences:
            # Report this explicitly instead of failing later on sentences[0]
            print(f"Error processing {lang}: no sentences extracted")
            return None

        # Tokenizing without GPE
        tokenized_data_without_gpe = tokenize(sentences, tokenizer)

        # Tokenizing with GPE
        gpe_sentences = [apply_gpe(sentence) for sentence in sentences]
        tokenized_data_with_gpe = tokenize(gpe_sentences, tokenizer)

        # Displaying the first sentence and tokenized forms
        print(f"\nLanguage: {lang}")
        print(f"Original Sentence: {sentences[0]}")
        print(f"Tokenized Sentence (No GPE): {tokenized_data_without_gpe['tokenized_texts'][0]}")
        print(f"Tokenized Sentence (With GPE): {tokenized_data_with_gpe['tokenized_texts'][0]}")

        # Computing metrics
        # NOTE(review): the "With GPE" compression ratio is computed on
        # gpe_sentences, so its character count includes the spaces inserted
        # by apply_gpe - confirm this is the intended definition.
        metrics = {
            "Max Compression (No GPE)": calculate_max_compression_ratio(sentences, tokenized_data_without_gpe["token_counts"]),
            "Min Parity (No GPE)": calculate_min_tokenization_parity(tokenized_data_without_gpe["token_counts"]),
            "Max Compression (With GPE)": calculate_max_compression_ratio(gpe_sentences, tokenized_data_with_gpe["token_counts"]),
            "Min Parity (With GPE)": calculate_min_tokenization_parity(tokenized_data_with_gpe["token_counts"]),
        }

        # Displaying metrics
        print("\nMetrics for language:", lang)
        for metric_name, value in metrics.items():
            print(f"  {metric_name}: {value}")

        # Store results
        return {
            "Language": lang,
            **metrics,
        }

    except Exception as e:
        # Best-effort boundary: one failing language must not abort the run
        print(f"Error processing {lang}: {e}")
        return None

# Run every configured language and keep only the runs that produced a result
results = []
for lang in languages:
    result = process_language(lang, tokenizer)
    if not result:
        # process_language returns None on failure; skip that language
        continue
    results.append(result)

# Persist the collected per-language metrics as JSON
results_json = {"results": results}
with open("computations_xglm.json", "w") as json_file:
    json_file.write(json.dumps(results_json, indent=4))
print("\nResults saved to computations_xglm.json")


tokenizer_config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.92M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.03M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/276 [00:00<?, ?B/s]


Processing dataset for language: en


Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/41 [00:00<?, ?it/s]


Language: en
Original Sentence: Anarchism is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions it claims maintain unnecessary coercion and hierarchy, typically including nation-states, and capitalism.
Tokenized Sentence (No GPE): ['▁An', 'arch', 'ism', '▁is', '▁a', '▁political', '▁philosophy', '▁and', '▁movement', '▁that', '▁is', '▁skeptic', 'al', '▁of', '▁all', '▁justification', 's', '▁for', '▁authority', '▁and', '▁seek', 's', '▁to', '▁ab', 'olish', '▁the', '▁institutions', '▁it', '▁claims', '▁maintain', '▁unnecessary', '▁co', 'er', 'cion', '▁and', '▁hier', 'archy', ',', '▁typically', '▁including', '▁nation', '-', 'state', 's', ',', '▁and', '▁capitalism', '.']
Tokenized Sentence (With GPE): ['▁An', '▁ar', '▁ch', '▁is', '▁m', '▁is', '▁a', '▁p', '▁ol', '▁it', '▁ic', '▁al', '▁p', '▁hi', '▁lo', '▁so', '▁ph', '▁y', '▁an', '▁d', '▁mo', '▁ve', '▁me', '▁', 'nt', '▁t', '▁ha', '▁t', '▁is', '▁s', '▁ke', '▁pt', '▁ic', 

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/20 [00:00<?, ?it/s]


Language: de
Original Sentence: Alan Smithee steht als Pseudonym für einen fiktiven Regisseur, der Filme verantwortet, bei denen der eigentliche Regisseur seinen Namen nicht mit dem Werk in Verbindung gebracht haben möchte.
Tokenized Sentence (No GPE): ['▁Alan', '▁Smith', 'ee', '▁steht', '▁als', '▁P', 'seudo', 'nym', '▁für', '▁einen', '▁fik', 'tive', 'n', '▁R', 'egisseur', ',', '▁der', '▁Filme', '▁ver', 'antwortet', ',', '▁bei', '▁denen', '▁der', '▁eigentlich', 'e', '▁R', 'egisseur', '▁seinen', '▁Namen', '▁nicht', '▁mit', '▁dem', '▁Werk', '▁in', '▁Verbindung', '▁gebracht', '▁haben', '▁möchte', '.']
Tokenized Sentence (With GPE): ['▁Al', '▁an', '▁S', '▁mi', '▁th', '▁ee', '▁s', '▁te', '▁ht', '▁a', '▁l', 's', '▁P', '▁se', '▁ud', '▁on', '▁ym', '▁f', '▁', 'ür', '▁e', '▁in', '▁en', '▁f', '▁ik', '▁ti', '▁ve', '▁n', '▁Re', '▁gi', '▁s', 's', '▁eu', '▁r', ',', '▁d', '▁er', '▁F', '▁il', '▁me', '▁v', '▁er', '▁an', '▁tw', '▁or', '▁te', '▁t', ',', '▁b', '▁ei', '▁d', '▁en', '▁en', '▁d', '▁er', '▁e',

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]


Language: ru
Original Sentence: Литва́ ( ), официальное название — Лито́вская Респу́блика () — государство, расположенное в Северной Европе.
Tokenized Sentence (No GPE): ['▁Литв', 'а', '́', '▁(', '▁', '),', '▁официально', 'е', '▁название', '▁—', '▁Ли', 'то', '́', 'в', 'ская', '▁Ре', 'сп', 'у', '́', 'блик', 'а', '▁(', ')', '▁—', '▁государство', ',', '▁расположен', 'ное', '▁в', '▁Север', 'ной', '▁Европе', '.']
Tokenized Sentence (With GPE): ['▁Ли', '▁тв', '▁а', '́', '▁(', '▁)', '▁', ',', '▁оф', '▁', 'иц', '▁и', 'а', '▁', 'ль', '▁но', '▁е', '▁на', '▁зв', '▁ан', '▁и', 'е', '▁—', '▁Л', '▁', 'ит', '▁о', '́', '▁вс', '▁ка', '▁я', '▁Ре', '▁сп', '▁у', '́', '▁бл', '▁', 'ик', '▁а', '▁(', ')', '▁—', '▁г', '▁ос', '▁уд', '▁ар', '▁ст', '▁во', '▁', ',', '▁ра', '▁сп', '▁ол', '▁', 'ож', '▁', 'ен', '▁но', '▁е', '▁в', '▁Се', '▁ве', '▁р', 'н', '▁ой', '▁Е', '▁вр', '▁оп', '▁е', '.']

Metrics for language: ru
  Max Compression (No GPE): 7.0
  Min Parity (No GPE): 0.005813953488372093
  Max Compression (With G