In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json

## Loading

In [None]:
# Load Tokenizers
# The Hugging Face access tokens live in a JSON file outside the notebook.
# The encoding is given explicitly so parsing does not depend on the platform's default locale encoding.
with open("../user_config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

token_1 = config["huggingface_token"]['token_1'] # from the main account
token_2 = config["huggingface_token"]['token_2'] # from the sub account

# Model id -> access token handed to AutoTokenizer.from_pretrained (None = no token passed).
# Cells that iterate over this dict only see the entries that are not commented out.
tokenizer_configs = {
    # "google/gemma-2-2b-it": token_1,
    "google/gemma-2-9b-it": token_1,
    "google/gemma-3-12b-it": token_1,
    # "meta-llama/Llama-3.2-1B-Instruct": token_2,
    # "meta-llama/Llama-3.2-3B-Instruct": token_2,
    "meta-llama/Llama-3.1-8B-Instruct": token_2,
    "meta-llama/Llama-2-7b-chat-hf": token_1,
    "microsoft/phi-4": None,
    # "microsoft/Phi-4-mini-instruct": None,
    "microsoft/Phi-3-small-8k-instruct": None,
    # "microsoft/Phi-3-mini-4k-instruct": None,
    # "microsoft/Phi-3.5-mini-instruct": None,
    "bigscience/bloom-7b1": None,
    "Tower-Babel/Babel-9B-Chat": None
}

In [None]:
# Load Testsets
seed = 2025
n_sample = 100

# FLORES config name -> human-readable language name (the latter becomes the DataFrame column name)
languages = {"eng_Latn": "English", "kor_Hang": "Korean", "deu_Latn": "German"}

# Per language: shuffle the "dev" split with the fixed seed and keep the first n_sample sentences.
# NOTE(review): presumably the shared seed keeps rows parallel across languages - confirm.
samples = {}
for lang_code, lang_name in languages.items():
    dev_split = load_dataset("facebook/flores", lang_code, trust_remote_code=True)["dev"]
    picked = dev_split.shuffle(seed=seed).select(range(n_sample))
    samples[lang_name] = picked["sentence"]

# One column of sentences per language
df = pd.DataFrame({lang_name: list(sample_data) for lang_name, sample_data in samples.items()})

In [None]:
# Load DownStream Performance Results - MMLU-Pro & FLORES

# Saved HTML export of the leaderboard comparison table
html_file_path = '/home/hyujang/multilingual-inner-lexicon/data/results.html'
tables = pd.read_html(html_file_path)
df_results = tables[0]
df_results = df_results.rename(columns={"Unnamed: 0": "Benchmark"})

# Row 41 is taken as the MMLU-Pro row; column 0 (the benchmark label) is skipped so the dict is model -> score.
# NOTE(review): the row position is hard-coded - re-check it whenever results.html is regenerated.
tokenizer_mmlupro = df_results.iloc[41,1:].to_dict() # https://huggingface.co/spaces/open-llm-leaderboard/comparator
del df_results

# Report which configured models the leaderboard row does not cover.
# (Previously this set difference was a bare expression whose result was discarded.)
missing_mmlupro = tokenizer_configs.keys() - tokenizer_mmlupro.keys()
print(f"Models in tokenizer_configs missing from the leaderboard row: {sorted(missing_mmlupro)}")

# Manually supplied scores
tokenizer_mmlupro['google/gemma-3-12b-it'] = 0.606 # https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro
tokenizer_mmlupro['Tower-Babel/Babel-9B-Chat'] = 0 # No information available

# Model ids covered by the hand-collected benchmark tables below, in display order.
# NOTE(review): "meta-llama/Llama-2-7b-chat-hf" is listed in tokenizer_configs but has no entry here.
_benchmark_models = [
    "google/gemma-2-2b-it",
    "google/gemma-2-9b-it",
    "google/gemma-3-12b-it",
    "meta-llama/Llama-3.2-1B-Instruct",
    "meta-llama/Llama-3.2-3B-Instruct",
    "meta-llama/Llama-3.1-8B-Instruct",
    "microsoft/phi-4",
    "microsoft/Phi-4-mini-instruct",
    "microsoft/Phi-3-small-8k-instruct",
    "microsoft/Phi-3-mini-4k-instruct",
    "microsoft/Phi-3.5-mini-instruct",
    "bigscience/bloom-7b1",
    "Tower-Babel/Babel-9B-Chat",
]

# FLORES scores. Every model starts at 0; only the models listed explicitly carry a non-zero score.
# Source: https://huggingface.co/Tower-Babel/Babel-9B-Chat
tokenizer_flores = {
    **dict.fromkeys(_benchmark_models, 0),
    "google/gemma-2-9b-it": 0.548,
    # "google/gemma-3-12b-it": 0.460,  # https://huggingface.co/google/gemma-3-12b-it#multilingual
    "meta-llama/Llama-3.1-8B-Instruct": 0.473,
    "Tower-Babel/Babel-9B-Chat": 0.567,
}

# MMMLU scores, same layout as tokenizer_flores.
tokenizer_mmmlu = {
    **dict.fromkeys(_benchmark_models, 0),
    "google/gemma-2-9b-it": 0.596,
    "meta-llama/Llama-3.1-8B-Instruct": 0.506,
    "Tower-Babel/Babel-9B-Chat": 0.598,
}

## Preprocessing

### Fertility

In [None]:
# SENTENCE -> WORD
# Each language column is split into word-level units and stored as "<Language>_<tool>".

### English
import nltk
import nltk.data
from nltk.tokenize import word_tokenize
# nltk.download('punkt_tab', download_dir='/work/hyujang/miniconda3/envs/thesis/nltk_data')
# nltk.download('punkt', download_dir='/work/hyujang/miniconda3/envs/thesis/nltk_data')
# nltk.data.path.append('/work/hyujang/miniconda3/envs/thesis/nltk_data')
# Series.apply forwards extra keyword arguments to the callable
df['English_nltk'] = df['English'].apply(word_tokenize, language='english')
# import spacy
# nlp = spacy.load('en_core_web_sm')
# df['English_spacy'] = df['English'].apply(lambda text: [token.text for token in nlp(text)])

### Korean
# from kiwipiepy import Kiwi
# kiwi = Kiwi()
# df['Korean_kiwi'] = df['Korean'].apply(lambda x: [token[0] for token in kiwi.tokenize(x)])
from konlpy.tag import Okt # TODO: Mecab
okt = Okt()
df['Korean_okt'] = df['Korean'].map(okt.morphs)

### German
df['German_nltk'] = df['German'].apply(word_tokenize, language='german')

# Language name -> suffix of the column holding that language's word lists
lang_wordinzer = dict(English="nltk", Korean="okt", German="nltk")

In [None]:
def calculate_subword_fertility(words_list, tokenizer):
    """Count the subword tokens the tokenizer produces for each word.

    Args:
        words_list: iterable of word strings.
        tokenizer: object exposing ``tokenize(text)`` that returns a sized sequence.

    Returns:
        list[int]: one token count per word, in input order. The raw counts
        (not their mean) are returned so callers can aggregate as they see fit.
    """
    token_lengths = []
    for word in words_list:
        pieces = tokenizer.tokenize(word)
        token_lengths.append(len(pieces))
    return token_lengths

# For every configured tokenizer, add two columns per language to df:
#   "<tokenizer>_<Language>_tokens"    - subword tokens of the whole sentence
#   "<tokenizer>_<Language>_token_len" - per-word subword counts (list of ints per sentence)
for tokenizer_name, token in tokenizer_configs.items():
    # Only the load step is reported as a load failure; a tokenizer that cannot be loaded is skipped.
    try:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=True, token=token, trust_remote_code=True)
    except Exception as e:
        print(f"Failed to load tokenizer {tokenizer_name}: {e}")
        continue
    print(f"Loaded {tokenizer_name}")

    # Still best-effort, but errors raised while tokenizing are now labelled as such.
    try:
        # Iterate lang_wordinzer (language name -> word-list column suffix) rather than `languages`,
        # so re-running this cell keeps working after a later cell rebinds `languages` to a list.
        for lang_name, wordinizer in lang_wordinzer.items():
            df[f"{tokenizer_name}_{lang_name}_tokens"] = df[lang_name].apply(tokenizer.tokenize)
            # df[f"{tokenizer_name}_{lang_name}_token_length"] = df[f"{tokenizer_name}_{lang_name}_tokens"].apply(len)
            df[f"{tokenizer_name}_{lang_name}_token_len"] = df[f"{lang_name}_{wordinizer}"].apply(
                lambda words: calculate_subword_fertility(words, tokenizer)
            )
    except Exception as e:
        print(f"Failed to tokenize with {tokenizer_name}: {e}")

# df.to_csv("./output/tokenizers_comparison.csv", index=False)

In [None]:
# Total number of word-level units per language.
# Summing per-sentence lengths avoids concatenating every word list into one big list
# and also works when the frame has no rows.
for lang_name, wordinizer in lang_wordinzer.items():
    print(lang_name)
    print(df[f"{lang_name}_{wordinizer}"].map(len).sum())

## Plotting

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

save = False

# Language name -> bar colour; the dict order is also the subplot order
language_colors = {
    "English": "#1f77b4",
    "German": "#2ca02c",
    "Korean": "#ff7f0e",
}

# Only plot tokenizers whose per-word token counts exist for every language
# (the loading cell skips tokenizers that fail, so their columns may be absent).
plotted_tokenizers = [
    name for name in tokenizer_configs
    if all(f"{name}_{lang}_token_len" in df.columns for lang in language_colors)
]
tokenizer_names = [name.split("/")[1] for name in plotted_tokenizers]
x = np.arange(len(tokenizer_names))  # Position for bars

# Create subplots for each language
fig, axes = plt.subplots(1, len(language_colors), figsize=(15, 5), sharex=True)

for ax, (lang, color) in zip(axes, language_colors.items()):
    # Average subword fertility = total subword tokens / total words over all sentences.
    # It is derived from the "*_token_len" columns; the "*_fertility" columns are never created.
    avg_fertility = []
    for name in plotted_tokenizers:
        counts = [n for sentence in df[f"{name}_{lang}_token_len"] for n in sentence]
        avg_fertility.append(sum(counts) / len(counts) if counts else np.nan)

    # Plot Avg Subword Fertility
    ax.bar(x, avg_fertility, color=color, alpha=0.7)
    ax.set_ylabel("Avg Subword Fertility")
    ax.set_xticks(x)
    ax.set_xticklabels(tokenizer_names, rotation=90)
    ax.set_title(f"Avg Subword Fertility for {lang}")

# Add common x-axis label
fig.text(0.5, 0.04, "Tokenizers", ha="center", fontsize=12)

# Adjust layout
plt.tight_layout()

if save:
    save_path = os.path.join("output/image", "avg_fertility_subplots.png")
    # savefig does not create missing directories
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    plt.savefig(save_path, dpi=300, bbox_inches="tight")
    print(f"Saved: {save_path}")

plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os

save = False

# Keep a stable language order from the colors dict.
# A separate name is used so the global `languages` dict (FLORES code -> name) is not overwritten;
# the tokenization cell still needs it as a dict when re-run.
plot_languages = list(language_colors.keys())  # ["English", "German", "Korean"]

# Tokenizer names: full keys for data access, short names for labels.
# Tokenizers without computed columns (e.g. because loading failed) are left out.
tokenizer_names_full = [
    name for name in tokenizer_configs
    if all(f"{name}_{lang}_token_len" in df.columns for lang in plot_languages)
]
tokenizer_names_disp = [name.split("/")[-1] for name in tokenizer_names_full]

# Build fertility matrix: rows = tokenizers, cols = languages.
# Each cell is total subword tokens / total words; the per-word counts are flattened once per cell.
fertility_rows = []
for tok in tokenizer_names_full:
    row = []
    for lang in plot_languages:
        counts = [n for sentence in df[f"{tok}_{lang}_token_len"] for n in sentence]
        row.append(sum(counts) / len(counts) if counts else np.nan)
    fertility_rows.append(row)
# reshape keeps the array 2-D even when no tokenizer is available
fertility_matrix = np.array(fertility_rows, dtype=float).reshape(
    len(tokenizer_names_full), len(plot_languages)
)  # shape: (num_tokenizers, num_languages)

# X positions for each tokenizer group
x = np.arange(len(tokenizer_names_disp))
num_langs = len(plot_languages)
bar_width = 0.8 / num_langs  # total group width ~0.8

fig, ax = plt.subplots(figsize=(10, 6))

# Plot grouped bars: one loop per language (offset per language)
for j, lang in enumerate(plot_languages):
    vals = fertility_matrix[:, j]
    ax.bar(
        x + j * bar_width,
        vals,
        width=bar_width,
        label=lang,
        color=language_colors.get(lang, None),
        alpha=0.9,
        edgecolor="white",
        linewidth=0.5,
    )

# Labels, ticks, legend
ax.set_ylabel("Avg Subword Fertility")
ax.set_xlabel("Tokenizers")
# ax.set_title("Average Subword Fertility by Language")
# Center each tick under its group of bars
ax.set_xticks(x + (num_langs - 1) * bar_width / 2)
ax.set_xticklabels(tokenizer_names_disp, rotation=45, ha="right")
ax.legend(title="Language")
ax.grid(axis="y", linestyle="--", alpha=0.3)

plt.tight_layout()

if save:
    save_path = os.path.join("output/image", "avg_fertility_grouped.png")
    # savefig does not create missing directories
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    plt.savefig(save_path, dpi=300, bbox_inches="tight")
    print(f"Saved: {save_path}")

plt.show()



In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os

# Benchmark scores ordered like tokenizer_configs so the bars line up with the tick labels.
# .get(..., 0) is used because not every configured model has an entry in every score dict
# (e.g. "meta-llama/Llama-2-7b-chat-hf" is missing from the FLORES and MMMLU dicts);
# 0 is the same placeholder those dicts already use for models without a score.
tokenizer_names = list(tokenizer_configs.keys())
sorted_mmlupro = [tokenizer_mmlupro.get(tok, 0) for tok in tokenizer_names]
sorted_flores = [tokenizer_flores.get(tok, 0) for tok in tokenizer_names]
sorted_mmmlu = [tokenizer_mmmlu.get(tok, 0) for tok in tokenizer_names]

x = np.arange(len(tokenizer_names))  # Position for groups

# Bar width (ensures bars don't overlap)
bar_width = 0.25

# Create the bar plot
fig, ax = plt.subplots(figsize=(12, 6))

# Plot MMLUPro
ax.bar(x - bar_width, sorted_mmlupro, width=bar_width, color="green", label="MMLUPro", alpha=0.7)

# Plot Flores
ax.bar(x, sorted_flores, width=bar_width, color="blue", label="Flores", alpha=0.7)

# Plot MMMLU
ax.bar(x + bar_width, sorted_mmmlu, width=bar_width, color="orange", label="MMMLU", alpha=0.7)

# Add labels, title, and legend
ax.set_ylabel("Scores")
ax.set_xlabel("Tokenizers")
ax.set_title("Comparison of MMLUPro, Flores, and MMMLU Scores per Tokenizer")
ax.set_xticks(x)
ax.set_xticklabels(tokenizer_names, rotation=90)
ax.legend()

# Lay the figure out before saving so the saved image matches what is shown
plt.tight_layout()

# Save the plot if needed
save = True
if save:
    save_path = os.path.join("output/image", "mmlu_flores_mmmlu_comparison.png")
    # savefig does not create missing directories
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    plt.savefig(save_path, dpi=300, bbox_inches="tight")
    print(f"Saved: {save_path}")

# Show the plot
plt.show()

- issue: there is no single source that compares many models on the same dataset
- final decision: gemma-3-12b-it, Babel-9B-Chat

## Inspect Encoding Issue

In [None]:
# NOTE: this rebinds `df` to the previously exported CSV, replacing any in-memory frame of the same name.
df = pd.read_csv("/home/hyujang/multilingual-inner-lexicon/output/tokenizers_comparison.csv")
# Show only the columns whose name contains "Korean" (case-insensitive)
df.filter(regex="(?i)Korean")
# df['microsoft/Phi-4-mini-instruct_Korean_tokens'].tolist() # t
# import unicodedata
# text = unicodedata.normalize("NFKC", df['microsoft/Phi-4-mini-instruct_Korean_tokens'][0][2])
# text

In [None]:
# Show only the columns whose name contains "German" (case-insensitive)
df.filter(regex="(?i)German")


In [None]:
# Inspect how the Llama 3.1 tokenizer represents a Korean string at each stage
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct", use_fast=True, token=token_2, trust_remote_code=True)

sample_text = "안녕하세요"

# Token strings as produced by tokenize()
print(tokenizer.tokenize(sample_text))

# Token ids, and the text obtained by decoding them again
token_ids = tokenizer.encode(sample_text)
print(token_ids)
print(tokenizer.decode(token_ids))

# Token strings looked up from the ids, first as a list and then space-joined
tokens = tokenizer.convert_ids_to_tokens(token_ids)
print(tokens)
decoded_tokens = " ".join(tokens)
print(decoded_tokens)

# UTF-8 bytes of a literal token-like string
# NOTE(review): presumably one of the token strings printed above - confirm
print('ìķĪ'.encode("utf-8"))