In [None]:
import sys

sys.path.append("..")

import glob
import os
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import pyrootutils
import seaborn as sns
from tokenizers import Tokenizer

from src.tokenizer_metrics import (
    AlignmentWithCDI,
    AverageTokenLength,
    CorrespondenceWithMorphemes,
    CorrespondenceWithWords,
    SingleTokenizerMetric,
    SplitsIntoMorphemes,
    SplitsOnSpace,
    TokenizerOverlap,
)

In [None]:
# Resolve the project root with pyrootutils, starting from the current working
# directory and using ".project-root" as the indicator.
PROJECT_ROOT = pyrootutils.find_root(
    search_from=os.path.abspath(""),
    indicator=".project-root",
)
# `path` is a second name bound to the very same root object.
path = PROJECT_ROOT

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
def get_tokenizers_from_dir(dir: Path, name: str = None) -> list[Tokenizer]:
    """Load every numbered tokenizer checkpoint stored in a run directory.

    Checkpoints are files named ``<index>-tokenizer.json`` located directly in
    ``PROJECT_ROOT / dir``. The indices actually present on disk are used, so a
    gap in the numbering, or a stray file that merely starts with a digit, does
    not cause a wrong or missing file to be opened.

    Args:
        dir: Run directory relative to ``PROJECT_ROOT`` (a plain ``str`` is
            accepted as well, since it is only joined onto ``PROJECT_ROOT``).
        name: Label prefix for the loaded tokenizers. When omitted, the last
            path component of the run directory is used instead of the
            literal text ``"None"``.

    Returns:
        The loaded ``Tokenizer`` objects in ascending index order, each tagged
        with a ``name`` attribute of the form ``"<label>-<index>"``. The list is
        empty when the directory holds no matching checkpoint.
    """
    suffix = "-tokenizer.json"
    partial_path = PROJECT_ROOT / dir
    label = name if name is not None else partial_path.name

    # Collect (index, file) pairs; the glob only guarantees a leading digit, so
    # the part before the suffix is checked to be a whole number.
    checkpoints = []
    for checkpoint in partial_path.glob(f"[0-9]*{suffix}"):
        index_text = checkpoint.name[: -len(suffix)]
        if index_text.isdecimal():
            checkpoints.append((int(index_text), checkpoint))
    # Numeric sort: a lexicographic sort would place "10" before "2".
    checkpoints.sort(key=lambda pair: pair[0])

    loaded = []
    for index, checkpoint in checkpoints:
        tokenizer = Tokenizer.from_file(str(checkpoint))
        tokenizer.name = f"{label}-{index}"
        loaded.append(tokenizer)

    return loaded

In [None]:
# Tokenizer families to compare: label -> run directory (relative to the
# project root) containing that run's numbered tokenizer checkpoints.
tkz_dict = {
    "LM-merge-1": "outputs/2024-05-06-125114_93f0",
    "LM-nomerge-1": "outputs/2024-05-06-125115_9621",
    "LM-merge-2": "outputs/2024-05-06-125411_bfc9",
    "LM-nomerge-2": "outputs/2024-05-06-125411_90aa",
    "LM-merge-3": "outputs/2024-05-06-125412_5c63",
    "LM-nomerge-3": "outputs/2024-05-06-125411_10c9",
    "LM-merge-4": "outputs/2024-05-06-125411_e5c5",
    "LM-nomerge-4": "outputs/2024-05-06-125411_e6e7",
    "LM-merge-5": "outputs/2024-05-06-125411_1d73",
    "BPE-nomerge-retrain": "outputs/bpe-tokenizers/2024-05-10-114128_b1a1",
    "BPE-nomerge-noretrain": "outputs/bpe-tokenizers/2024-05-10-115123_88af",
    "BPE-merge-retrain": "outputs/bpe-tokenizers/2024-05-10-120039_8a27",
    "BPE-merge-noretrain": "outputs/bpe-tokenizers/2024-05-10-123454_58e0",
}

# Load every checkpoint of every family. The directory variable is not called
# `dir`: at notebook scope that would replace the `dir()` builtin for the rest
# of the kernel session.
tokenizers = {
    name: get_tokenizers_from_dir(run_dir, name=name)
    for name, run_dir in tkz_dict.items()
}

# Fail early with a readable message; otherwise an empty run only shows up as
# an IndexError when the last checkpoint is accessed below.
empty_runs = [name for name, ts in tokenizers.items() if not ts]
if empty_runs:
    raise FileNotFoundError(f"No tokenizer checkpoints found for: {empty_runs}")

# One summary row per family: number of checkpoints loaded, vocabulary size of
# the last checkpoint, and whether the label lacks the "nomerge" marker.
tkz_info = [
    {
        "name": name,
        "n_epochs": len(ts),
        "final_vocab_size": ts[-1].get_vocab_size(),
        "merge_across_spaces": "nomerge" not in name,
    }
    for name, ts in tokenizers.items()
]

tkz_info_df = pd.DataFrame(tkz_info)
tkz_info_df

In [None]:
# Mean final vocabulary size and mean checkpoint count, grouped by whether the
# family merges across spaces.
(
    tkz_info_df
    .groupby("merge_across_spaces")[["final_vocab_size", "n_epochs"]]
    .mean()
)

In [None]:
# Metrics that take the whole list of a family's tokenizers at once.
multitokenizer_metrics = [AlignmentWithCDI, TokenizerOverlap]
metric_names = [metric_cls.__qualname__ for metric_cls in multitokenizer_metrics]

# family label -> one score per metric, in the same order as `metric_names`.
scores = {
    tokenizer_type: [
        metric_cls(tokenizer_list).calculate()
        for metric_cls in multitokenizer_metrics
    ]
    for tokenizer_type, tokenizer_list in tokenizers.items()
}

In [None]:
# Flatten {family: [score per metric]} into one record per (family, metric).
graph_data = [
    {
        "tokenizer": tknzr,
        "metric": metric_label,
        "score": scores[tknzr][mi],
    }
    for tknzr in scores
    for mi, metric_label in enumerate(metric_names)
]

# Families whose label does not contain "nomerge" are flagged as merging
# across spaces.
graph_data_df = pd.DataFrame(graph_data).assign(
    merge_across_spaces=lambda frame: ~frame["tokenizer"].str.contains("nomerge")
)

graph_data_df

In [None]:
# Bar chart of each multi-tokenizer metric, with one bar colour per value of
# `merge_across_spaces`.
sns.catplot(
    data=graph_data_df,
    x="metric",
    y="score",
    hue="merge_across_spaces",
    kind="bar",
)

In [None]:
def plot_comparison(metric: dict[str, list[float]], metric_name: str) -> None:
    """Draw one line per tokenizer family showing a score across iterations.

    Args:
        metric: Maps a tokenizer-family label to its sequence of scores. Each
            sequence is plotted against its positional index and labelled
            with the family name in the legend.
        metric_name: Text used as the figure title.
    """
    # An explicit figure/axes pair avoids depending on pyplot's implicit
    # "current figure"; a freshly created figure needs no clearing.
    _, ax = plt.subplots()
    for tok_type, values in metric.items():
        ax.plot(values, label=tok_type)
    ax.set_ylabel("Score")
    ax.set_xlabel("Tokenizer Iteration")
    ax.set_title(metric_name)
    if metric:
        # Skip the legend when nothing was plotted; matplotlib would otherwise
        # warn that there are no labelled artists.
        ax.legend()
    plt.show()


def calculate_and_plot(metric_name: type[SingleTokenizerMetric], **args) -> None:
    """Score every loaded tokenizer with one metric and plot the results.

    Reads the notebook-level ``tokenizers`` mapping (family label -> list of
    tokenizers) and hands the per-family score lists to ``plot_comparison``.

    Args:
        metric_name: The metric *class* (despite the parameter name, not a
            string). It is instantiated as ``metric_name(tokenizer, **args)``
            and ``calculate()`` is called on the instance.
        **args: Extra keyword arguments forwarded to the metric constructor.
    """
    values = {
        tok_type: [metric_name(tok, **args).calculate() for tok in tok_list]
        for tok_type, tok_list in tokenizers.items()
    }

    # Put the forwarded options in the title so that two plots of the same
    # metric with different settings can be told apart.
    title = metric_name.__qualname__
    if args:
        options = ", ".join(f"{key}={value!r}" for key, value in args.items())
        title = f"{title} ({options})"
    plot_comparison(values, metric_name=title)

In [None]:
# percentage of spaces that correspond to token boundaries
# Scored for every family in `tokenizers` through the shared helper, so the
# cell does not rely on one specific family label having been loaded and does
# not rebind the notebook-level `scores` mapping.
calculate_and_plot(SplitsOnSpace)

In [None]:
# percentage of tokenized boundaries that correspond to actual spaces
# Scored for every family in `tokenizers` through the shared helper, so the
# cell does not rely on one specific family label having been loaded and does
# not rebind the notebook-level `scores` mapping.
calculate_and_plot(SplitsOnSpace, baseline="tokenized")

In [None]:
# Plot AverageTokenLength per tokenizer iteration, one line per family.
calculate_and_plot(AverageTokenLength)

In [None]:
# Plot CorrespondenceWithMorphemes per tokenizer iteration, one line per family.
calculate_and_plot(CorrespondenceWithMorphemes)

In [None]:
# Plot CorrespondenceWithWords per tokenizer iteration, one line per family.
calculate_and_plot(CorrespondenceWithWords)

In [None]:
# Plot SplitsIntoMorphemes per tokenizer iteration; `metric="count"` is
# forwarded unchanged to the metric's constructor.
calculate_and_plot(SplitsIntoMorphemes, metric="count")

In [None]:
# Plot SplitsIntoMorphemes per tokenizer iteration; `metric="distance"` is
# forwarded unchanged to the metric's constructor.
calculate_and_plot(SplitsIntoMorphemes, metric="distance")