In [None]:
import glob
import os
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import pyrootutils
import seaborn as sns
from tokenizers import Tokenizer

from src.tokenizer_metrics import (
    AlignmentWithCDI,
    AverageTokenLength,
    CorrespondenceWithMorphemes,
    CorrespondenceWithWords,
    SingleTokenizerMetric,
    SplitsIntoMorphemes,
    SplitsOnSpace,
    TokenizerOverlap,
)

In [None]:
# Locate the project root via pyrootutils, using the ".project-root" indicator
# and starting the search from the current working directory.
path = pyrootutils.find_root(
    search_from=os.path.abspath(""),
    indicator=".project-root",
)
# Both names are bound to the same root object; PROJECT_ROOT is the one the
# rest of the notebook reads.
PROJECT_ROOT = path

In [None]:
%matplotlib inline

In [None]:
def get_tokenizers_from_dir(dir: "str | Path", name: "str | None" = None):
    """Load every ``<index>-tokenizer.json`` file stored in a run directory.

    Only files whose prefix before ``-tokenizer.json`` is a plain integer are
    loaded, and they are returned in ascending numeric order. The indices that
    actually exist on disk are used, so a gap in the numbering no longer makes
    the load fail on a missing file.

    Args:
        dir: Run directory, relative to ``PROJECT_ROOT``.
        name: Label prefix for the loaded tokenizers. When omitted, the final
            component of the run directory is used instead of producing
            labels such as ``"None-0"``.

    Returns:
        A list of ``Tokenizer`` objects, each with a ``name`` attribute set to
        ``f"{name}-{index}"``. The list is empty if no matching file exists.
    """
    suffix = "-tokenizer.json"
    partial_path = PROJECT_ROOT / dir
    if name is None:
        name = partial_path.name

    # Collect (index, path) pairs, ignoring files whose prefix is not purely
    # numeric (e.g. "1abc-tokenizer.json").
    indexed_files = []
    for file_path in partial_path.glob(f"*{suffix}"):
        index_text = file_path.name[: -len(suffix)]
        if index_text.isascii() and index_text.isdigit():
            indexed_files.append((int(index_text), file_path))
    # Numeric sort so that 10 comes after 9 rather than after 1.
    indexed_files.sort(key=lambda pair: pair[0])

    tokenizers = []
    for index, file_path in indexed_files:
        tokenizer_base = Tokenizer.from_file(str(file_path))
        tokenizer_base.name = f"{name}-{index}"
        tokenizers.append(tokenizer_base)

    return tokenizers

In [None]:
# Tokenizer family label -> run directory (relative to PROJECT_ROOT).
# The label doubles as the `name` prefix given to each loaded tokenizer.
#
# Disabled runs, kept for reference (label -> directory):
#   "BPE 10 splits no retrain" -> outputs/bpe-tokenizers/2024-05-04-144627_4d11
#   "BPE 10 splits retrain"    -> outputs/bpe-tokenizers/2024-05-04-144527_93b6
#   "Nospace"                  -> outputs/bpe-tokenizers/2024-05-04-144603_489b
#   "AGGHGHG"                  -> outputs/2024-05-04-122039_2270
TOKENIZER_RUN_DIRS = {
    "LM-10000x20": "outputs/2024-05-04-221644_5b2e",
    "LM-10000x20-nosplitting": "outputs/2024-05-04-233927_9728",
    "LM-10000x20-splitting": "outputs/2024-05-05-005805_dcb3",
    "BPE-retrain-10000x20": "outputs/bpe-tokenizers/2024-05-04-232043_a9cd",
    "BPE-noretrain-10000x20": "outputs/bpe-tokenizers/2024-05-04-232345_4c17",
    "BPE-noretrain-10000x20-nosplitting": "outputs/bpe-tokenizers/2024-05-04-232529_dd8e",
}

# Load each family in declaration order; insertion order drives the order of
# the printout below and of later iteration over `tokenizers`.
tokenizers = {}
for run_label, run_dir in TOKENIZER_RUN_DIRS.items():
    tokenizers[run_label] = get_tokenizers_from_dir(run_dir, name=run_label)

# One line per family: label, number of tokenizers, vocabulary size of each.
print("Vocab Sizes:")
for run_label, tokenizer_group in tokenizers.items():
    vocab_sizes = [tok.get_vocab_size() for tok in tokenizer_group]
    print(run_label, len(tokenizer_group), vocab_sizes)

In [None]:
# Metrics that are constructed from a whole list of tokenizers at once.
multitokenizer_metrics = [AlignmentWithCDI, TokenizerOverlap]
metric_names = [metric_cls.__qualname__ for metric_cls in multitokenizer_metrics]

# scores[family] holds one value per metric, in the same order as metric_names.
scores = {
    tokenizer_type: [
        metric_cls(tokenizer_list).calculate()
        for metric_cls in multitokenizer_metrics
    ]
    for tokenizer_type, tokenizer_list in tokenizers.items()
}

In [None]:
# Flatten `scores` into long form: one record per (tokenizer family, metric).
graph_data = [
    {
        "tokenizer": tokenizer_type,
        "metric": metric_label,
        "score": scores[tokenizer_type][metric_idx],
    }
    for tokenizer_type in scores
    for metric_idx, metric_label in enumerate(metric_names)
]
graph_data_df = pd.DataFrame(graph_data)
graph_data_df

In [None]:
# Grouped bar chart: one cluster per metric, one bar colour per tokenizer family.
sns.catplot(
    data=graph_data_df,
    kind="bar",
    x="metric",
    y="score",
    hue="tokenizer",
)

In [None]:
def plot_comparison(metric: "dict[str, list[float]]", metric_name: str):
    """Draw one line per tokenizer family on a fresh figure and show it.

    Args:
        metric: Maps a tokenizer-family label to its series of scores, one
            entry per tokenizer iteration (presumably numeric; the values are
            passed straight to ``Axes.plot``).
        metric_name: Text used as the figure title.
    """
    # A new figure/axes pair per call, so successive calls never draw on top
    # of each other and no explicit clearing is needed.
    _, ax = plt.subplots()
    for tok_type, values in metric.items():
        ax.plot(values, label=tok_type)
    ax.set_ylabel("Score")
    ax.set_xlabel("Tokenizer Iteration")
    ax.set_title(metric_name)
    # Only build a legend when there is at least one labelled line.
    if metric:
        ax.legend()
    plt.show()


def calculate_and_plot(metric_name: SingleTokenizerMetric, **args):
    """Score every loaded tokenizer with one metric and plot the results.

    Args:
        metric_name: The metric class itself (not a string). It is called as
            ``metric_name(tokenizer, **args)`` and the resulting object's
            ``calculate()`` provides the score.
        **args: Extra keyword arguments forwarded to the metric constructor.

    Reads the notebook-level ``tokenizers`` dict and plots one line per
    tokenizer family, titled with the metric class's ``__qualname__``.
    """
    values = {
        tok_type: [metric_name(tok, **args).calculate() for tok in tok_list]
        for tok_type, tok_list in tokenizers.items()
    }
    plot_comparison(values, metric_name=metric_name.__qualname__)

In [None]:
# percentage of spaces that correspond to token boundaries
#
# The key must be one that the loading cell actually adds to `tokenizers`;
# "BPE-noretrain-1000x20-nosplitspace" is never added and raised KeyError.
# NOTE(review): assumed the BPE run trained without splitting is the intended
# "no space" tokenizer family -- confirm.
nospace_key = "BPE-noretrain-10000x20-nosplitting"
# Dedicated name so the per-family `scores` dict from the multi-tokenizer
# metrics cell is not overwritten by a plain list.
space_split_scores = [
    SplitsOnSpace(tok).calculate() for tok in tokenizers[nospace_key]
]
plot_comparison({"no space": space_split_scores}, metric_name=SplitsOnSpace.__name__)

In [None]:
# percentage of tokenized boundaries that correspond to actual spaces
#
# The key must be one that the loading cell actually adds to `tokenizers`;
# "Nospace" is only present in commented-out code and raised KeyError.
# NOTE(review): assumed the BPE run trained without splitting is the intended
# "no space" tokenizer family -- confirm.
nospace_key = "BPE-noretrain-10000x20-nosplitting"
# Dedicated name so the per-family `scores` dict from the multi-tokenizer
# metrics cell is not overwritten by a plain list.
tokenized_boundary_scores = [
    SplitsOnSpace(tok, baseline="tokenized").calculate()
    for tok in tokenizers[nospace_key]
]
plot_comparison(
    {"no space": tokenized_boundary_scores}, metric_name=SplitsOnSpace.__name__
)

In [None]:
# Score every tokenizer in `tokenizers` with AverageTokenLength and plot one
# line per tokenizer family.
calculate_and_plot(AverageTokenLength)

In [None]:
# Score every tokenizer in `tokenizers` with CorrespondenceWithMorphemes and
# plot one line per tokenizer family.
calculate_and_plot(CorrespondenceWithMorphemes)

In [None]:
# Score every tokenizer in `tokenizers` with CorrespondenceWithWords and plot
# one line per tokenizer family.
calculate_and_plot(CorrespondenceWithWords)

In [None]:
# Score every tokenizer with SplitsIntoMorphemes, forwarding metric="count" to
# the metric constructor, and plot one line per tokenizer family.
calculate_and_plot(SplitsIntoMorphemes, metric="count")

In [None]:
# Score every tokenizer with SplitsIntoMorphemes, forwarding metric="distance"
# to the metric constructor, and plot one line per tokenizer family.
calculate_and_plot(SplitsIntoMorphemes, metric="distance")