# General Setup & Helper Functions

In [1]:
from dataclasses import dataclass
from typing import Final, Iterable, Optional, Sequence, Union

import numpy as np
from scipy.stats import shapiro, ttest_1samp, ttest_ind  # t

In [2]:
ALTERNATIVES = ("two-sided", "less", "greater")
ALPHA = 0.05

In [3]:
@dataclass
class Sample:
    """Hold a series of measurements together with their basic statistics.

    Callers normally pass only ``values`` (and optionally ``ddof``). All other
    fields are derived in ``__post_init__`` and overwrite whatever was passed.

    Attributes:
        values: The raw measurements exactly as given by the caller.
        array: ``values`` converted to a NumPy array.
        ddof: Delta degrees of freedom used for ``std`` and ``variance``.
        mean: Arithmetic mean of ``array``.
        std: Standard deviation of ``array`` (using ``ddof``).
        variance: Variance of ``array`` (using ``ddof``); also available
            under the older attribute name ``var``.
    """

    values: Sequence[float]
    array: Optional[np.ndarray] = None
    ddof: int = 1
    mean: Optional[float] = None
    std: Optional[float] = None
    variance: Optional[float] = None

    def __post_init__(self):
        """Derive the array and the statistics from ``values``."""
        self.array = np.array(self.values)
        # Fill the declared ``variance`` field; previously only an undeclared
        # ``var`` attribute was set and ``variance`` stayed None.
        self.mean, self.std, self.variance = self.get_basic_stats()
        # Keep ``var`` as an alias so existing code reading it still works.
        self.var = self.variance

    def get_basic_stats(self) -> tuple[float, float, float]:
        """Return ``(mean, std, variance)`` of ``array`` using ``ddof``."""
        mean = np.mean(self.array)
        std = np.std(self.array, ddof=self.ddof)
        var = np.var(self.array, ddof=self.ddof)
        return mean, std, var

    def __len__(self):
        """Return the number of measurements."""
        return len(self.array)

    def __repr__(self):
        return f"Sample({self.values})"

    def __str__(self):
        return f"{self.array}"


# A metric value is either a single measurement (float) or a Sample of several runs.
MetricValue = Union[float, Sample]
# Maps a name to a metric value: keyed by tokenizer name in the raw per-metric
# dicts, and by metric name inside a TokenizerResults entry.
Metric = dict[str, MetricValue]
# Maps a tokenizer name to its metrics (metric name -> value).
TokenizerResults = dict[str, Metric]


def build_tokenizer_results(
    tokenizer_names: Sequence[str],
    **kwargs,
) -> TokenizerResults:
    """Build a ``tokenizer -> {metric -> value}`` mapping from raw experiment data.

    Args:
        tokenizer_names: Names of all tokenizers that took part in the experiment.
        **kwargs: One keyword per metric; the keyword is the metric name and
            the value is a dict mapping tokenizer name to a float or Sample.

    Returns:
        A dict with one entry per tokenizer, each holding that tokenizer's
        value for every metric passed in ``kwargs``.

    Raises:
        AssertionError: If a metric is not a dict, does not have one entry per
            tokenizer, names an unknown tokenizer, or holds a value that is
            neither a float nor a Sample.
    """
    for metric_name, metric in kwargs.items():
        # Check the type before calling len(): otherwise a non-dict metric
        # fails with an unrelated TypeError or a misleading "number of
        # metrics" message instead of the intended one.
        assert isinstance(metric, dict), f"Incorrect metric {metric} in {metric_name}"
        assert len(metric) == len(
            tokenizer_names
        ), f"Incorrect number of metrics for {metric_name}"
        for tokenizer_name, metric_value in metric.items():
            assert (
                tokenizer_name in tokenizer_names
            ), f"Incorrect tokenizer {tokenizer_name} in {metric_name}"
            assert isinstance(
                metric_value, (float, Sample)
            ), f"Incorrect metric value {metric_value} in {metric_name}"

    # Every tokenizer gets an entry, even when no metrics were passed.
    results: TokenizerResults = {
        tokenizer_name: {} for tokenizer_name in tokenizer_names
    }

    # Transpose metric -> tokenizer -> value into tokenizer -> metric -> value.
    for metric_name, metric in kwargs.items():
        for tokenizer_name, metric_value in metric.items():
            results[tokenizer_name][metric_name] = metric_value

    return results


def get_mean_value(metric_value: MetricValue) -> float:
    """Return the mean of a Sample, or the value itself if it is a float.

    Args:
        metric_value: A single float measurement or a Sample.

    Returns:
        The float itself, or the ``mean`` attribute of the Sample.

    Raises:
        TypeError: If ``metric_value`` is neither a float nor Sample-like.
    """
    if isinstance(metric_value, float):
        return metric_value
    if isinstance(metric_value, Sample):
        return metric_value.mean

    # In a notebook, re-running the cell that defines Sample creates a new
    # class object, so instances built earlier fail the isinstance check
    # above. Accept anything that carries a float ``mean`` attribute instead
    # of silently returning None, which only fails later in sorting/formatting.
    mean = getattr(metric_value, "mean", None)
    if isinstance(mean, float):
        return mean

    raise TypeError(
        f"metric_value must be either float or Sample, but has type {type(metric_value)}"
    )


def print_basic_stats(
    results: TokenizerResults, sort_by: str = "fcd", reverse: bool = False
):
    """Print every tokenizer's metrics, ordered by the mean of one metric.

    Args:
        results: Mapping of tokenizer name to its metrics.
        sort_by: Name of the metric whose mean value determines the order.
        reverse: Sort in descending instead of ascending order.

    Raises:
        TypeError: If a metric value is neither a float nor a Sample.
    """
    order = "descending" if reverse else "ascending"
    print(
        f"Printing basic stats for {len(results)} tokenizers sorted by {sort_by.upper()} in {order.upper()} order..."
    )

    def sort_key(item):
        # item is a (tokenizer_name, metrics) pair
        return get_mean_value(item[1][sort_by])

    ranked = sorted(results.items(), key=sort_key, reverse=reverse)
    for name, metrics in ranked:
        print(f"\n*** {name.upper()} ***")
        for metric_name, value in metrics.items():
            if isinstance(value, float):
                line = f"Metric: {metric_name:15s} Single value: {value:.3f}"
            elif isinstance(value, Sample):
                line = f"Metric: {metric_name:15s} Mean:         {value.mean:.3f}   Std.dev. {value.std:.3f}"
            else:
                raise TypeError("metric must be a dict[str, dict[float|Sample]]")
            print(line)


def get_best_tokenizer(
    results: TokenizerResults,
    sort_by: str = "fcd",
    reverse: bool = False,
    with_sample: bool = True,
) -> tuple[str, MetricValue]:
    """Determine the best performing tokenizer for one metric.

    Tokenizers are sorted by the mean of ``sort_by`` and the LAST entry of
    that order is taken as the best one. Pass ``reverse=True`` for metrics
    where lower is better, so that the lowest value ends up last.

    Args:
        results: Mapping of tokenizer name to its metrics.
        sort_by: Name of the metric used for ranking.
        reverse: Sort in descending instead of ascending order.
        with_sample: If true, only tokenizers whose ``sort_by`` value is not a
            plain float (i.e. a Sample) are eligible.

    Returns:
        ``(tokenizer_name, metric_value)`` of the selected tokenizer.

    Raises:
        ValueError: If no tokenizer is eligible (empty ``results``, or
            ``with_sample`` is set and no tokenizer has a Sample).
    """
    ranked = sorted(
        results, key=lambda name: get_mean_value(results[name][sort_by]), reverse=reverse
    )
    if with_sample:
        eligible = set(get_tokenizers_with_sample(results, criterion=sort_by))
        candidates = [name for name in ranked if name in eligible]
    else:
        candidates = ranked

    # The previous backwards-walking loop ran past the start of the list with
    # an opaque IndexError when nothing qualified; fail with a clear message.
    if not candidates:
        raise ValueError(f"No eligible tokenizer found for metric {sort_by!r}")

    best_tokenizer = candidates[-1]
    return best_tokenizer, results[best_tokenizer][sort_by]


def get_tokenizers_with_sample(
    results: TokenizerResults, criterion: str = "fcd"
) -> list[str]:
    """List the tokenizers whose value for ``criterion`` is not a plain float.

    Anything that is not a float counts as a Sample here; the check is
    deliberately written as "not float" rather than "is Sample".
    """
    return [
        name
        for name, metrics in results.items()
        if not isinstance(metrics[criterion], float)
    ]


def get_fcd_guacamol(fcd: Metric) -> Metric:
    """Convert raw FCD values to the 'GuacaMol style' score ``exp(-0.2 * FCD)``.

    Floats are converted directly; for a Sample every value is converted and a
    new Sample is built from the results.

    Raises:
        TypeError: If a value is neither a float nor a Sample.
    """

    def to_guacamol(raw_fcd):
        return np.exp(-0.2 * raw_fcd)

    converted: Metric = {}
    for name, value in fcd.items():
        if isinstance(value, float):
            converted[name] = to_guacamol(value)
        elif isinstance(value, Sample):
            converted[name] = Sample([to_guacamol(single) for single in value.values])
        else:
            raise TypeError("Wrong type for fcd")
    return converted


def print_comparison(
    tokenizer_name: str,
    results: TokenizerResults,
    criterion: str = "fcd",
    alternative: str = "two_sided",
    reverse: bool = False,
):
    """Compare one tokenizer against all tokenizers with statistical tests.

    For every tokenizer in ``results`` (including ``tokenizer_name`` itself),
    a t-test of the reference tokenizer's Sample against that tokenizer's
    metric value is run and the result is printed.

    Args:
        tokenizer_name: The reference ("best") tokenizer; its value for
            ``criterion`` must be a Sample, since its mean and std are printed.
        results: Mapping of tokenizer name to its metrics.
        criterion: Name of the metric to compare.
        alternative: Alternative hypothesis: "two-sided" (the spelling
            "two_sided" is accepted as well), "less" or "greater".
        reverse: Print the comparisons in descending instead of ascending
            order of the metric mean.
    """
    # scipy only knows the hyphenated "two-sided"; the underscore spelling
    # used as the default here would be rejected with a ValueError.
    alternative = alternative.replace("_", "-")

    reverse_string = "descending" if reverse else "ascending"
    print("\n*** Perform statistical tests ***")
    print(
        f"Compare metric {criterion.upper()} with other tokenizers (in {reverse_string} order)"
    )
    print(f"Alternative: {alternative.upper()}")
    tokenizer_sample = results[tokenizer_name][criterion]
    print(
        f"Best tokenizer: {tokenizer_name} with metric mean {tokenizer_sample.mean:.3f} and std.dev. {tokenizer_sample.std:.3f}"
    )
    for tokenizer, metrics in sorted(
        results.items(), key=lambda x: get_mean_value(x[1][criterion]), reverse=reverse
    ):
        metric = metrics[criterion]
        print(f"\n{tokenizer:20s} with metric mean {get_mean_value(metric):.3f}")
        t_stat, p_val = t_test(tokenizer_sample, metric, alternative)
        print(
            f"t-statistic: {t_stat:6.3f}, "
            f"p-value: {p_val:6.3f}, "
            f"Reject H0 (p<{ALPHA}): {p_val < ALPHA}"
        )

# Statistical Tests Setup

In [4]:
def is_normally_distributed(sample: Sample, alpha: float = ALPHA):
    """Run a Shapiro-Wilk test on the sample.

    Returns a true value when the test's p-value is above ``alpha``, i.e.
    when normality is not rejected.
    """
    shapiro_result = shapiro(sample.array)
    # Index 1 of the result is the p-value (index 0 is the test statistic).
    return shapiro_result[1] > alpha

In [5]:
def one_sample_t_test(
    sample: Sample, population_mean: float, alternative: str = "two_sided"
) -> tuple[float, float]:
    """Perform a one-sample Student's t-test against a fixed population mean.

    If the sample does not pass the Shapiro-Wilk normality check, a warning
    is printed, but the t-test is run regardless.

    Args:
        sample: The measurements to test.
        population_mean: The hypothesised mean to compare against.
        alternative: Alternative hypothesis: "two-sided" (the spelling
            "two_sided" is accepted as well), "less" or "greater".

    Returns:
        ``(test_statistic, p_value)`` as computed by ``scipy.stats.ttest_1samp``.
    """
    # scipy only knows the hyphenated "two-sided"; the underscore spelling
    # used as the default here would be rejected with a ValueError.
    alternative = alternative.replace("_", "-")

    if not is_normally_distributed(sample):
        print("Warning: sample is not normally distributed!")

    test_statistic, p_value = ttest_1samp(
        sample.array, population_mean, alternative=alternative
    )
    return test_statistic, p_value

In [6]:
def two_sample_t_test(
    sample: Sample, baseline: Sample, alternative: str = "two_sided"
) -> tuple[float, float]:
    """Perform a two-sample Student's t-test (or Welch's t-test).

    Welch's t-test (unequal variances) is used when the standard deviations
    of the two samples differ by more than a factor of 2; otherwise the
    standard Student's t-test with pooled variance is used. If either sample
    fails the Shapiro-Wilk normality check, a warning is printed, but the
    test is run regardless.

    Args:
        sample: The measurements of interest.
        baseline: The measurements to compare against.
        alternative: Alternative hypothesis about ``sample`` relative to
            ``baseline``: "two-sided" (the spelling "two_sided" is accepted
            as well), "less" or "greater".

    Returns:
        ``(test_statistic, p_value)`` as computed by ``scipy.stats.ttest_ind``.
    """
    # scipy only knows the hyphenated "two-sided"; the underscore spelling
    # used as the default here would be rejected with a ValueError.
    alternative = alternative.replace("_", "-")

    if sample.std > 2.0 * baseline.std or baseline.std > 2.0 * sample.std:
        print("Warning: standard deviations differ by more than a factor of 2")
        print("Therefore, we do a Welch's t-test instead of a Student's t-test!")
        equal_var = False
    else:
        equal_var = True

    if not is_normally_distributed(sample):
        print("Warning: sample is not normally distributed!")
    if not is_normally_distributed(baseline):
        print("Warning: baseline is not normally distributed!")

    test_statistic, p_value = ttest_ind(
        sample.array, baseline.array, equal_var=equal_var, alternative=alternative
    )
    return test_statistic, p_value

In [7]:
def interpret_p_value(
    p_value: float,
    alpha: float = ALPHA,
) -> str:
    """Translate a p-value into a verdict on the null hypothesis.

    Returns "Reject H0" when ``p_value`` is below ``alpha`` and
    "Cannot reject H0" otherwise.
    """
    return "Reject H0" if p_value < alpha else "Cannot reject H0"

In [8]:
def t_test(
    sample: Sample, comparison: Union[float, Sample], alternative: str = "two_sided"
):
    """Run a one-sample or two-sample t-test, depending on ``comparison``.

    Args:
        sample: The measurements of interest.
        comparison: A float (one-sample test against that value) or a Sample
            (two-sample test against those measurements).
        alternative: Alternative hypothesis: "two-sided" (the spelling
            "two_sided" is accepted as well), "less" or "greater".

    Returns:
        ``(t_statistic, p_value)`` of the selected test.

    Raises:
        ValueError: If ``comparison`` is neither a float nor a Sample.
    """
    # scipy only knows the hyphenated "two-sided"; the underscore spelling
    # used as the default here would be rejected with a ValueError.
    alternative = alternative.replace("_", "-")

    if isinstance(comparison, float):
        t_statistic, p_value = one_sample_t_test(sample, comparison, alternative)
    elif isinstance(comparison, Sample):
        t_statistic, p_value = two_sample_t_test(sample, comparison, alternative)
    else:
        raise ValueError("comparison must be a float or a Sample")

    return t_statistic, p_value

## Example Usage

In [9]:
# Demo: run the Shapiro-Wilk normality check on a small hand-written sample.
print("Shapiro-Wilk test example")
sample = Sample([2.5, 3.1, 2.8, 3.4, 2.9, 3.0, 3.3, 2.6, 3.2, 3.1])
# A true result means normality was not rejected at the default ALPHA.
result = is_normally_distributed(sample)
print(f"Sample is normally distributed: {result}")

Shapiro-Wilk test example
Sample is normally distributed: True


In [10]:
# Demo: one-sample t-test of a small sample against a fixed population mean,
# once for every supported alternative hypothesis.
print("One sample t-test example")
sample = Sample([10.0, 11.0, 12.0, 13.0, 14.0])
population_mean = 14.0
print(f"Sample: {sample}")
print(f"Population mean: {population_mean}")
for alternative in ALTERNATIVES:
    # t_test dispatches to the one-sample test because population_mean is a float.
    t_stat, p_val = t_test(sample, population_mean, alternative)
    print(
        f"Alternative: {alternative:9s}, "
        f"t-statistic: {t_stat:.3f}, "
        # Use ALPHA instead of a hard-coded 0.05 so the message and the
        # decision always refer to the same significance level.
        f"p-value: {p_val:.3f}, "
        f"Reject H0 (p<{ALPHA}): {p_val < ALPHA}"
    )

# Demo: two-sample t-test of a sample against a baseline sample.
print("\nTwo sample t-test example")
sample = Sample(np.array([10.0, 11.0, 12.0]))
baseline = Sample(np.array([12.0, 13.0, 14.0]))
print(f"Sample: {sample}")
print(f"Baseline: {baseline}")
for alternative in ALTERNATIVES:
    # t_test dispatches to the two-sample test because baseline is a Sample.
    t_stat, p_val = t_test(sample, baseline, alternative)
    print(
        f"Alternative: {alternative:9s}, "
        f"t-statistic: {t_stat:.3f}, "
        f"p-value: {p_val:.3f}, "
        f"Reject H0 (p<{ALPHA}): {p_val < ALPHA}"
    )

One sample t-test example
Sample: [10. 11. 12. 13. 14.]
Population mean: 14.0
Alternative: two-sided, t-statistic: -2.828, p-value: 0.047, Reject H0 (p<0.05): True
Alternative: less     , t-statistic: -2.828, p-value: 0.024, Reject H0 (p<0.05): True
Alternative: greater  , t-statistic: -2.828, p-value: 0.976, Reject H0 (p<0.05): False

Two sample t-test example
Sample: [10. 11. 12.]
Baseline: [12. 13. 14.]
Alternative: two-sided, t-statistic: -2.449, p-value: 0.070, Reject H0 (p<0.05): False
Alternative: less     , t-statistic: -2.449, p-value: 0.035, Reject H0 (p<0.05): True
Alternative: greater  , t-statistic: -2.449, p-value: 0.965, Reject H0 (p<0.05): False


# Experiment Results

## Guacamol/SMILES

### The initial "best" tokenizer

This is based on a single run for each tokenizer.
A single run consists of:

- a training with fixed seed
- a generation with fixed seed
- FCD as the metric

Based on this setup, char_wordpiece_176 came up as the initial "best" tokenizer. However, there are a number of additional "candidate" tokenizers which did not show a statistically significant difference, i.e. we could not reject the null hypothesis. We then ran 5 runs with each of those "candidate" tokenizers to assess their mean performance and standard deviation.


### Statistical tests for the "candidate" tokenizers

This is based on 5 runs for the "candidate" tokenizers.
The 5 runs are:

- a training with random seed
- a generation with fixed seed
- FCD as the metric

At this point we have 5 FCD values for each of the "candidate" tokenizers. We define the final "best" tokenizer as the winner and perform statistical tests again. This results in a list of tokenizers which show no statistically significant difference in performance and remaining tokenizers for which we can reject the null hypothesis.


### Tokenizer Summary

- The initial "best" tokenizer: char_wordpiece_176
- The "candidate" tokenizers are: char_bpe_88, char_bpe_176, char_unigram_88
- The final "best" tokenizer is: char_wordpiece_176

In [11]:
# Gather data from experiments and get basic stats

# Names of all tokenizers evaluated in the GuacaMol experiment.
# NOTE(review): this is a set, while build_tokenizer_results annotates its
# first parameter as Sequence[str]; only len(), `in` and iteration are used there.
GUACAMOL_TOKENIZERS: Final[set[str]] = {
    "char_wordlevel_38",
    "char_bpe_44",
    "char_bpe_88",
    "char_bpe_176",
    "char_wordpiece_88",
    "char_wordpiece_176",
    "char_unigram_44",
    "char_unigram_88",
    "char_unigram_176",
    "atom_wordlevel_50",
    "smarts_wordlevel_106",
}
# Validity per tokenizer: a float where only a single value is available,
# a Sample of five values for the tokenizers that were run repeatedly.
validity: Metric = {
    "char_wordlevel_38": 0.9822,
    "char_bpe_44": 0.9826,
    "char_bpe_88": Sample([0.9768, 0.9781, 0.9763, 0.9746, 0.9763]),
    "char_bpe_176": Sample([0.9769, 0.9715, 0.9753, 0.9753, 0.9754]),
    "char_wordpiece_88": 0.986,
    "char_wordpiece_176": Sample([0.9779, 0.9769, 0.9759, 0.9757, 0.9753]),
    "char_unigram_44": 0.9833,
    "char_unigram_88": Sample([0.9759, 0.9767, 0.9738, 0.9781, 0.9774]),
    "char_unigram_176": 0.9756,
    "atom_wordlevel_50": 0.9805,
    "smarts_wordlevel_106": 0.9796,
}
# Uniqueness per tokenizer: a float where only a single value is available,
# a Sample of five values for the tokenizers that were run repeatedly.
uniqueness: Metric = {
    "char_wordlevel_38": 0.9997963754836082,
    "char_bpe_44": 0.9990840626908203,
    "char_bpe_88": Sample(
        [
            0.9987714987714987,
            0.999386565790819,
            0.9998975724674792,
            0.9995895752103428,
            0.9990781522073133,
        ]
    ),
    "char_bpe_176": Sample(
        [
            0.9992834476404955,
            0.9997941327843541,
            0.9988721419050548,
            0.9989746744591408,
            0.9992823457043264,
        ]
    ),
    "char_wordpiece_88": 0.9989858012170385,
    "char_wordpiece_176": Sample(
        [
            0.9986706207178648,
            0.9992834476404955,
            0.9995901219387232,
            0.9994875474018653,
            0.9993848046754845,
        ]
    ),
    "char_unigram_44": 0.9992881114614055,
    "char_unigram_88": Sample(
        [
            0.9990777743621273,
            0.9993856864953414,
            0.9995892380365579,
            0.999591043860546,
            0.9996930632289748,
        ]
    ),
    "char_unigram_176": 0.9996924969249692,
    "atom_wordlevel_50": 0.9990821009688934,
    "smarts_wordlevel_106": 0.9991833401388321,
}
# Novelty per tokenizer: a float where only a single value is available,
# a Sample of five values for the tokenizers that were run repeatedly.
novelty: Metric = {
    "char_wordlevel_38": 0.9467413441955194,
    "char_bpe_44": 0.9446877864928186,
    "char_bpe_88": Sample(
        [
            0.9407544075440755,
            0.9358567774936062,
            0.9337225978283139,
            0.9397454321494559,
            0.9347959811359442,
        ]
    ),
    "char_bpe_176": Sample(
        [
            0.9302397049784881,
            0.9310202820961598,
            0.9360500923834941,
            0.9343118136097711,
            0.9348517492561814,
        ]
    ),
    "char_wordpiece_88": 0.9361421319796954,
    "char_wordpiece_176": Sample(
        [
            0.9376407945934876,
            0.933620159803319,
            0.9340850845720143,
            0.9366283839212469,
            0.9352621319380322,
        ]
    ),
    "char_unigram_44": 0.9384286586606961,
    "char_unigram_88": Sample(
        [
            0.936,
            0.941399446777994,
            0.939284980480789,
            0.9391428863659609,
            0.9359328625524511,
        ]
    ),
    "char_unigram_176": 0.9366348815749,
    "atom_wordlevel_50": 0.9432421396488363,
    "smarts_wordlevel_106": 0.9406416019615856,
}
# Raw FCD per tokenizer: a float where only a single value is available,
# a Sample of five values for the tokenizers that were run repeatedly.
fcd: Metric = {
    "char_wordlevel_38": 0.22573631123455584,
    "char_bpe_44": 0.23582004914318588,
    "char_bpe_88": Sample(
        [
            0.22660485100020367,
            0.2283699198492286,
            0.21916330554410024,
            0.21487186290418947,
            0.22354561820675656,
        ]
    ),
    "char_bpe_176": Sample(
        [
            0.22267958128769294,
            0.2250078470330834,
            0.21600954762025992,
            0.21789636102157317,
            0.21858163757028137,
        ]
    ),
    "char_wordpiece_88": 0.24258628303761043,
    "char_wordpiece_176": Sample(
        [
            0.21138082989205031,
            0.22424112983600253,
            0.22342369151375863,
            0.21396691344180852,
            0.22290331624026294,
        ]
    ),
    "char_unigram_44": 0.22958462926293066,
    "char_unigram_88": Sample(
        [
            0.22215763836744884,
            0.21625658903371914,
            0.21653028109176375,
            0.22646688628489642,
            0.23060648678460893,
        ]
    ),
    "char_unigram_176": 0.2323379188153183,
    "atom_wordlevel_50": 0.23930838874149174,
    "smarts_wordlevel_106": 0.24132050338971567,
}

# Derive the GuacaMol-style score exp(-0.2 * FCD) from the raw FCD values.
fcd_guacamol = get_fcd_guacamol(fcd)

# Merge the per-metric dicts into one tokenizer -> {metric -> value} mapping.
guacamol_tokenizers = build_tokenizer_results(
    GUACAMOL_TOKENIZERS,
    validity=validity,
    uniqueness=uniqueness,
    novelty=novelty,
    fcd=fcd,
    fcd_guacamol=fcd_guacamol,
)

# Descending FCD order: the tokenizer with the lowest FCD is printed last.
print_basic_stats(guacamol_tokenizers, sort_by="fcd", reverse=True)

Printing basic stats for 11 tokenizers sorted by FCD in DESCENDING order...

*** CHAR_WORDPIECE_88 ***
Metric: validity        Single value: 0.986
Metric: uniqueness      Single value: 0.999
Metric: novelty         Single value: 0.936
Metric: fcd             Single value: 0.243
Metric: fcd_guacamol    Single value: 0.953

*** SMARTS_WORDLEVEL_106 ***
Metric: validity        Single value: 0.980
Metric: uniqueness      Single value: 0.999
Metric: novelty         Single value: 0.941
Metric: fcd             Single value: 0.241
Metric: fcd_guacamol    Single value: 0.953

*** ATOM_WORDLEVEL_50 ***
Metric: validity        Single value: 0.981
Metric: uniqueness      Single value: 0.999
Metric: novelty         Single value: 0.943
Metric: fcd             Single value: 0.239
Metric: fcd_guacamol    Single value: 0.953

*** CHAR_BPE_44 ***
Metric: validity        Single value: 0.983
Metric: uniqueness      Single value: 0.999
Metric: novelty         Single value: 0.945
Metric: fcd             Sin

In [21]:
# For each metric, pick the best tokenizer that has a Sample and test it
# against all tokenizers.
criteria = ["validity", "uniqueness", "novelty", "fcd"]
for criterion in criteria:
    # FCD is the only metric handled as "lower is better": sort descending so
    # the lowest value comes last, and test for "less" instead of "greater".
    reverse = criterion == "fcd"
    alternative = "less" if reverse else "greater"
    best_tokenizer, _ = get_best_tokenizer(
        guacamol_tokenizers,
        sort_by=criterion,
        reverse=reverse,
    )
    print_comparison(
        best_tokenizer,
        guacamol_tokenizers,
        criterion=criterion,
        alternative=alternative,
        reverse=reverse,
    )


*** Perform statistical tests ***
Compare metric VALIDITY with other tokenizers (in ascending order)
Alternative: GREATER
Best tokenizer: char_bpe_88 with metric mean 0.976 and std.dev. 0.001

char_bpe_176         with metric mean 0.975
t-statistic:  1.454, p-value:  0.092, Reject H0 (p<0.05): False

char_unigram_176     with metric mean 0.976
t-statistic:  1.460, p-value:  0.109, Reject H0 (p<0.05): False

char_wordpiece_176   with metric mean 0.976
t-statistic:  0.109, p-value:  0.458, Reject H0 (p<0.05): False

char_unigram_88      with metric mean 0.976
t-statistic:  0.043, p-value:  0.483, Reject H0 (p<0.05): False

char_bpe_88          with metric mean 0.976
t-statistic:  0.000, p-value:  0.500, Reject H0 (p<0.05): False

smarts_wordlevel_106 with metric mean 0.980
t-statistic: -5.662, p-value:  0.998, Reject H0 (p<0.05): False

atom_wordlevel_50    with metric mean 0.981
t-statistic: -7.265, p-value:  0.999, Reject H0 (p<0.05): False

char_wordlevel_38    with metric mean 0.982

### Comparison with GuacaMol

Regarding the FCD, we found char_wordpiece_176 to be the "best" tokenizer (along with other tokenizers for which we cannot reject the null hypothesis). We now compare it with the GuacaMol metrics.

In [22]:
# Select the tokenizer with the lowest mean FCD among those that have a Sample
# (descending sort, so the lowest value is the last entry).
best_tokenizer, value = get_best_tokenizer(
    guacamol_tokenizers, sort_by="fcd", reverse=True
)
# Sanity check: the text of this section names this tokenizer explicitly.
assert (
    best_tokenizer == "char_wordpiece_176"
), "The best tokenizer has changed, check code/numbers!"

guacamol_original = 0.455  # taken from GuacaMol paper

print(
    f"Selected tokenizer {best_tokenizer} has a mean of {value.mean:.3f} and a std.dev. of {value.std:.3f}"
)

# One-sided test: is our mean FCD less than the single published value?
alternative = "less"
print("\nCompare with single FCD value of GuacaMol paper")
print(f"Perform statistical test... (alternative: {alternative})")

# guacamol_original is a float, so t_test runs a one-sample t-test.
t_stat, p_val = t_test(value, guacamol_original, alternative)
print(
    f"t-statistic: {t_stat:.3f}, "
    f"p-value: {p_val:.3f}, "
    # Show the significance level that is actually used for the decision.
    f"Reject H0 (p<{ALPHA}): {p_val < ALPHA}"
)

Selected tokenizer char_wordpiece_176 has a mean of 0.219 and a std.dev. of 0.006

Compare with single FCD value of GuacaMol paper
Perform statistical test... (alternative: less)
t-statistic: -87.432, p-value: 0.000, Reject H0 (p<0.05): True


## USPTO50K/SMARTS

We follow the same procedure as outlined in the GuacaMol section.
- The initial "best" tokenizer: char_wordlevel_47
- The "candidate" tokenizers are: char_bpe_47, char_bpe_88, atom_wordlevel_86, smarts_wordlevel_947
- The final "best" tokenizer is: smarts_wordlevel_947

In [17]:
# Names of all tokenizers evaluated in the USPTO50K experiment.
# NOTE(review): this is a set, while build_tokenizer_results annotates its
# first parameter as Sequence[str]; only len(), `in` and iteration are used there.
USPTO50K_TOKENIZERS: Final[set[str]] = {
    "char_wordlevel_47",
    "char_bpe_47",
    "char_bpe_88",
    "char_bpe_176",
    "char_wordpiece_94",
    "char_wordpiece_176",
    "char_unigram_88",
    "char_unigram_176",
    "atom_wordlevel_86",
    "smarts_wordlevel_947",
}
# Validity per tokenizer for USPTO50K (rebinds the name used for the GuacaMol
# data): a float where only a single value is available, a Sample of five
# values for the tokenizers that were run repeatedly.
validity: Metric = {
    "char_wordlevel_47": Sample(
        [
            0.7564622402432843,
            0.7521546261089987,
            0.7727957059632127,
            0.7647479674796748,
            0.773511075739488,
        ]
    ),
    "char_bpe_47": Sample(
        [
            0.7526333006215243,
            0.7697592253336822,
            0.7536166365280289,
            0.7747238633444644,
            0.7591536804763714,
        ]
    ),
    "char_bpe_88": Sample(
        [
            0.734920634920635,
            0.7273118279569892,
            0.7357945819767799,
            0.7056446890736621,
            0.7728643861112894,
        ]
    ),
    "char_bpe_176": 0.6884913743197869,
    "char_wordpiece_94": 0.731087584215592,
    "char_wordpiece_176": 0.6867148613415501,
    "char_unigram_88": 0.7195553926957371,
    "char_unigram_176": 0.7116800920598388,
    "atom_wordlevel_86": Sample(
        [
            0.7702271567126084,
            0.7819990954319312,
            0.7195848430878831,
            0.7434729617037654,
            0.7591343992808065,
        ]
    ),
    "smarts_wordlevel_947": Sample(
        [
            0.8099435601496607,
            0.7849000186880957,
            0.8018752020691885,
            0.7866085331672377,
            0.838874510584644,
        ]
    ),
}
# Uniqueness per tokenizer for USPTO50K (rebinds the name used for the
# GuacaMol data): float for a single value, Sample for five repeated runs.
uniqueness: Metric = {
    "char_wordlevel_47": Sample(
        [
            0.8380234505862647,
            0.8474176425983655,
            0.8493139081822801,
            0.8511651641435618,
            0.8606704964417389,
        ]
    ),
    "char_bpe_47": Sample(
        [
            0.8739568845618915,
            0.8506587335316617,
            0.8609135315793984,
            0.8315649867374005,
            0.8349465954606141,
        ]
    ),
    "char_bpe_88": Sample(
        [
            0.8315334773218143,
            0.8449776125707528,
            0.8374519953247621,
            0.8317152103559871,
            0.8309250954990866,
        ]
    ),
    "char_bpe_176": 0.8446985621794333,
    "char_wordpiece_94": 0.8800245743373706,
    "char_wordpiece_176": 0.8634912416947105,
    "char_unigram_88": 0.8515532167713461,
    "char_unigram_176": 0.8112216023930795,
    "atom_wordlevel_86": Sample(
        [
            0.830668544445365,
            0.8267371726018342,
            0.855509089357344,
            0.8660051768766178,
            0.8486719675181864,
        ]
    ),
    "smarts_wordlevel_947": Sample(
        [
            0.7864860632633887,
            0.7954761904761904,
            0.8079993548907346,
            0.7938870852799114,
            0.7912348706589668,
        ]
    ),
}
# Feasibility per tokenizer for USPTO50K: float for a single value, Sample
# for five repeated runs.
feasibility: Metric = {
    "char_wordlevel_47": Sample(
        [
            0.10433739756146312,
            0.10866971564923444,
            0.09913234267477809,
            0.10281774580335731,
            0.1026100816895796,
        ]
    ),
    "char_bpe_47": Sample(
        [
            0.09757310523174856,
            0.10331734612310152,
            0.0970535536531953,
            0.10287081339712918,
            0.11023385968418949,
        ]
    ),
    "char_bpe_88": Sample(
        [
            0.1047952047952048,
            0.09878024395120975,
            0.10577210646994317,
            0.10116731517509728,
            0.10893463921647012,
        ]
    ),
    "char_bpe_176": 0.09755126418475014,
    "char_wordpiece_94": 0.08766330906552308,
    "char_wordpiece_176": 0.08843809333466573,
    "char_unigram_88": 0.09608292634306788,
    "char_unigram_176": 0.10025911899541559,
    "atom_wordlevel_86": Sample(
        [
            0.11229679864366211,
            0.11563062162702378,
            0.10085794094173982,
            0.1026203048719737,
            0.10814312767866043,
        ]
    ),
    "smarts_wordlevel_947": Sample(
        [
            0.11428571428571428,
            0.10426020153646613,
            0.10968063872255489,
            0.10941551964891283,
            0.11127774445110977,
        ]
    ),
}
# "known_either" per tokenizer: float for a single value, Sample for five runs.
# NOTE(review): presumably a count of generated items known from either the
# validation or the test set — confirm against the experiment code.
known_either: Metric = {
    "char_wordlevel_47": Sample([728.0, 726.0, 679.0, 699.0, 694.0]),
    "char_bpe_47": Sample([663.0, 686.0, 686.0, 693.0, 764.0]),
    "char_bpe_88": Sample([691.0, 631.0, 694.0, 694.0, 730.0]),
    "char_bpe_176": 618.0,
    "char_wordpiece_94": 587.0,
    "char_wordpiece_176": 557.0,
    "char_unigram_88": 621.0,
    "char_unigram_176": 616.0,
    "atom_wordlevel_86": Sample([775.0, 782.0, 670.0, 680.0, 747.0]),
    "smarts_wordlevel_947": Sample([746.0, 699.0, 771.0, 721.0, 739.0]),
}
# "known_val" per tokenizer: float for a single value, Sample for five runs.
# NOTE(review): presumably a count of generated items known from the
# validation set — confirm against the experiment code.
known_val: Metric = {
    "char_wordlevel_47": Sample([498.0, 499.0, 467.0, 481.0, 501.0]),
    "char_bpe_47": Sample([453.0, 467.0, 473.0, 469.0, 524.0]),
    "char_bpe_88": Sample([456.0, 413.0, 475.0, 479.0, 498.0]),
    "char_bpe_176": 410.0,
    "char_wordpiece_94": 406.0,
    "char_wordpiece_176": 385.0,
    "char_unigram_88": 424.0,
    "char_unigram_176": 416.0,
    "atom_wordlevel_86": Sample([536.0, 525.0, 455.0, 456.0, 508.0]),
    "smarts_wordlevel_947": Sample([500.0, 479.0, 508.0, 478.0, 483.0]),
}
# "known_test" per tokenizer: float for a single value, Sample for five runs.
# NOTE(review): presumably a count of generated items known from the test
# set — confirm against the experiment code.
known_test: Metric = {
    "char_wordlevel_47": Sample([503.0, 493.0, 478.0, 497.0, 486.0]),
    "char_bpe_47": Sample([465.0, 466.0, 461.0, 470.0, 526.0]),
    "char_bpe_88": Sample([490.0, 438.0, 478.0, 481.0, 496.0]),
    "char_bpe_176": 431.0,
    "char_wordpiece_94": 409.0,
    "char_wordpiece_176": 396.0,
    "char_unigram_88": 433.0,
    "char_unigram_176": 395.0,
    "atom_wordlevel_86": Sample([543.0, 527.0, 467.0, 467.0, 515.0]),
    "smarts_wordlevel_947": Sample([519.0, 477.0, 532.0, 500.0, 527.0]),
}

# Merge the per-metric dicts into one tokenizer -> {metric -> value} mapping.
uspto50k_tokenizers = build_tokenizer_results(
    USPTO50K_TOKENIZERS,
    validity=validity,
    uniqueness=uniqueness,
    feasibility=feasibility,
    known_either=known_either,
    known_val=known_val,
    known_test=known_test,
)

# Ascending order: the tokenizer with the highest known_either mean is printed last.
print_basic_stats(uspto50k_tokenizers, sort_by="known_either", reverse=False)

Printing basic stats for 10 tokenizers sorted by KNOWN_EITHER in ASCENDING order...

*** CHAR_WORDPIECE_176 ***
Metric: validity        Single value: 0.687
Metric: uniqueness      Single value: 0.863
Metric: feasibility     Single value: 0.088
Metric: known_either    Single value: 557.000
Metric: known_val       Single value: 385.000
Metric: known_test      Single value: 396.000

*** CHAR_WORDPIECE_94 ***
Metric: validity        Single value: 0.731
Metric: uniqueness      Single value: 0.880
Metric: feasibility     Single value: 0.088
Metric: known_either    Single value: 587.000
Metric: known_val       Single value: 406.000
Metric: known_test      Single value: 409.000

*** CHAR_UNIGRAM_176 ***
Metric: validity        Single value: 0.712
Metric: uniqueness      Single value: 0.811
Metric: feasibility     Single value: 0.100
Metric: known_either    Single value: 616.000
Metric: known_val       Single value: 416.000
Metric: known_test      Single value: 395.000

*** CHAR_BPE_176 ***
Met

In [20]:
# For each metric, pick the best tokenizer that has a Sample and test it
# against all tokenizers.
criteria = [
    "validity",
    "uniqueness",
    "feasibility",
    "known_val",
    "known_test",
    "known_either",
]
# All metrics here are handled as "higher is better": ascending sort puts the
# highest mean last (which get_best_tokenizer selects), and the one-sided
# "greater" alternative is used for the tests.
reverse = False
alternative = "greater"
for criterion in criteria:
    best_tokenizer, _ = get_best_tokenizer(
        uspto50k_tokenizers, sort_by=criterion, reverse=reverse
    )
    print_comparison(
        best_tokenizer,
        uspto50k_tokenizers,
        criterion=criterion,
        alternative=alternative,
        reverse=reverse,
    )


*** Perform statistical tests ***
Compare metric VALIDITY with other tokenizers (in ascending order)
Alternative: GREATER
Best tokenizer: smarts_wordlevel_947 with metric mean 0.804 and std.dev. 0.022

char_wordpiece_176   with metric mean 0.687
t-statistic: 12.008, p-value:  0.000, Reject H0 (p<0.05): True

char_bpe_176         with metric mean 0.688
t-statistic: 11.827, p-value:  0.000, Reject H0 (p<0.05): True

char_unigram_176     with metric mean 0.712
t-statistic:  9.461, p-value:  0.000, Reject H0 (p<0.05): True

char_unigram_88      with metric mean 0.720
t-statistic:  8.658, p-value:  0.000, Reject H0 (p<0.05): True

char_wordpiece_94    with metric mean 0.731
t-statistic:  7.482, p-value:  0.001, Reject H0 (p<0.05): True

char_bpe_88          with metric mean 0.735
t-statistic:  4.727, p-value:  0.001, Reject H0 (p<0.05): True

atom_wordlevel_86    with metric mean 0.755
t-statistic:  3.385, p-value:  0.005, Reject H0 (p<0.05): True

char_bpe_47          with metric mean 0.7

# Archive

The following cells have been used to check my understanding of the statistical tests and the scipy package.

In [None]:
def compare_two_samples(s1: Sample, s2: Sample) -> tuple[float, float]:
    """Run scipy's independent two-sample t-test with its default settings.

    Returns ``(t_statistic, p_value)``.
    """
    outcome = ttest_ind(s1.array, s2.array)
    # Index 0 is the t-statistic, index 1 the p-value.
    return outcome[0], outcome[1]


# Same calculation for both one and two sample t-tests
def _calculate_p_value(t_statistic: float, df: int, alternative: str) -> float:
    alternative = alternative.strip().lower()
    if alternative == "two_sided":
        p_value = (1 - t_statistic.cdf(abs(t_statistic), df)) * 2.0
    elif alternative == "less":
        p_value = t_statistic.cdf(t_statistic, df)
    elif alternative == "greater":
        p_value = 1 - t_statistic.cdf(t_statistic, df)
    else:
        raise ValueError("alternative must be 'two_sided', 'greater' or 'less'")

    return p_value