# Setup

Prepare the statistical tests

In [114]:
from dataclasses import dataclass
from typing import Sequence

import numpy as np
from scipy.stats import shapiro, t, ttest_1samp, ttest_ind

In [115]:
# Alternative hypotheses iterated over in the usage examples and passed on to
# the t-test helpers.
ALTERNATIVES = ("two-sided", "less", "greater")
# Default significance level for the normality check and for interpreting p-values.
ALPHA = 0.05

In [116]:
@dataclass
class Sample:
    """A set of observations together with its basic descriptive statistics.

    Only ``values`` (and optionally ``ddof``) need to be supplied. ``array``,
    ``mean``, ``std`` and ``variance`` are always recomputed from ``values`` in
    ``__post_init__``, so anything passed for them is overwritten.
    """

    values: Sequence[float]  # raw observations as given by the caller
    array: np.ndarray = None  # ``values`` converted to a NumPy array
    ddof: int = 1  # delta degrees of freedom used for std and variance
    mean: float = None
    std: float = None
    variance: float = None

    def __post_init__(self):
        self.array = np.array(self.values)
        self.mean, self.std, self.variance = self.get_basic_stats()
        # Earlier versions stored the variance under ``var`` only (leaving the
        # declared ``variance`` field at None); keep ``var`` as an alias.
        self.var = self.variance

    def get_basic_stats(self) -> tuple[float, float, float]:
        """Return ``(mean, std, variance)`` of ``array`` using ``ddof``."""
        mean = np.mean(self.array)
        std = np.std(self.array, ddof=self.ddof)
        var = np.var(self.array, ddof=self.ddof)
        return mean, std, var

    def __len__(self):
        """Return the number of observations."""
        return len(self.array)

    def __repr__(self):
        return f"Sample({self.values})"

    def __str__(self):
        return f"{self.array}"

In [117]:
def compare_two_samples(s1: Sample, s2: Sample) -> tuple[float, float]:
    """Cross-check helper: call SciPy's independent two-sample t-test directly.

    Exists only to sanity-check ``two_sample_t_test``; it runs ``ttest_ind``
    with its default settings on the two arrays.

    Returns:
        The ``(t_statistic, p_value)`` pair reported by ``ttest_ind``.
    """
    outcome = ttest_ind(s1.array, s2.array)
    return outcome.statistic, outcome.pvalue

In [118]:
# Same calculation for both one and two sample t-tests
def _calculate_p_value(t_statistic: float, df: int, alternative: str) -> float:
    alternative = alternative.strip().lower()
    if alternative == "two_sided":
        p_value = (1 - t.cdf(abs(t_statistic), df)) * 2.0
    elif alternative == "less":
        p_value = t.cdf(t_statistic, df)
    elif alternative == "greater":
        p_value = 1 - t.cdf(t_statistic, df)
    else:
        raise ValueError("alternative must be 'two_sided', 'greater' or 'smaller'")

    return p_value

In [119]:
def is_normally_distributed(sample: Sample, alpha: float = ALPHA):
    """Check a sample for normality with the Shapiro-Wilk test.

    Returns a truthy value when the Shapiro-Wilk p-value exceeds ``alpha``,
    i.e. when normality is not rejected at that significance level.
    """
    statistic_and_p = shapiro(sample.array)
    return statistic_and_p[1] > alpha

In [120]:
def one_sample_t_test(
    sample: Sample, population_mean: float, alternative: str = "two-sided"
) -> tuple[float, float]:
    """Test the mean of ``sample`` against ``population_mean``.

    Runs SciPy's ``ttest_1samp``. A warning is printed when the Shapiro-Wilk
    check rejects normality of the sample, but the t-test is performed
    regardless.

    Args:
        sample: Observations to test.
        population_mean: Hypothesised mean under H0.
        alternative: ``"two-sided"``, ``"less"`` or ``"greater"``; the legacy
            spelling ``"two_sided"`` is accepted as well.

    Returns:
        ``(t_statistic, p_value)`` as reported by ``ttest_1samp``.
    """
    # SciPy only accepts the hyphenated spelling; the former default
    # "two_sided" was rejected with a ValueError.
    alternative = alternative.strip().lower().replace("_", "-")

    if not is_normally_distributed(sample):
        print("Warning: sample is not normally distributed!")
        # NOTE: no fallback is applied yet; a Wilcoxon signed-rank test on
        # ``sample.array - population_mean`` was considered as a replacement.

    test_statistic, p_value = ttest_1samp(
        sample.array, population_mean, alternative=alternative
    )
    return test_statistic, p_value

In [121]:
def two_sample_t_test(
    sample: Sample, baseline: Sample, alternative: str = "two-sided"
) -> tuple[float, float]:
    """Compare the means of two independent samples with SciPy's ``ttest_ind``.

    Student's t-test is used unless one standard deviation is more than twice
    the other, in which case Welch's t-test (``equal_var=False``) is used and
    a notice is printed. A warning is printed for each sample whose normality
    is rejected by the Shapiro-Wilk check; the t-test is performed regardless.

    Args:
        sample: Observations of the candidate.
        baseline: Observations to compare against.
        alternative: ``"two-sided"``, ``"less"`` or ``"greater"``. The legacy
            spellings ``"two_sided"`` and ``"different"`` are mapped to
            ``"two-sided"``.

    Returns:
        ``(t_statistic, p_value)`` as reported by ``ttest_ind``.
    """
    # SciPy only accepts the hyphenated names; the former default "different"
    # was rejected with a ValueError.
    alternative = alternative.strip().lower().replace("_", "-")
    if alternative == "different":
        alternative = "two-sided"

    # Rule of thumb: treat the variances as equal unless the standard
    # deviations differ by more than a factor of two.
    equal_var = not (
        sample.std > 2.0 * baseline.std or baseline.std > 2.0 * sample.std
    )
    if not equal_var:
        print("Warning: standard deviations differ by more than a factor of 2")
        print("Therefore, we do a Welch's t-test instead of a Student's t-test!")

    # NOTE: non-normal data only triggers a warning; no rank-based fallback
    # test is applied yet.
    for label, data in (("sample", sample), ("baseline", baseline)):
        if not is_normally_distributed(data):
            print(f"Warning: {label} is not normally distributed!")

    test_statistic, p_value = ttest_ind(
        sample.array, baseline.array, equal_var=equal_var, alternative=alternative
    )
    return test_statistic, p_value

In [122]:
def interpret_p_value(
    p_value: float,
    alpha: float = ALPHA,
) -> str:
    """Turn a p-value into a verdict about the null hypothesis.

    Returns ``"Reject H0"`` when ``p_value`` is strictly below ``alpha`` and
    ``"Cannot reject H0"`` otherwise.
    """
    return "Reject H0" if p_value < alpha else "Cannot reject H0"

# Example Usage

In [123]:
# Demonstrate the normality check on a small hand-written sample.
print("Shapiro-Wilk test example")
sample = Sample([2.5, 3.1, 2.8, 3.4, 2.9, 3.0, 3.3, 2.6, 3.2, 3.1])
result = is_normally_distributed(sample)
print(f"Sample is normally distributed: {result}")

Shapiro-Wilk test example
Sample is normally distributed: True


In [124]:
# Run the one-sample test once per alternative hypothesis.
print("One sample t-test example")
sample = Sample([10.0, 11.0, 12.0, 13.0, 14.0])
population_mean = 14.0
print(f"Sample: {sample}")
print(f"Population mean: {population_mean}")
for alternative in ALTERNATIVES:
    t_stat, p_val = one_sample_t_test(sample, population_mean, alternative)
    # Use the shared ALPHA constant rather than a hard-coded 0.05 so the label
    # and the decision cannot drift apart.
    print(
        f"Alternative: {alternative:9s}, "
        f"t-statistic: {t_stat:.3f}, "
        f"p-value: {p_val:.3f}, "
        f"Reject H0 (p<{ALPHA}): {p_val < ALPHA}"
    )

# Run the two-sample test once per alternative hypothesis.
print("\nTwo sample t-test example")
sample = Sample(np.array([10.0, 11.0, 12.0]))
baseline = Sample(np.array([12.0, 13.0, 14.0]))
print(f"Sample: {sample}")
print(f"Baseline: {baseline}")
for alternative in ALTERNATIVES:
    t_stat, p_val = two_sample_t_test(sample, baseline, alternative)
    print(
        f"Alternative: {alternative:9s}, "
        f"t-statistic: {t_stat:.3f}, "
        f"p-value: {p_val:.3f}, "
        f"Reject H0 (p<{ALPHA}): {p_val < ALPHA}"
    )

One sample t-test example
Sample: [10. 11. 12. 13. 14.])
Population mean: 14.0
Alternative: two-sided, t-statistic: -2.828, p-value: 0.047, Reject H0 (p<0.05): True
Alternative: less     , t-statistic: -2.828, p-value: 0.024, Reject H0 (p<0.05): True
Alternative: greater  , t-statistic: -2.828, p-value: 0.976, Reject H0 (p<0.05): False

Two sample t-test example
Sample: [10. 11. 12.])
Baseline: [12. 13. 14.])
Alternative: two-sided, t-statistic: -2.449, p-value: 0.070, Reject H0 (p<0.05): False
Alternative: less     , t-statistic: -2.449, p-value: 0.035, Reject H0 (p<0.05): True
Alternative: greater  , t-statistic: -2.449, p-value: 0.965, Reject H0 (p<0.05): False


# Experiments

## Tokenizers

### Guacamol/SMILES

#### The initial "best" tokenizer

This is based on a single run for each tokenizer.
A single run consists of:

- a training with fixed seed
- a generation with fixed seed
- FCD as the metric 


#### Statistical tests for the "best" tokenizer

This is based on 5 runs for the "best" tokenizer.
The 5 runs are:

- a training with random seed
- a generation with fixed seed
- FCD as the metric

At this point we have 5 FCD values for the "best" tokenizer.
We then do a one-sample t-test and compare the sample with the single-run FCD value of the "next best" tokenizer. If the difference is statistically significant, the current "best" tokenizer can be considered "done" and we declare it the "winner". If not, we repeat the process with the "next best" tokenizer and perform a two-sample t-test. We repeat this process until we have a winner, or we conclude that we have a couple of tokenizers which seem to perform equally well.

### USPTO50K/SMARTS

#### The initial "best" tokenizer

#### Statistical tests for the "best" tokenizer

In [127]:
# FCD values collected for the atom-SMARTS tokenizer (presumably one value per
# training run with a random seed — confirm).
atom_smarts_947_fcd = Sample(
    [
        711.0,
        712.0,
        708.0,
        738,
        688.0,
    ]
)
# TODO Check this number, not correct!
char_unigram_88_fcd = 690

# One-sided test: H1 is that the mean of atom_smarts_947_fcd is greater than
# char_unigram_88_fcd.
t_stat, p_val = one_sample_t_test(
    atom_smarts_947_fcd, char_unigram_88_fcd, alternative="greater"
)
print(f"t-statistic: {t_stat:.3f}, p-value: {p_val:.3f}")
print(interpret_p_value(p_val))

t-statistic: 2.688, p-value: 0.027
Reject H0
