In this notebook, we consider a case where we have two classifiers with two different underlying accuracies. We assume this accuracy applies to each example independently, implying that whether the classifier correctly classifies a given example is sampled from a Bernoulli distribution with the mean set to the classifier's accuracy. We then want to know whether these two classifiers truly have different accuracies given the sampled (observed) accuracies.

There are two knobs of interest in this case. First, we can test the effect of varying the size of the validation set, `n_samples`. If the validation (test) set is too small, the variance of the accuracy difference is too large, and we cannot draw any sensible statistical conclusion. This corresponds to the notion of 'statistical power'. Second, we can test the effect of having multiple instantiations of the classifier, imitating the case of stochastic learning, by altering `acc_std`. If `acc_std` is too large (we assume it to be shared between the classifiers), we cannot draw a concrete conclusion unless the true accuracy difference is large.


In [1]:
import numpy as np
import torch
import pyro
import scipy.stats as stats

In [2]:
from IPython.core.debugger import set_trace

In [3]:
# let's import some plotting libraries for drawing pretty plots.
import matplotlib.pyplot as plt
import seaborn as sns

import ipywidgets as widgets
from ipywidgets import interact, interact_manual

In [14]:
# draw a batch of Bernoulli samples whose mean is the given accuracy.
# the actual sampling is delegated to pyro.
def draw_samples(mean, num_samples):
    """Sample correct/incorrect outcomes for `num_samples` examples.

    Args:
        mean: success probability of the Bernoulli distribution
            (the classifier's accuracy).
        num_samples: number of independent draws to make.

    Returns:
        The value returned by `pyro.sample` for a Bernoulli distribution
        expanded to batch shape `[num_samples, 1]`.
    """
    bernoulli = pyro.distributions.Bernoulli(mean)
    batched = bernoulli.expand([num_samples, 1])
    return pyro.sample("samples", batched)

In [31]:
def compute_acc_diff(n_samples, n_repeats, acc1, acc2, acc_std=0.):
    """Simulate the observed accuracy difference between two classifiers.

    For each repeat, each classifier's accuracy is perturbed by independent
    Gaussian noise scaled by `acc_std`, clipped to [0, 1], and then used to
    draw `n_samples` Bernoulli outcomes whose mean is the observed accuracy.

    Args:
        n_samples: number of examples drawn per classifier per repeat.
        n_repeats: number of simulated differences to produce.
        acc1: underlying accuracy of the first classifier.
        acc2: underlying accuracy of the second classifier.
        acc_std: standard deviation of the noise added to each accuracy.

    Returns:
        A list of `n_repeats` tensors, each the first classifier's observed
        accuracy minus the second's.
    """
    def _observed_accuracy(true_acc):
        # perturb the underlying accuracy, then keep it a valid probability.
        jitter = torch.randn(1).item() * acc_std
        perturbed = torch.clip(torch.tensor(true_acc + jitter), 0, 1)
        return torch.mean(draw_samples(perturbed, n_samples))

    results = []
    for _ in range(n_repeats):
        # the first classifier is simulated before the second so that the
        # random draws are consumed in a fixed order.
        first = _observed_accuracy(acc1)
        second = _observed_accuracy(acc2)
        results.append(first - second)

    return results

In [32]:
# torch.clip requires a Tensor as its first argument; a bare Python float is
# rejected with the TypeError shown below. Wrap the value with
# torch.tensor(...) first, as compute_acc_diff does.
torch.clip(1.1, 0., 1.)

TypeError: clip() received an invalid combination of arguments - got (float, float, float), but expected one of:
 * (Tensor input, Tensor min = None, Tensor max = None, *, Tensor out = None)
 * (Tensor input, Number min = None, Number max = None, *, Tensor out = None)


In [None]:
# plot two overlaid histograms of simulated accuracy differences:
#   - blue: both sets are drawn with the same accuracy (acc1 vs. acc1),
#           always using 5,000 repeats;
#   - red:  the sets are drawn with acc1 and acc2, using `n_repeats` repeats.
# the validation-set size, the two accuracies and the accuracy noise are inputs.
def plot_acc_diffs(n_samples, n_repeats, acc1, acc2, acc_std):
    """Draw histograms of accuracy differences for equal and unequal accuracies.

    Args:
        n_samples: number of examples drawn per classifier per repeat.
        n_repeats: number of repeats for the acc1-vs-acc2 histogram.
        acc1: underlying accuracy of the first classifier.
        acc2: underlying accuracy of the second classifier.
        acc_std: standard deviation of the noise added to each accuracy.
    """
    null_diffs = compute_acc_diff(n_samples, 5_000, acc1, acc1, acc_std)
    alt_diffs = compute_acc_diff(n_samples, n_repeats, acc1, acc2, acc_std)

    _, ax = plt.subplots(figsize=(5, 3))

    # plot densities rather than raw counts: the two histograms are built
    # from different numbers of repeats.
    series = (
        (null_diffs, 'blue', 'Same Accuracy'),
        (alt_diffs, 'red', 'Different Accuracies'),
    )
    for diffs, color, label in series:
        values = [diff.item() for diff in diffs]
        sns.histplot(values, ax=ax, color=color, label=label, stat='density')

    ax.set_xlabel('Accuracy Difference')
    ax.set_ylabel('Frequency')
    ax.set_title('Accuracy Difference vs Frequency')
    ax.legend()
    plt.show()

# now wire the plot up to interactive controls.
# we can vary the validation-set size, the number of repeats, the two
# accuracies, and the standard deviation of the accuracy noise.
slider_specs = dict(
    n_samples=widgets.IntSlider(min=10, max=1000, step=1, value=100),
    n_repeats=widgets.IntSlider(min=10, max=1000, step=10, value=100),
    acc1=widgets.FloatSlider(min=0, max=1, step=0.01, value=0.8),
    acc2=widgets.FloatSlider(min=0, max=1, step=0.01, value=0.9),
    acc_std=widgets.FloatSlider(min=0., max=1., step=0.0001, value=0.001),
)
interact_manual(plot_acc_diffs, **slider_specs)

interactive(children=(IntSlider(value=100, description='n_samples', max=1000, min=10), IntSlider(value=100, de…

<function __main__.plot_acc_diffs(n_samples, n_repeats, acc1, acc2, acc_std)>