In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import datasets
from tqdm.notebook import tqdm

import utils

In [2]:
train_ds, valid_ds, test_ds = utils.load_dataset()

Found cached dataset parquet (/home/johnny/.cache/huggingface/datasets/parquet/civil_comments-e92c9358c47debf9/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /home/johnny/.cache/huggingface/datasets/parquet/civil_comments-e92c9358c47debf9/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-26324163aea91a3b.arrow
Loading cached processed dataset at /home/johnny/.cache/huggingface/datasets/parquet/civil_comments-e92c9358c47debf9/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-392dd59de6b13b83.arrow
Loading cached processed dataset at /home/johnny/.cache/huggingface/datasets/parquet/civil_comments-e92c9358c47debf9/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-2bef193312149c10.arrow


In [19]:
score_npy = 'scores/roberta_balanced.npy'
score_column = 'scores'

bins = 8
cut = 'oracle'
allocation = 'optimal'
pilot_size = 50

In [20]:
labels = test_ds['toxicity']
scores = np.load(score_npy)
df = pd.DataFrame(data={'toxicity' : labels, 'toxic' : test_ds['label'], 'scores' : scores})

In [21]:
df.head(1)

Unnamed: 0,toxicity,toxic,scores
0,0.4,0,0.27451


In [22]:
df['toxic'].sum(), df['toxic'].mean()

(100601, 0.059007879761202295)

In [23]:
size = 12192

In [24]:
p = df['toxic'].mean()
random_sampling_var = p * (1 - p)
np.sqrt(random_sampling_var / size)

0.0021340791433234136

### Cutting

In [25]:
def get_error(df):
    sizes_sigmas = []
    for i, group in df.groupby('bin'):
        # to round up to 1
        pilot = group['toxic']
        sizes_sigmas.append((len(group), np.std(pilot)))

    allocations = []
    denominator = np.sum([ n_h * sigma_h for n_h, sigma_h in sizes_sigmas ])
    for n_h, sigma_h in sizes_sigmas:
        n_from_bin = size * n_h * sigma_h / denominator
        allocations.append(n_from_bin)
    
    stratified_var = 0
    for (i, group), n_from_bin in zip(df.groupby('bin'), allocations):

        p = group['toxic'].mean()

        # approximation when the groups are very large
        stratified_var += np.square(len(group) / len(df)) * (group['toxic'].var() / n_from_bin)
    return np.sqrt(stratified_var)

In [26]:
def oracle_bins(df, depth=4):
    minimum_bins = [0, 1]
    minimum_err = 1
    steps = 10

    for level in range(1, depth+1):
        if level == 1:
            indexes = [1]
        else:
            indexes = list(range(1, level*2, 2))

        for idx in indexes:
            bins = minimum_bins.copy()
            bins.insert(idx, 0)
            lb, ub = bins[idx-1], bins[idx+1]

            # print(lb + (ub-lb)/steps, ub)

            for i in np.linspace( lb + (ub-lb)/steps, ub, steps, endpoint=False):
                bins[idx] = i
                df['bin'] = pd.qcut(df[score_column], bins)
                new_err = get_error(df)
                if new_err < minimum_err:
                    minimum_err = new_err
                    minimum_bins = bins.copy()

    return minimum_bins

In [27]:
if cut == 'eqwidth':
    minimum, maximum = df[score_column].min(), df[score_column].max()
    df['bin'] = pd.cut(df[score_column], np.linspace(minimum, maximum, num=bins+1), include_lowest=True)
elif cut == 'quantile':
    df['bin'] = pd.qcut(df[score_column], np.linspace(0, 1, num=bins+1))
elif cut == 'oracle':
    depth = int(np.log(bins) / np.log(2))
    b = oracle_bins(df, depth = depth)
    df['bin'] = pd.qcut(df[score_column], b)

### Allocation

In [28]:
sizes_sigmas = []
for i, group in df.groupby('bin'):
    # to round up to 1
    if allocation == 'pilot':
        pilot = np.array(group['toxic'].sample(pilot_size + 2))
        pilot[-1] = 1
        pilot[-2] = 0
    elif allocation == 'optimal':
        pilot = group['toxic']
    sizes_sigmas.append((len(group), np.std(pilot)))

allocations = []
denominator = np.sum([ n_h * sigma_h for n_h, sigma_h in sizes_sigmas ])
for n_h, sigma_h in sizes_sigmas:
    n_from_bin = size * n_h * sigma_h / denominator
    print(n_h, sigma_h, n_from_bin)
    allocations.append(n_from_bin)

561297 0.01698628729836822 603.5745903740093
459240 0.04348385219212084 1264.1758253849894
241574 0.09110459581787501 1393.253806281419
135885 0.16667297983866722 1433.760068372575
108021 0.27078115469248054 1851.681942246734
88381 0.3994797198727792 2235.0817365140215
110476 0.4876483834326358 3410.4720308262517


In [29]:
minimum = np.min(allocations)
multipliers = [ i / minimum for i in allocations ]
multipliers

[1.0,
 2.0944815198426987,
 2.3083374093301035,
 2.375448024550098,
 3.0678593363238273,
 3.703074602807645,
 5.650456605061735]

In [30]:
stratified_var = 0
for (i, group), n_from_bin in zip(df.groupby('bin'), allocations):
    print(i, len(group), n_from_bin, group['toxic'].mean(), group['toxic'].var())
    
    p = group['toxic'].mean()
    
    # approximation when the groups are very large
    stratified_var += np.square(len(group) / len(df)) * (group['toxic'].var() / n_from_bin)
'stderr: ', np.sqrt(stratified_var)

(-0.000352, 0.00113] 561297 603.5745903740093 0.00028861725610505667 0.00028853447023225554
(0.00113, 0.00245] 459240 1264.1758253849894 0.0018944342827279853 0.001890849518811215
(0.00245, 0.0218] 241574 1393.253806281419 0.008370106054459503 0.008300081737478827
(0.0218, 0.483] 135885 1433.760068372575 0.02859771129999632 0.027780086646514306
(0.483, 0.965] 108021 1851.681942246734 0.07966969385582433 0.0733231125223155
(0.965, 0.99] 88381 2235.0817365140215 0.19930754347653906 0.15958585224754981
(0.99, 0.998] 110476 3410.4720308262517 0.6104493283609109 0.23780309839622008


('stderr: ', 0.001023074103405929)

In [31]:
numerator = 0
for (i, group), multip_h in zip(df.groupby('bin'), multipliers):
    print(i, len(group), multip_h, group['toxic'].mean(), group['toxic'].var())
    
    # approximation when the groups are very large
    numerator += np.square(len(group) / len(df)) * (group['toxic'].var() / multip_h)

(-0.000352, 0.00113] 561297 1.0 0.00028861725610505667 0.00028853447023225554
(0.00113, 0.00245] 459240 2.0944815198426987 0.0018944342827279853 0.001890849518811215
(0.00245, 0.0218] 241574 2.3083374093301035 0.008370106054459503 0.008300081737478827
(0.0218, 0.483] 135885 2.375448024550098 0.02859771129999632 0.027780086646514306
(0.483, 0.965] 108021 3.0678593363238273 0.07966969385582433 0.0733231125223155
(0.965, 0.99] 88381 3.703074602807645 0.19930754347653906 0.15958585224754981
(0.99, 0.998] 110476 5.650456605061735 0.6104493283609109 0.23780309839622008


In [32]:
p = 0.05897253769515213
print(p, end=' ')

for within in [0.2, 0.1, 0.05]:
    desired_ci = p * within

    alpha = 0.05
    z_statistic = stats.norm.ppf(1 - (alpha / 2))
    desired_var = np.square(desired_ci / z_statistic)

    minimum = numerator / desired_var
    n = np.sum([minimum * multip for multip in multipliers])
    print(int(n+1), end=' ')

0.05897253769515213 353 1410 5639 

In [17]:
cached_groupby = list(df.groupby('bin'))

In [18]:
p = 0.05897253769515213
print(p, end=' ')

for within in [0.2, 0.1, 0.05]:
    ns = []
    for i in tqdm(range(0, 1000)):
        sizes_sigmas = []
        for i, group in cached_groupby:
            # to round up to 1
            pilot = np.array(group['toxic'].sample(pilot_size+1))
            pilot[-1] = 1

            sizes_sigmas.append((len(group), np.std(pilot)))

        allocations = []
        denominator = np.sum([ n_h * sigma_h for n_h, sigma_h in sizes_sigmas ])
        for n_h, sigma_h in sizes_sigmas:
            proportion = n_h * sigma_h / denominator
            allocations.append(proportion)
            
        
        numerator = 0
        for (i, group), multip_h in zip(cached_groupby, allocations):
            # approximation when the groups are very large
            numerator += np.square(len(group) / len(df)) * (group['toxic'].var() / multip_h)

        desired_ci = p * within

        alpha = 0.05
        z_statistic = stats.norm.ppf(1 - (alpha / 2))
        desired_var = np.square(desired_ci / z_statistic)

        minimum = numerator / desired_var
        l = [minimum * multip for multip in allocations]
        l = [ np.max([pilot_size, n]) for n in l ]
        n = np.sum(l)
        ns.append(n)
    print(int(np.mean(ns)+1), np.std(ns), end=' ')

0.05897253769515213 

  0%|          | 0/1000 [00:00<?, ?it/s]

568 16.358317385958472 

  0%|          | 0/1000 [00:00<?, ?it/s]

2214 84.23058874415905 

  0%|          | 0/1000 [00:00<?, ?it/s]

8847 342.87277060978835 