In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import datasets
from tqdm.notebook import tqdm

import utils

In [2]:
# Load the three dataset splits through the project-local `utils` helper.
# Only `test_ds` is used by the cells visible below; the log output that follows
# shows the underlying data is the cached HuggingFace `civil_comments` dataset.
train_ds, valid_ds, test_ds = utils.load_dataset()

Found cached dataset civil_comments (/home/johnny/.cache/huggingface/datasets/civil_comments/default/0.9.0/e7a3aacd2ab7d135fa958e7209d10b1fa03807d44c486e3c34897aa08ea8ffab)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /home/johnny/.cache/huggingface/datasets/civil_comments/default/0.9.0/e7a3aacd2ab7d135fa958e7209d10b1fa03807d44c486e3c34897aa08ea8ffab/cache-01f767df1b53d154.arrow
Loading cached processed dataset at /home/johnny/.cache/huggingface/datasets/civil_comments/default/0.9.0/e7a3aacd2ab7d135fa958e7209d10b1fa03807d44c486e3c34897aa08ea8ffab/cache-52cd85a4b8b2e0c8.arrow
Loading cached processed dataset at /home/johnny/.cache/huggingface/datasets/civil_comments/default/0.9.0/e7a3aacd2ab7d135fa958e7209d10b1fa03807d44c486e3c34897aa08ea8ffab/cache-9e6abfbab2489ee3.arrow


In [3]:
# Keyword baseline: a comment is flagged (label 1) when any keyword occurs in
# its text. The check is Python's `in` operator, i.e. a case-sensitive
# containment test (substring match, assuming each text is a `str`).
keyword_list = [
    "stupid", "idiot", "idiots", "stupidity", "pathetic",
    "crap", "ignorant", "dumb", "moron", "fools",
]

keyword_labels = [
    int(any(keyword in text for keyword in keyword_list))
    for text in test_ds['text']
]

In [4]:
# Convert the Python list of 0/1 flags to a NumPy array so it can be used for
# arithmetic and boolean masking in later cells; the bare name displays it.
keyword_labels = np.array(keyword_labels)
keyword_labels

array([0, 0, 1, ..., 0, 0, 0])

In [5]:
# --- Experiment configuration -------------------------------------------------
# Path of the .npy file with one model score per test example (read with np.load).
score_npy = 'scores/ngram.npy'
# Name of the DataFrame column whose values are binned into strata.
score_column = 'scores'
# Number of strata; the binning cell builds `bins + 1` edges from it.
bins = 8
# Binning strategy: 'eqwidth' (pd.cut), 'quantile' (pd.qcut) or 'oracle' (oracle_bins).
cut = 'quantile'
# How per-bin standard deviations are obtained for the allocation:
# 'pilot' (estimated from a small random sample) or 'optimal' (from the full bin).
allocation = 'pilot'
# Number of rows drawn per bin for the pilot sample (the pilot cells draw a few
# extra rows on top of this and overwrite them with fixed labels).
pilot_size = 50

In [6]:
# Boolean mask of the examples the keyword filter did NOT flag (label 0).
unmoderated = ~keyword_labels.astype(bool)

scores = np.load(score_npy)

# Restrict the ground-truth columns and the model scores to the unflagged rows.
toxic, toxicity = (
    np.array(test_ds[column])[unmoderated] for column in ('label', 'toxicity')
)
subset_scores = scores[unmoderated]

# Sanity check: the three arrays must have the same length.
print(len(toxic), len(toxicity), len(subset_scores))

1647864 1647864 1647864


In [7]:
# One row per unflagged test example: the raw toxicity value, the `toxic` label
# and the model score. The score column is keyed by `score_column` (rather than
# a hard-coded 'scores') because the binning cells read `df[score_column]`;
# keeping both in sync lets the config value be changed without a KeyError.
df = pd.DataFrame(data={'toxicity': toxicity, 'toxic': toxic, score_column: subset_scores})
df.head(1)

Unnamed: 0,toxicity,toxic,scores
0,0.4,0,0.036676


In [8]:
# Sum and mean of the `toxic` column; with 0/1 labels (as the output suggests)
# these are the count and the prevalence of toxic rows in the unflagged subset.
df['toxic'].sum(), df['toxic'].mean()

(67585, 0.041013700159721916)

In [9]:
# Total sampling budget: the number of examples to be drawn overall. It is the
# `n` of the simple-random-sampling standard error and the amount that the
# allocation code distributes across the bins.
size = 12192

In [10]:
# Baseline: standard error of the estimated mean of `toxic` under simple random
# sampling of `size` rows, sqrt(p * (1 - p) / size), using the p * (1 - p)
# variance of a 0/1 variable.
p = df['toxic'].mean()
random_sampling_var = p * (1 - p)
np.sqrt(random_sampling_var / size)

0.0017961111150782238

### Cutting

In [11]:
def get_error(df, sample_size=None):
    """Standard error of the stratified mean of ``toxic`` under optimal allocation.

    The rows of ``df`` are grouped by the ``bin`` column. The sampling budget is
    split across the bins proportionally to ``N_h * sigma_h`` (bin size times
    the bin's standard deviation of ``toxic``), and the resulting variance
    ``sum_h (N_h / N)^2 * s_h^2 / n_h`` is accumulated. This is the large-group
    approximation: no finite-population correction is applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``bin`` (stratum key) and ``toxic``.
    sample_size : number, optional
        Total number of samples to allocate. Defaults to the notebook-level
        ``size`` variable, which is what the original one-argument call used.

    Returns
    -------
    float
        Square root of the stratified variance. Bins that receive no
        allocation (empty bins, or bins whose ``toxic`` values are all
        identical so ``sigma_h == 0``) contribute zero instead of turning the
        whole result into NaN through a ``0 / 0`` division.
    """
    if sample_size is None:
        sample_size = size  # fall back to the notebook-level budget

    # Single pass over the bins: size, std used for the allocation (np.std,
    # ddof=0) and variance used in the error term (Series.var, ddof=1).
    strata = []
    for _, group in df.groupby('bin'):
        labels = group['toxic']
        strata.append((len(group), np.std(labels), labels.var()))

    denominator = np.sum([n_h * sigma_h for n_h, sigma_h, _ in strata])
    total_rows = len(df)

    stratified_var = 0.0
    for n_h, sigma_h, var_h in strata:
        # `not (x > 0)` is also true for NaN (e.g. an empty bin), so those are
        # skipped together with the zero-variance bins.
        if not (n_h * sigma_h > 0):
            continue
        n_from_bin = sample_size * n_h * sigma_h / denominator
        # approximation when the groups are very large
        stratified_var += np.square(n_h / total_rows) * (var_h / n_from_bin)
    return np.sqrt(stratified_var)

In [12]:
def oracle_bins(df, depth=4):
    """Greedy search for quantile cut points that minimise ``get_error``.

    Starting from the single interval ``[0, 1]`` (in quantile space), each of
    the ``depth`` passes tries to split every interval that exists at the start
    of the pass. For an interval ``(lb, ub)`` it evaluates ``steps`` candidate
    cut points, starting at ``lb + (ub - lb) / steps`` and ending before ``ub``,
    and keeps the candidate with the lowest error, provided it improves on the
    best error seen so far. When every split is accepted the number of bins
    doubles per pass (2, 4, 8, ...), so ``depth = log2(bins)`` yields ``bins``
    strata.

    Compared with the previous fixed index schedule ``range(1, level * 2, 2)``,
    this version:
    * splits all ``2 ** (level - 1)`` intervals at level ``level`` (the old
      schedule stopped at 3 splits on level 3, producing 7 bins instead of 8);
    * keeps working when a split is not accepted (the old schedule then
      addressed positions past the end of the edge list and raised IndexError).
    For ``depth <= 2`` with all splits accepted the visited positions are
    unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the ``toxic`` column and the column named by the notebook-level
        ``score_column``. Side effect: ``df['bin']`` is overwritten with each
        candidate binning while searching.
    depth : int
        Number of splitting passes.

    Returns
    -------
    list
        Increasing quantile edges, starting at 0 and ending at 1, suitable for
        ``pd.qcut``.
    """
    minimum_bins = [0, 1]
    minimum_err = 1
    steps = 10

    for _level in range(depth):
        intervals_to_split = len(minimum_bins) - 1
        idx = 1  # position at which a new edge would be inserted
        for _interval in range(intervals_to_split):
            base = minimum_bins.copy()
            lb, ub = base[idx - 1], base[idx]
            accepted = False

            for candidate in np.linspace(lb + (ub - lb) / steps, ub, steps, endpoint=False):
                bins = base[:idx] + [candidate] + base[idx:]
                df['bin'] = pd.qcut(df[score_column], bins)
                new_err = get_error(df)
                if new_err < minimum_err:
                    minimum_err = new_err
                    minimum_bins = bins
                    accepted = True

            # An accepted split inserted one edge, so the next untouched
            # interval starts two positions further; otherwise just one.
            idx += 2 if accepted else 1

    return minimum_bins

In [13]:
# Assign every row to a stratum (`bin` column) according to the `cut` strategy.
if cut == 'eqwidth':
    # `bins` equal-width intervals spanning the observed score range.
    minimum, maximum = df[score_column].min(), df[score_column].max()
    df['bin'] = pd.cut(df[score_column], np.linspace(minimum, maximum, num=bins+1), include_lowest=True)
elif cut == 'quantile':
    # `bins` intervals delimited by evenly spaced quantiles of the scores.
    df['bin'] = pd.qcut(df[score_column], np.linspace(0, 1, num=bins+1))
elif cut == 'oracle':
    # Each search pass can double the number of bins, hence depth = floor(log2(bins)).
    # np.log2 is exact for powers of two, unlike a ratio of natural logarithms.
    depth = int(np.log2(bins))
    b = oracle_bins(df, depth = depth)
    df['bin'] = pd.qcut(df[score_column], b)
else:
    # Fail loudly: otherwise `df['bin']` would be missing or left over from a
    # previous run, and every later cell would silently use the wrong strata.
    raise ValueError(f"unknown cut strategy: {cut!r} (expected 'eqwidth', 'quantile' or 'oracle')")

### Allocation

In [14]:
# Collect (bin size, std of `toxic`) for every bin; the std comes either from a
# small pilot sample or from the whole bin, depending on `allocation`.
sizes_sigmas = []
for i, group in df.groupby('bin'):
    if allocation == 'pilot':
        # Draw two extra rows and overwrite them with one positive and one
        # negative label, so the pilot std can never be exactly zero (a zero
        # std would give the bin no samples at all).
        pilot = np.array(group['toxic'].sample(pilot_size + 2))
        pilot[-1] = 1
        pilot[-2] = 0
    elif allocation == 'optimal':
        pilot = group['toxic']
    else:
        # Without this branch `pilot` would be undefined or stale.
        raise ValueError(f"unknown allocation strategy: {allocation!r} (expected 'pilot' or 'optimal')")
    sizes_sigmas.append((len(group), np.std(pilot)))

# Split the budget `size` across bins proportionally to N_h * sigma_h.
allocations = []
denominator = np.sum([ n_h * sigma_h for n_h, sigma_h in sizes_sigmas ])
for n_h, sigma_h in sizes_sigmas:
    n_from_bin = size * n_h * sigma_h / denominator
    print(n_h, sigma_h, n_from_bin)
    allocations.append(n_from_bin)

205983 0.1373351620873625 1112.864117286445
205983 0.1373351620873625 1112.864117286445
205983 0.1373351620873625 1112.864117286445
205983 0.1373351620873625 1112.864117286445
205983 0.1923076923076923 1558.32145966562
205983 0.1923076923076923 1558.32145966562
205983 0.1923076923076923 1558.32145966562
205983 0.3783137610174231 3065.579151857359


In [15]:
# Express each bin's allocation relative to the smallest one, so the smallest
# bin has multiplier 1.0.
minimum = np.asarray(allocations).min()
multipliers = list(np.asarray(allocations) / minimum)
multipliers

[1.0,
 1.0,
 1.0,
 1.0,
 1.4002800840280099,
 1.4002800840280099,
 1.4002800840280099,
 2.7546751703454344]

In [16]:
# Standard error of the stratified estimate for the allocation computed above:
# sum over bins of (N_h / N)^2 * s_h^2 / n_h.
stratified_var = 0
for (i, group), n_from_bin in zip(df.groupby('bin'), allocations):
    print(i, len(group), n_from_bin, group['toxic'].mean(), group['toxic'].var())

    # NOTE: the former `p = group['toxic'].mean()` here was unused and
    # overwrote the notebook-level `p` (overall prevalence) with the last
    # bin's mean, so it has been removed.

    # A bin that received no samples (zero std) contributes nothing; skipping
    # it avoids a 0 / 0 that would turn the total into NaN.
    if n_from_bin > 0:
        # approximation when the groups are very large
        stratified_var += np.square(len(group) / len(df)) * (group['toxic'].var() / n_from_bin)
'stderr: ', np.sqrt(stratified_var)

(5.999999999999994e-05, 0.011] 205983 1112.864117286445 0.0011165970007233606 0.001115355626651028
(0.011, 0.0153] 205983 1112.864117286445 0.0025244801755484676 0.0025181194002706925
(0.0153, 0.0196] 205983 1112.864117286445 0.0041168445939713475 0.004099916088727541
(0.0196, 0.0246] 205983 1112.864117286445 0.006553938917289291 0.006511016411429947
(0.0246, 0.0315] 205983 1558.32145966562 0.010529995193778128 0.010419164977604187
(0.0315, 0.0421] 205983 1558.32145966562 0.017690780307112724 0.017377900965017536
(0.0421, 0.0698] 205983 1558.32145966562 0.04062471174805688 0.038974533755975586
(0.0698, 1.0] 205983 3065.579151857359 0.24495225334129517 0.1849515448215782


('stderr: ', 0.0013461743583270652)

In [17]:
# Variance numerator in "multiplier" units: sum of (N_h / N)^2 * s_h^2 / m_h,
# where m_h is the bin's allocation relative to the smallest bin. Dividing this
# by a target variance gives the sample count needed in the smallest bin.
numerator = 0
for (interval, stratum), multip_h in zip(df.groupby('bin'), multipliers):
    labels = stratum['toxic']
    stratum_var = labels.var()
    print(interval, len(stratum), multip_h, labels.mean(), stratum_var)

    # approximation when the groups are very large
    numerator += np.square(len(stratum) / len(df)) * (stratum_var / multip_h)

(5.999999999999994e-05, 0.011] 205983 1.0 0.0011165970007233606 0.001115355626651028
(0.011, 0.0153] 205983 1.0 0.0025244801755484676 0.0025181194002706925
(0.0153, 0.0196] 205983 1.0 0.0041168445939713475 0.004099916088727541
(0.0196, 0.0246] 205983 1.0 0.006553938917289291 0.006511016411429947
(0.0246, 0.0315] 205983 1.4002800840280099 0.010529995193778128 0.010419164977604187
(0.0315, 0.0421] 205983 1.4002800840280099 0.017690780307112724 0.017377900965017536
(0.0421, 0.0698] 205983 1.4002800840280099 0.04062471174805688 0.038974533755975586
(0.0698, 1.0] 205983 2.7546751703454344 0.24495225334129517 0.1849515448215782


In [18]:
# Total sample size needed so that the 95% confidence half-width equals a given
# fraction (`within`) of the overall prevalence p.
p = df.toxic.mean()
print(p, end=' ')

# Two-sided normal critical value; identical for every target width.
alpha = 0.05
z_statistic = stats.norm.ppf(1 - (alpha / 2))

for within in [0.2, 0.1, 0.05]:
    desired_ci = p * within
    desired_var = np.square(desired_ci / z_statistic)

    # Samples required in the smallest bin; the other bins scale by their multiplier.
    minimum = numerator / desired_var
    n = np.sum(minimum * np.asarray(multipliers))
    print(int(n+1), end=' ')

0.041013700159721916 1262 5046 20183 

In [19]:
# Materialise the groupby into a list of (bin, sub-frame) pairs once, so the
# simulation loop below can iterate over the bins repeatedly without redoing
# the grouping.
cached_groupby = list(df.groupby('bin'))

In [20]:
# Monte-Carlo study of the pilot-based allocation: for each target precision,
# repeat the pilot draw 1000 times and record the total sample size that the
# resulting allocation would need; print the mean (+1, truncated) and the std.
p = df.toxic.mean()
print(p, end=' ')

# Two-sided normal critical value; identical for every iteration.
alpha = 0.05
z_statistic = stats.norm.ppf(1 - (alpha / 2))

# Per-bin quantities that do not depend on the pilot draw. Computing them once
# avoids re-scanning every (large) bin in each of the 3 x 1000 iterations.
bin_sizes = [len(group) for _, group in cached_groupby]
squared_weights = [np.square(n_h / len(df)) for n_h in bin_sizes]
bin_vars = [group['toxic'].var() for _, group in cached_groupby]

for within in [0.2, 0.1, 0.05]:
    desired_ci = p * within
    desired_var = np.square(desired_ci / z_statistic)

    ns = []
    for _ in tqdm(range(0, 1000)):
        sizes_sigmas = []
        for n_h, (_, group) in zip(bin_sizes, cached_groupby):
            # Draw one extra row and force it to a positive label so a pilot
            # with no positives still has a non-zero std.
            # NOTE(review): the allocation cell above draws pilot_size + 2 rows
            # and forces both a 1 and a 0; confirm which scheme is intended.
            pilot = np.array(group['toxic'].sample(pilot_size+1))
            pilot[-1] = 1

            sizes_sigmas.append((n_h, np.std(pilot)))

        # Share of the total sample given to each bin (proportional to N_h * sigma_h).
        denominator = np.sum([ n_h * sigma_h for n_h, sigma_h in sizes_sigmas ])
        allocations = [ n_h * sigma_h / denominator for n_h, sigma_h in sizes_sigmas ]

        numerator = 0
        for sq_weight, var_h, share_h in zip(squared_weights, bin_vars, allocations):
            # approximation when the groups are very large
            numerator += sq_weight * (var_h / share_h)

        # Total required sample, split by share; every bin needs at least the
        # pilot_size rows that were already spent on it.
        minimum = numerator / desired_var
        per_bin_n = [ np.max([pilot_size, minimum * share_h]) for share_h in allocations ]
        n = np.sum(per_bin_n)
        ns.append(n)
    print(int(np.mean(ns)+1), np.std(ns), end=' ')

0.041013700159721916 

  0%|          | 0/1000 [00:00<?, ?it/s]

1243 46.468472699753356 

  0%|          | 0/1000 [00:00<?, ?it/s]

4972 189.1469129474968 

  0%|          | 0/1000 [00:00<?, ?it/s]

19879 726.2383724460425 