In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import datasets
from tqdm.notebook import tqdm

import utils

# Load pilot

In [2]:
# Pilot configuration: which annotation sheet to read, which value of the
# `database` column to keep, and the name of the derived 0/1 score column.
annotations_file = "./personal_attacks_pilot.tsv"
database = "before:politics"
score_column = "label"

In [3]:
# Read the tab-separated annotation sheet; its first row is a header, which is
# replaced below by the canonical column names.
# (An earlier revision of this cell read a headerless four-column file --
# bin, link, comment, raw_label -- and dropped rows containing NA.)
df = pd.read_csv(annotations_file, delimiter='\t', header=[0])
df.columns = ['random', 'database', 'bin', 'link', 'comment', 'raw_label', 'notes']

# Binary score: 1 where the raw annotation is 'z', 0 for anything else
# (including missing values).
df[score_column] = (df['raw_label'] == 'z').astype('int64')
df.head(2)

Unnamed: 0,random,database,bin,link,comment,raw_label,notes,label
0,0.998825,before:politics,"(-0.00042699999999999997, 0.000759]",https://www.reddit.com/r/politics/comments/hef...,"Wouldn’t that, in its own way, result in a net...",x,,0
1,0.998773,after:politics,"(0.000924, 0.00124]",https://www.reddit.com/r/politics/comments/hpa...,Racism is the point,x,,0


In [4]:
# Non-null count of every column, broken down by the `database` value.
df.groupby(by='database').count()

Unnamed: 0_level_0,random,bin,link,comment,raw_label,notes,label
database,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
after:politics,400,400,400,400,400,9,400
before:politics,400,400,400,400,400,6,400


In [5]:
# Keep only the rows belonging to the configured database.
pilot_df = df.loc[df['database'] == database]

# Pilot estimate of p

In [6]:
# Stratified estimate of p: each bin's mean label, weighted by the bin's share
# of the pilot rows.
n_pilot_rows = len(pilot_df)
pilot_mean = 0.
for _, stratum in pilot_df.groupby('bin'):
    stratum_share = len(stratum) / n_pilot_rows
    pilot_mean += stratum[score_column].mean() * stratum_share
pilot_mean

0.08000000000000002

# Allocation

In [7]:
# Step 1: per-bin (pilot size, smoothed standard deviation).
# One positive and one negative pseudo-label are prepended to every bin before
# taking the standard deviation (the original's "laplace smoothing"), so a bin
# whose pilot labels are all identical still gets a non-zero sigma.
sizes_sigmas = []
for bin_label, bin_scores in pilot_df.groupby('bin')[score_column]:
    smoothed = np.concatenate([np.array([1, 0]), np.array(bin_scores)])
    print(smoothed.sum(), bin_label)
    sizes_sigmas.append((len(bin_scores), np.std(smoothed)))
print()

# Step 2: split the total budget across bins in proportion to n_h * sigma_h.
size = 5000
denominator = np.sum([n_h * sigma_h for n_h, sigma_h in sizes_sigmas])
allocations = []
for n_h, sigma_h in sizes_sigmas:
    n_from_bin = size * n_h * sigma_h / denominator
    print(n_h, sigma_h, n_from_bin)
    allocations.append(n_from_bin)

2 (-0.00042699999999999997, 0.000759]
3 (0.000759, 0.000938]
3 (0.000938, 0.00128]
3 (0.00128, 0.00217]
2 (0.00217, 0.00678]
3 (0.00678, 0.241]
6 (0.241, 0.992]
18 (0.992, 0.999]

50 0.1923076923076923 455.1690191716224
50 0.23316068563427197 551.8631070655798
50 0.23316068563427197 551.8631070655798
50 0.23316068563427197 551.8631070655798
50 0.1923076923076923 455.1690191716224
50 0.23316068563427197 551.8631070655798
50 0.3194855331891567 756.1835672624027
50 0.47574295680203776 1126.0259661320338


In [8]:
# Express every bin's allocation as a multiple of the smallest allocation.
minimum = np.min(allocations)
multipliers = [allocation / minimum for allocation in allocations]
multipliers

[1.0,
 1.2124355652982144,
 1.2124355652982144,
 1.2124355652982144,
 1.0,
 1.2124355652982144,
 1.661324772583615,
 2.473863375370597]

In [9]:
# Variance of the stratified estimator under the planned allocation:
#     Var = sum_h  W_h^2 * s_h^2 / n_h
# where, for each bin h,
#     W_h   = the bin's share of the pilot rows,
#     s_h^2 = variance of the pilot labels in the bin (pandas `Series.var`),
#     n_h   = the number of samples allocated to the bin (`allocations`).
# No finite-population correction is applied: this is the approximation for
# strata that are very large relative to the sample drawn from them.
n_pilot_rows = len(pilot_df)
stratified_var = 0
for (bin_label, stratum), n_from_bin in zip(pilot_df.groupby('bin'), allocations):
    scores = stratum[score_column]
    stratum_mean = scores.mean()
    stratum_variance = scores.var()
    print(bin_label, len(stratum), n_from_bin, stratum_mean, stratum_variance)

    # The per-bin mean is only reported. It is intentionally NOT bound to the
    # notebook-level name `p`: the power-analysis cell sets `p = pilot_mean`,
    # and overwriting it here would silently change `p` whenever this cell is
    # re-run out of order.
    stratified_var += np.square(len(stratum) / n_pilot_rows) * (stratum_variance / n_from_bin)
'stddev of estimator: ', np.sqrt(stratified_var)

(-0.00042699999999999997, 0.000759] 50 455.1690191716224 0.02 0.019999999999999966
(0.000759, 0.000938] 50 551.8631070655798 0.04 0.039183673469387774
(0.000938, 0.00128] 50 551.8631070655798 0.04 0.03918367346938778
(0.00128, 0.00217] 50 551.8631070655798 0.04 0.03918367346938779
(0.00217, 0.00678] 50 455.1690191716224 0.02 0.019999999999999962
(0.00678, 0.241] 50 551.8631070655798 0.04 0.03918367346938779
(0.241, 0.992] 50 756.1835672624027 0.1 0.09183673469387751
(0.992, 0.999] 50 1126.0259661320338 0.34 0.22897959183673466


('stddev of estimator: ', 0.0032993587829579494)

In [10]:
# Detectable effect size, taken as four standard deviations of the stratified
# estimator.
# NOTE(review): the factor 4 is not derived anywhere in this notebook -- confirm
# the intended rule.
np.sqrt(stratified_var) * 4

0.013197435131831798

In [11]:
# Numerator of the sample-size formula: sum_h W_h^2 * s_h^2 / m_h, where m_h is
# the bin's allocation multiplier and W_h its share of the pilot rows.
# Approximation for very large strata (no finite-population correction).
n_pilot_rows = len(pilot_df)
numerator = 0
for (bin_label, stratum), multip_h in zip(pilot_df.groupby('bin'), multipliers):
    scores = stratum[score_column]
    stratum_variance = scores.var()
    print(bin_label, len(stratum), multip_h, scores.mean(), stratum_variance)

    numerator += np.square(len(stratum) / n_pilot_rows) * (stratum_variance / multip_h)

(-0.00042699999999999997, 0.000759] 50 1.0 0.02 0.019999999999999966
(0.000759, 0.000938] 50 1.2124355652982144 0.04 0.039183673469387774
(0.000938, 0.00128] 50 1.2124355652982144 0.04 0.03918367346938778
(0.00128, 0.00217] 50 1.2124355652982144 0.04 0.03918367346938779
(0.00217, 0.00678] 50 1.0 0.02 0.019999999999999962
(0.00678, 0.241] 50 1.2124355652982144 0.04 0.03918367346938779
(0.241, 0.992] 50 1.661324772583615 0.1 0.09183673469387751
(0.992, 0.999] 50 2.473863375370597 0.34 0.22897959183673466


# Power analysis

In [12]:
# Total sample size needed for a two-sided (1 - alpha) confidence interval
# whose half-width is a given fraction (`within`) of the pilot estimate p.
p = pilot_mean
print(p, end=' ')

# Confidence level and its normal quantile do not depend on `within`.
alpha = 0.05
z_statistic = stats.norm.ppf(1 - (alpha / 2))

for within in [0.2, 0.1, 0.05]:
    # Target half-width, converted into a target estimator variance.
    desired_ci = p * within
    desired_var = np.square(desired_ci / z_statistic)

    # `minimum` is the size of the smallest stratum sample; every other stratum
    # gets that size scaled by its multiplier.
    minimum = numerator / desired_var
    n = np.sum([minimum * multip for multip in multipliers])
    print(int(n+1), end=' ')

0.08000000000000002 817 3267 13068 