In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from scipy.stats import norm, bernoulli
import json
import warnings; warnings.simplefilter('ignore')
from tqdm.auto import tqdm
from plotting_utils import plot_naive_coverage, plot_eff_sample_size, plot_coverage, plot_intervals
from utils import odds_ratio_ci, train_tree, tree_predict, opt_mean_tuning
import re

## Load data

In [None]:
# Load the stance dataset and shuffle its rows.
# NOTE(review): no random_state is passed to sample(), so the row order (and
# therefore which rows form the burn-in prefix used later) changes on every run.
data = pd.read_csv('datasets/stance_dataset.csv')
data = data.sample(frac=1).reset_index(drop=True) # shuffle data

# Terms whose presence in a sentence marks it as containing an "affirming device".
# A few entries are repeated ('realize', 'distinguished'); this is harmless in the
# alternation built below.
affirming_devices = ['uncover', 'realize', 'know', 'understand', 'learn', 'concede',
'remember', 'recall', 'discover', 'show', 'reveal', 'see',
'forget', 'find', 'point out', 'indicate', 'acknowledge',
'admit', 'realize', 'notice', 'certify', 'verify', 'corroborate', 'affirm', 'confirm', 'agree', 'conclude',
'proven', 'settled', 'conclusive', 'definitive',
'famed', 'unequivocal', 'skilful', 'notable', 'strong', 'famous', 'Nobel', 'skillful',
'Nobelist', 'Nobel Laureate', 'Nobel prize winner',
'Nobel prize winning', 'prize winning', 'award',
'winning', 'distinguished', 'well-grounded', 'esteemed', 'proficient', 'key', 'evidence', 'noted', 'top',
'preeminent', 'breakthrough', 'significant', 'intelligent', 'of import', 'celebrated', 'novel', 'recent',
'major', 'landmark', 'important', 'distinguished',
'renowned', 'peer-reviewed', 'expert', 'leading',
'thousand', '1000', 'hundred', '100', 'unanimous', 'diverse',
'substantial', 'many', 'multiple', 'dozen', 'numerous']

# Escape every term so it is matched literally, then join them into one alternation.
# NOTE(review): terms are matched as case-insensitive substrings (no word
# boundaries), so e.g. 'top' also matches inside "stop" -- confirm this is intended.
escaped_terms = [re.escape(term) for term in affirming_devices]
pattern = '|'.join(escaped_terms)
# na=False: a missing sentence counts as "no device", which keeps the column
# strictly boolean so it can be used directly as a mask (and negated with ~).
data['contains_affirming_device'] = data['sentence'].str.contains(pattern, case=False, regex=True, na=False)

In [None]:
# Raw model outputs: predicted label string and the model's stated confidence.
Yhat_string = data["label_gpt4o"].to_numpy()
confidence = data["confidence_in_prediction_gpt-4o"].to_numpy()

# Keep only rows where both the prediction and its confidence are present.
# A boolean mask (rather than a set difference) preserves the row order explicitly.
is_valid = ~(pd.isna(confidence) | pd.isna(Yhat_string))
good_indices = np.where(is_valid)[0]

confidence = confidence[good_indices]
device = data['contains_affirming_device'].to_numpy()[good_indices]
Yhat_string = Yhat_string[good_indices]
Y_string = data["MACE_pred"].to_numpy()[good_indices]
n = len(Yhat_string)

# Binarize labels: 1 for "A"/"agrees", 0 for every other listed label.
# Named label_to_binary instead of `dict` so the builtin is not shadowed for the
# rest of the kernel session. A label outside this mapping raises KeyError.
label_to_binary = {"A" : 1, "B" : 0, "C" : 0, "agrees": 1, "neutral" : 0, "disagrees": 0}
Yhat = np.array([label_to_binary[label] for label in Yhat_string])
Y = np.array([label_to_binary[label] for label in Y_string])

# Column vector of shape (n, 1); this is the feature array handed to train_tree / tree_predict.
confidence = confidence.reshape(len(confidence), 1)

In [None]:
# Partition reference labels and model predictions by the boolean `device` mask:
# suffix 1 = rows where `device` is True, suffix 0 = rows where it is False.
Y0, Y1 = Y[~device], Y[device]
Yhat0, Yhat1 = Yhat[~device], Yhat[device]

# Group sizes.
n0, n1 = len(Y0), len(Y1)

## Effective sample size and coverage

In [None]:
alpha = 0.1  # desired error level for confidence interval
fracs_human = np.linspace(0.2, 0.5, 10)  # overall sampling budget
num_trials = 100

# Reference ("true") odds ratio, computed from the full label arrays Y0 / Y1.
mu0 = Y0.mean()
mu1 = Y1.mean()
true_odds_ratio = (mu1 / (1 - mu1)) / (mu0 / (1 - mu0))
# Sum of reciprocal cell counts over the full data, scaled by n; used as the
# numerator of the effective-sample-size ratios reported for each estimator.
true_var = (1/np.sum(Y0==0) + 1/np.sum(Y0==1) + 1/np.sum(Y1==0) + 1/np.sum(Y1==1))*n

# One row per estimator; the frame is refilled and copied into `results` every trial.
# Raw strings keep the LaTeX backslashes without relying on Python passing the
# invalid escape "\m" through unchanged (the resulting column names are identical).
num_methods = 4
temp_df = pd.DataFrame({
    "lb": np.zeros(num_methods),
    "ub": np.zeros(num_methods),
    "interval width": np.zeros(num_methods),
    "coverage": np.zeros(num_methods),
    "estimator": [""] * num_methods,
    r"$n_{\mathrm{human}}$": np.zeros(num_methods),
    r"$n_{\mathrm{effective}}$": np.zeros(num_methods)
})

burnin_steps = 50  # we collect the first burnin_steps points to initialize sampling rule
retrain_steps = 50  # every retrain_steps we retrain mapping from confidence to sampling probability
tau = 0.1  # parameter for mixing with uniform sampling for increased stability

results = []

# Two-sided normal critical value shared by the closed-form intervals below.
z_crit = norm.ppf(1 - alpha/2)

for j in tqdm(range(len(fracs_human))):
    frac_human = fracs_human[j]
    # remove burnin_steps samples from available budget for both classes
    frac_human_adjusted = (frac_human*n - burnin_steps)/(n - burnin_steps)

    for i in tqdm(range(num_trials)):
        # ---- confidence-driven sampling ----
        # Initial rule: tree fit on the burn-in prefix, confidence -> squared error.
        tree = train_tree(confidence[:burnin_steps], ((Y - Yhat)[:burnin_steps])**2)
        uncertainties = np.sqrt(tree_predict(tree, confidence))
        avg_uncertainty = np.mean(uncertainties)
        weights_active = np.zeros(n)
        sampling_ratio = np.zeros(n)
        weights_active[:burnin_steps] = 1  # burn-in points are always collected, with weight 1

        for t in range(burnin_steps, n):
            # Every retrain_steps points, refit the tree on all points collected so far
            # (those with a nonzero weight).
            if ((t-burnin_steps) % retrain_steps == 0):
                obs_inds = np.where(weights_active)
                tree = train_tree(confidence[obs_inds], ((Y - Yhat)[obs_inds])**2)
                uncertainties = np.sqrt(tree_predict(tree, confidence))
                avg_uncertainty = np.mean(uncertainties)

            # Sampling probability proportional to the predicted uncertainty, mixed
            # with the uniform rule (weight tau) and clipped to [0, 1].
            sampling_prob = uncertainties[t]/avg_uncertainty*frac_human_adjusted
            sampling_prob = np.clip((1-tau)*sampling_prob + tau*frac_human_adjusted, 0, 1)
            sampling_ratio[t] = (1-sampling_prob)/sampling_prob
            # Inverse-probability weight: 1/sampling_prob if the point is drawn, else 0.
            weights_active[t] = bernoulli.rvs(sampling_prob)/sampling_prob

        weights_active0 = weights_active[~device]
        weights_active1 = weights_active[device]
        sampling_ratio0 = sampling_ratio[~device]
        sampling_ratio1 = sampling_ratio[device]

        # NOTE(review): opt_mean_tuning / odds_ratio_ci are project helpers; presumably
        # they return a per-group tuning coefficient and (lower, upper, variance
        # estimate) respectively -- confirm against utils.
        lam0 = opt_mean_tuning(Y0, Yhat0, weights_active0, sampling_ratio0)
        lam1 = opt_mean_tuning(Y1, Yhat1, weights_active1, sampling_ratio1)
        l, u, varhat = odds_ratio_ci(Y0, Yhat0, Y1, Yhat1, weights_active0, weights_active1, alpha, lhat0=lam0, lhat1=lam1)
        coverage = (true_odds_ratio >= l)*(true_odds_ratio <= u)
        temp_df.loc[0] = l, u, u-l, coverage, "confidence-driven", int(n*frac_human), (true_var/varhat)*n

        # ---- human + LLM with uniform (non-adaptive) sampling ----
        xi_unif0 = bernoulli.rvs([frac_human] * n0)
        xi_unif1 = bernoulli.rvs([frac_human] * n1)
        l, u, varhat = odds_ratio_ci(Y0, Yhat0, Y1, Yhat1, xi_unif0/frac_human, xi_unif1/frac_human, alpha, lhat0=1, lhat1=1)
        coverage = (true_odds_ratio >= l)*(true_odds_ratio <= u)
        temp_df.loc[1] = l, u, u-l, coverage, "human + LLM (non-adaptive)", int(n*frac_human), (true_var/varhat)*n

        # ---- human only: log-odds-ratio interval on the uniformly sampled labels ----
        # Local names are used for the subsample means so the global mu0 / mu1
        # (full-data means behind true_odds_ratio) are not overwritten.
        Y0_human = Y0[np.where(xi_unif0)]
        Y1_human = Y1[np.where(xi_unif1)]
        mu0_human = Y0_human.mean()
        mu1_human = Y1_human.mean()
        log_odds_ratio_est = np.log((mu1_human / (1 - mu1_human)) / (mu0_human / (1 - mu0_human)))
        varhat = (1/np.sum(Y0_human==0) + 1/np.sum(Y0_human==1) + 1/np.sum(Y1_human==0) + 1/np.sum(Y1_human==1))
        l = np.exp(log_odds_ratio_est - z_crit*np.sqrt(varhat))
        u = np.exp(log_odds_ratio_est + z_crit*np.sqrt(varhat))
        coverage = (true_odds_ratio >= l)*(true_odds_ratio <= u)
        # varhat here is not scaled by n, hence no extra factor of n in the ESS.
        temp_df.loc[2] = l, u, u-l, coverage, "human only", int(n*frac_human), (true_var/varhat)

        # ---- LLM only: same interval on a bootstrap resample of the predictions ----
        LLM_only_sample0 = np.random.choice(n0, n0, replace=True)
        LLM_only_sample1 = np.random.choice(n1, n1, replace=True)
        Yhat0_i = Yhat0[LLM_only_sample0]
        Yhat1_i = Yhat1[LLM_only_sample1]
        muhat0 = Yhat0_i.mean()
        muhat1 = Yhat1_i.mean()
        # The interval is built on the log scale and exponentiated, so the point
        # estimate must be the LOG odds ratio (as in the human-only baseline above).
        log_odds_ratio_est = np.log((muhat1 / (1 - muhat1)) / (muhat0 / (1 - muhat0)))
        varhat = (1/np.sum(Yhat0_i==0) + 1/np.sum(Yhat0_i==1) + 1/np.sum(Yhat1_i==0) + 1/np.sum(Yhat1_i==1))
        l = np.exp(log_odds_ratio_est - z_crit*np.sqrt(varhat))
        u = np.exp(log_odds_ratio_est + z_crit*np.sqrt(varhat))
        coverage = (true_odds_ratio >= l)*(true_odds_ratio <= u)
        # Effective sample size is recorded as 0 for this baseline.
        temp_df.loc[3] = l, u, u-l, coverage, "LLM only", int(n*frac_human), 0

        results += [temp_df.copy()]

# Stack the per-trial frames into one long results table.
df = pd.concat(results,ignore_index=True)
df["coverage"] = df["coverage"].astype(bool)

In [None]:
# Effective-sample-size figure built from the simulation results in df.
# NOTE(review): the second argument looks like the output path the helper writes to -- confirm in plotting_utils.
plot_eff_sample_size(df, "plots/stance_ESS.pdf")

In [None]:
# Coverage figure built from the simulation results in df; alpha is the error level used for the intervals.
# NOTE(review): the last argument looks like the output path the helper writes to -- confirm in plotting_utils.
plot_coverage(df, alpha, "plots/stance_coverage.pdf")

In [None]:
# Interval figure for the simulation results in df, with true_odds_ratio as the reference value.
# The label is a raw string so the LaTeX "\m" is not parsed as an (invalid) escape
# sequence; the string value passed to the helper is unchanged.
# NOTE(review): n_ind presumably selects which budget in fracs_human to show -- confirm in plotting_utils.
plot_intervals(df, true_odds_ratio, num_trials, r"odds ratio $O_{\mathrm{agreement}}$", "plots/stance_intervals.pdf", n_ind=0)

## Experiment at fixed $n_{\text{human}}$

In [None]:
alpha = 0.1  # desired error level for confidence interval
num_trials = 100
nhuman = 500  # fixed human-label budget for this experiment
frac_human = nhuman/n

# Reference ("true") odds ratio, computed from the full label arrays Y0 / Y1.
mu0 = Y0.mean()
mu1 = Y1.mean()
true_odds_ratio = (mu1 / (1 - mu1)) / (mu0 / (1 - mu0))
# Sum of reciprocal cell counts over the full data, scaled by n; used as the
# numerator of the effective-sample-size ratios reported for each estimator.
true_var = (1/np.sum(Y0==0) + 1/np.sum(Y0==1) + 1/np.sum(Y1==0) + 1/np.sum(Y1==1))*n

# NOTE(review): `columns` is not referenced by the code visible here, and its last
# entry does not match the actual column name used in temp_df below.
columns = ["estimator", "coverage", "effective sample size"]
# One row per estimator; the frame is refilled and copied into `results` every trial.
# The raw string keeps the LaTeX backslash without relying on Python passing the
# invalid escape "\m" through unchanged (the resulting column name is identical).
num_methods = 3
temp_df = pd.DataFrame({
    "estimator": [""] * num_methods,
    "coverage": np.zeros(num_methods),
    r"$n_{\mathrm{effective}}$": np.zeros(num_methods)
})


burnin_steps = 50  # we collect the first burnin_steps points to initialize sampling rule
retrain_steps = 50  # every retrain_steps we retrain mapping from confidence to sampling probability
tau = 0.1  # parameter for mixing with uniform sampling for increased stability

results = []


frac_human_adjusted = (nhuman - burnin_steps)/(n - burnin_steps) # remove burnin_steps samples from available budget for both classes

# Two-sided normal critical value for the closed-form LLM-only interval below.
z_crit = norm.ppf(1 - alpha/2)

for i in tqdm(range(num_trials)):
    # ---- confidence-driven sampling ----
    # Initial rule: tree fit on the burn-in prefix, confidence -> squared error.
    tree = train_tree(confidence[:burnin_steps], ((Y - Yhat)[:burnin_steps])**2)
    uncertainties = np.sqrt(tree_predict(tree, confidence))
    avg_uncertainty = np.mean(uncertainties)
    weights_active = np.zeros(n)
    sampling_ratio = np.zeros(n)
    weights_active[:burnin_steps] = 1  # burn-in points are always collected, with weight 1

    for t in range(burnin_steps, n):
        # Every retrain_steps points, refit the tree on all points collected so far
        # (those with a nonzero weight).
        if ((t-burnin_steps) % retrain_steps == 0):
            obs_inds = np.where(weights_active)
            tree = train_tree(confidence[obs_inds], ((Y - Yhat)[obs_inds])**2)
            uncertainties = np.sqrt(tree_predict(tree, confidence))
            avg_uncertainty = np.mean(uncertainties)

        # Sampling probability proportional to the predicted uncertainty, mixed
        # with the uniform rule (weight tau) and clipped to [0, 1].
        sampling_prob = uncertainties[t]/avg_uncertainty*frac_human_adjusted
        sampling_prob = np.clip((1-tau)*sampling_prob + tau*frac_human_adjusted, 0, 1)
        sampling_ratio[t] = (1-sampling_prob)/sampling_prob

        # Inverse-probability weight: 1/sampling_prob if the point is drawn, else 0.
        weights_active[t] = bernoulli.rvs(sampling_prob)/sampling_prob

    weights_active0 = weights_active[~device]
    weights_active1 = weights_active[device]
    sampling_ratio0 = sampling_ratio[~device]
    sampling_ratio1 = sampling_ratio[device]

    # NOTE(review): opt_mean_tuning / odds_ratio_ci are project helpers; presumably
    # they return a per-group tuning coefficient and (lower, upper, variance
    # estimate) respectively -- confirm against utils.
    lam0 = opt_mean_tuning(Y0, Yhat0, weights_active0, sampling_ratio0)
    lam1 = opt_mean_tuning(Y1, Yhat1, weights_active1, sampling_ratio1)
    l, u, varhat = odds_ratio_ci(Y0, Yhat0, Y1, Yhat1, weights_active0, weights_active1, alpha, lhat0=lam0, lhat1=lam1)
    coverage = (true_odds_ratio >= l)*(true_odds_ratio <= u)
    temp_df.loc[0] = "confidence-driven", coverage, (true_var/varhat)*n

    # ---- human + LLM with uniform (non-adaptive) sampling ----
    xi_unif0 = bernoulli.rvs([frac_human] * n0)
    xi_unif1 = bernoulli.rvs([frac_human] * n1)
    l, u, varhat = odds_ratio_ci(Y0, Yhat0, Y1, Yhat1, xi_unif0/frac_human, xi_unif1/frac_human, alpha, lhat0=1, lhat1=1)
    coverage = (true_odds_ratio >= l)*(true_odds_ratio <= u)
    temp_df.loc[1] = "human + LLM (non-adaptive)", coverage, (true_var/varhat)*n

    # ---- LLM only: interval on a bootstrap resample of the predictions ----
    LLM_only_sample0 = np.random.choice(n0, n0, replace=True)
    LLM_only_sample1 = np.random.choice(n1, n1, replace=True)
    Yhat0_i = Yhat0[LLM_only_sample0]
    Yhat1_i = Yhat1[LLM_only_sample1]
    muhat0 = Yhat0_i.mean()
    muhat1 = Yhat1_i.mean()
    # The interval is built on the log scale and exponentiated, so the point
    # estimate must be the LOG odds ratio; without the log, the bounds would be
    # exp(odds ratio +/- margin) rather than an interval for the odds ratio.
    log_odds_ratio_est = np.log((muhat1 / (1 - muhat1)) / (muhat0 / (1 - muhat0)))
    varhat = (1/np.sum(Yhat0_i==0) + 1/np.sum(Yhat0_i==1) + 1/np.sum(Yhat1_i==0) + 1/np.sum(Yhat1_i==1))
    l = np.exp(log_odds_ratio_est - z_crit*np.sqrt(varhat))
    u = np.exp(log_odds_ratio_est + z_crit*np.sqrt(varhat))
    coverage = (true_odds_ratio >= l)*(true_odds_ratio <= u)
    # Effective sample size is recorded as 0 for this baseline.
    temp_df.loc[2] = "LLM only", coverage, 0

    results += [temp_df.copy()]

# Stack the per-trial frames into one long results table.
df = pd.concat(results,ignore_index=True)
# Relative gain of the effective sample size over the nhuman labels actually budgeted.
# Raw string: same column key, without the invalid "\m" escape sequence.
df["ESS gain"] = df[r"$n_{\mathrm{effective}}$"]/nhuman - 1
df["coverage"] = df["coverage"].astype(bool)