In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from scipy.stats import norm, bernoulli
import json
import warnings; warnings.simplefilter('ignore')
from tqdm.auto import tqdm
from plotting_utils import plot_naive_coverage, plot_eff_sample_size, plot_coverage, plot_intervals
from utils import logistic, logistic_cov, active_logistic_pointestimate, opt_logistic_tuning, train_tree, tree_predict, inv_hessian_col_imputed

## Load data

In [None]:
# Load the politeness dataset and build the arrays used by every later cell:
#   Y          - human labels (column "Politeness")
#   Yhat       - LLM labels encoded as 1 ("A") / 0 ("B")
#   confidence - LLM-reported confidence, reshaped to an (n, 1) column
#   X          - design matrix [device feature, intercept]
# NOTE(review): the shuffle below is unseeded, so every run yields a different
# row order (and therefore different burn-in points downstream).
data = pd.read_csv('datasets/politeness_dataset.csv')
data = data.sample(frac=1).reset_index(drop=True)  # shuffle data
Yhat_string = data["label_gpt4o"].to_numpy()
confidence = data["confidence_in_prediction_gpt-4o"].to_numpy()
device = 'hedge'  # 'hedge' or '1pp'

# Drop rows where the LLM label or its confidence is missing. A boolean mask
# keeps the surviving rows in their (shuffled) order without relying on set
# iteration order.
missing_mask = pd.isna(confidence) | pd.isna(Yhat_string)
nan_indices = np.flatnonzero(missing_mask).tolist()
good_indices = np.flatnonzero(~missing_mask).tolist()
confidence = confidence[good_indices]
Yhat_string = Yhat_string[good_indices]
Y = data["Politeness"].to_numpy()[good_indices]
n = len(Y)

# Encode the LLM's string labels as 0/1. The mapping gets its own name so the
# builtin `dict` is not shadowed for the rest of the kernel session.
label_to_int = {"A": 1, "B": 0}
Yhat = np.array([label_to_int[label] for label in Yhat_string])

# Positional index of the column holding each supported device feature.
device_column = {'hedge': 4, '1pp': 11}
if device not in device_column:
    # Fail here rather than with a NameError on X_device further down.
    raise ValueError(
        f"unknown device {device!r}; expected one of {sorted(device_column)}"
    )
X_device = data.iloc[good_indices, [device_column[device]]].to_numpy()

# Design matrix: device feature in column 0, constant intercept in column 1.
X = np.column_stack((X_device, np.ones(n))).astype(float)
confidence = confidence.reshape(len(confidence), 1)

## Effective sample size and coverage

In [None]:
# Budget sweep: for each human-labelling budget in `fracs_human`, run
# `num_trials` independent trials of four estimators and record the confidence
# interval, its coverage of `true_effect`, and the effective sample size.
alpha = 0.1  # desired error level for confidence interval
fracs_human = np.linspace(0.05, 0.2, 10)  # overall sampling budget
num_trials = 100

# Reference fit using every human label; serves as the target for coverage
# and as the numerator of the effective-sample-size ratio.
true_pointest = logistic(X, Y)
true_effect = true_pointest[0]  # first coordinate is effect of device
true_Sigma = logistic_cov(true_pointest, X, Y, Yhat, np.ones(n), lam=0)

# Raw strings: "\m" is not a valid escape sequence, so the non-raw form is
# deprecated (SyntaxWarning on recent Pythons). The string value is unchanged.
n_human_col = r"$n_{\mathrm{human}}$"
n_effective_col = r"$n_{\mathrm{effective}}$"

# One-row-per-method scratch frame; it is overwritten and copied every trial.
num_methods = 4
temp_df = pd.DataFrame({
    "lb": np.zeros(num_methods),
    "ub": np.zeros(num_methods),
    "interval width": np.zeros(num_methods),
    "coverage": np.zeros(num_methods),
    "estimator": [""] * num_methods,
    n_human_col: np.zeros(num_methods),
    n_effective_col: np.zeros(num_methods),
})

burnin_steps = 50  # we collect the first burnin_steps points to initialize sampling rule
retrain_steps = 200  # every retrain_steps we retrain mapping from confidence to sampling probability
tau = 0.1  # parameter for mixing with uniform sampling for increased stability

# A budget that does not exceed the burn-in makes the adjusted fraction <= 0;
# the sampling probability is then clipped to 0 and the weights become
# 0/0 = NaN (silently, because warnings are suppressed in this notebook).
if fracs_human.min() * n <= burnin_steps:
    raise ValueError(
        f"smallest budget ({fracs_human.min() * n:.1f} labels) must exceed "
        f"burnin_steps={burnin_steps}"
    )

# compute column of inverse Hessian with LLM labels for approximating optimal sampling rule
h = inv_hessian_col_imputed(X, Yhat)

# Loop-invariant quantities, computed once.
abs_Xh = np.abs(X.dot(h))
z_crit = norm.ppf(1 - alpha/2)  # two-sided normal critical value

results = []

for frac_human in tqdm(fracs_human):
    frac_human_adjusted = (frac_human*n - burnin_steps)/(n - burnin_steps)  # remove burnin_steps samples from available budget
    n_human = int(n*frac_human)

    for trial in tqdm(range(num_trials)):
        # --- confidence-driven (active) sampling -----------------------------
        # Fit the error model on the burn-in points, which are always labelled
        # (weight 1).
        tree = train_tree(confidence[:burnin_steps], ((Y - Yhat)[:burnin_steps])**2)
        uncertainties = np.sqrt(tree_predict(tree, confidence)) * abs_Xh
        avg_uncertainty = np.mean(uncertainties)
        weights_active = np.zeros(n)
        weights_active[:burnin_steps] = 1

        for t in range(burnin_steps, n):

            if ((t-burnin_steps) % retrain_steps == 0):
                # Refit on every point labelled so far (non-zero weight).
                obs_inds = np.where(weights_active)
                tree = train_tree(confidence[obs_inds], ((Y - Yhat)[obs_inds])**2)
                uncertainties = np.sqrt(tree_predict(tree, confidence)) * abs_Xh
                avg_uncertainty = np.mean(uncertainties)

            # Probability proportional to relative uncertainty, mixed with
            # uniform sampling (weight tau) and clipped into [0, 1].
            # NOTE(review): avg_uncertainty == 0 would yield NaN here — confirm
            # whether that can occur for this data.
            sampling_prob = uncertainties[t]/avg_uncertainty*frac_human_adjusted
            sampling_prob = np.clip((1-tau)*sampling_prob + tau*frac_human_adjusted, 0, 1)
            # Inverse-probability weight: 1/p if labelled, 0 otherwise.
            weights_active[t] = bernoulli.rvs(sampling_prob)/sampling_prob

        # Initial estimate with lam=1, then re-estimate with the tuned lam.
        pointest_init = active_logistic_pointestimate(X, Y, Yhat, weights_active, lam=1)
        lam = opt_logistic_tuning(pointest_init, X, Y, Yhat, weights_active)
        pointest = active_logistic_pointestimate(X, Y, Yhat, weights_active, lam=lam)
        Sigmahat = logistic_cov(pointest, X, Y, Yhat, weights_active, lam=lam)
        half_width = z_crit*np.sqrt(np.diag(Sigmahat)/n)
        lower = pointest - half_width
        upper = pointest + half_width
        coverage = (true_effect >= lower[0])*(true_effect <= upper[0])
        temp_df.loc[0] = lower[0], upper[0], upper[0]-lower[0], coverage, "confidence-driven", n_human, (true_Sigma[0,0]/Sigmahat[0,0])*n

        # --- human + LLM, uniform (non-adaptive) sampling --------------------
        xi_unif = bernoulli.rvs([frac_human] * n)
        pointest = active_logistic_pointestimate(X, Y, Yhat, xi_unif/frac_human, lam=1)
        Sigmahat = logistic_cov(pointest, X, Y, Yhat, xi_unif/frac_human, lam=1)
        half_width = z_crit*np.sqrt(np.diag(Sigmahat)/n)
        lower = pointest - half_width
        upper = pointest + half_width
        coverage = (true_effect >= lower[0])*(true_effect <= upper[0])
        temp_df.loc[1] = lower[0], upper[0], upper[0]-lower[0], coverage, "human + LLM (non-adaptive)", n_human, (true_Sigma[0,0]/Sigmahat[0,0])*n

        # --- human labels only (same uniform sample, lam=0) ------------------
        pointest = logistic(X[np.where(xi_unif)], Y[np.where(xi_unif)])
        Sigmahat = logistic_cov(pointest, X, Y, Yhat, xi_unif/frac_human, lam=0)
        half_width = z_crit*np.sqrt(np.diag(Sigmahat)/n)
        lower = pointest - half_width
        upper = pointest + half_width
        coverage = (true_effect >= lower[0])*(true_effect <= upper[0])
        temp_df.loc[2] = lower[0], upper[0], upper[0]-lower[0], coverage, "human only", n_human, (true_Sigma[0,0]/Sigmahat[0,0])*n

        # --- LLM labels only, on a with-replacement resample of all rows -----
        LLM_only_sample = np.random.choice(n, n, replace=True)
        pointest = logistic(X[LLM_only_sample], Yhat[LLM_only_sample])
        Sigmahat = logistic_cov(pointest, X[LLM_only_sample], Yhat[LLM_only_sample], Yhat[LLM_only_sample], np.ones(n), lam=0)
        half_width = z_crit*np.sqrt(np.diag(Sigmahat)/n)
        lower = pointest - half_width
        upper = pointest + half_width
        coverage = (true_effect >= lower[0])*(true_effect <= upper[0])
        # Effective sample size is recorded as 0 for this baseline.
        temp_df.loc[3] = lower[0], upper[0], upper[0]-lower[0], coverage, "LLM only", n_human, 0

        results.append(temp_df.copy())

df = pd.concat(results, ignore_index=True)
df["coverage"] = df["coverage"].astype(bool)

In [None]:
# Effective-sample-size figure for the selected device, written under plots/.
ess_pdf_path = "plots/" + device + "_ESS.pdf"
plot_eff_sample_size(df, ess_pdf_path)

In [None]:
# Coverage figure (against the nominal error level alpha), written under plots/.
coverage_pdf_path = "plots/" + device + "_coverage.pdf"
plot_coverage(df, alpha, coverage_pdf_path)

In [None]:
# Interval figure for the device coefficient, written under plots/.
# The label pieces are raw strings: "\m" is not a valid escape sequence, so
# the non-raw form is deprecated. The resulting label text is unchanged.
coef_label = r"logistic regression coefficient β$_{\mathrm{" + device + r"}}$"
plot_intervals(df, true_effect, num_trials, coef_label, "plots/" + device + "_intervals.pdf", n_ind=0)

## Experiment at fixed $n_{\text{human}}$

In [None]:
# Fixed-budget experiment: with exactly `nhuman` expected human labels, run
# `num_trials` trials of three estimators and record coverage of `true_effect`
# and the effective sample size (plus its relative gain over `nhuman`).
alpha = 0.1  # desired error level for confidence interval
num_trials = 100
nhuman = 500
frac_human = nhuman/n

# Reference fit using every human label; serves as the target for coverage
# and as the numerator of the effective-sample-size ratio.
true_pointest = logistic(X, Y)
true_effect = true_pointest[0]  # first coordinate is for effect of marker
true_Sigma = logistic_cov(true_pointest, X, Y, Yhat, np.ones(n), lam=0)

# Raw string: "\m" is not a valid escape sequence, so the non-raw form is
# deprecated (SyntaxWarning on recent Pythons). The string value is unchanged.
n_effective_col = r"$n_{\mathrm{effective}}$"

# One-row-per-method scratch frame; it is overwritten and copied every trial.
num_methods = 3
temp_df = pd.DataFrame({
    "estimator": [""] * num_methods,
    "coverage": np.zeros(num_methods),
    n_effective_col: np.zeros(num_methods),
})

burnin_steps = 50  # we collect the first burnin_steps points to initialize sampling rule
retrain_steps = 200  # every retrain_steps we retrain mapping from confidence to sampling probability
tau = 0.1  # parameter for mixing with uniform sampling for increased stability

# The budget must exceed the burn-in (otherwise the adjusted fraction is <= 0
# and the weights become 0/0 = NaN) and cannot exceed the dataset size
# (otherwise frac_human > 1 is not a valid Bernoulli probability).
if not burnin_steps < nhuman <= n:
    raise ValueError(
        f"nhuman={nhuman} must satisfy burnin_steps={burnin_steps} < nhuman <= n={n}"
    )

# compute column of inverse Hessian with LLM labels for approximating optimal sampling rule
h = inv_hessian_col_imputed(X, Yhat)

# Loop-invariant quantities, computed once.
abs_Xh = np.abs(X.dot(h))
z_crit = norm.ppf(1 - alpha/2)  # two-sided normal critical value

results = []

frac_human_adjusted = (nhuman - burnin_steps)/(n - burnin_steps)  # remove burnin_steps samples from available budget

for trial in tqdm(range(num_trials)):
    # --- confidence-driven (active) sampling ---------------------------------
    # Fit the error model on the burn-in points, which are always labelled
    # (weight 1).
    tree = train_tree(confidence[:burnin_steps], ((Y - Yhat)[:burnin_steps])**2)
    uncertainties = np.sqrt(tree_predict(tree, confidence)) * abs_Xh
    avg_uncertainty = np.mean(uncertainties)
    weights_active = np.zeros(n)
    weights_active[:burnin_steps] = 1

    for t in range(burnin_steps, n):

        if ((t-burnin_steps) % retrain_steps == 0):
            # Refit on every point labelled so far (non-zero weight).
            obs_inds = np.where(weights_active)
            tree = train_tree(confidence[obs_inds], ((Y - Yhat)[obs_inds])**2)
            uncertainties = np.sqrt(tree_predict(tree, confidence)) * abs_Xh
            avg_uncertainty = np.mean(uncertainties)

        # Probability proportional to relative uncertainty, mixed with uniform
        # sampling (weight tau) and clipped into [0, 1].
        # NOTE(review): avg_uncertainty == 0 would yield NaN here — confirm
        # whether that can occur for this data.
        sampling_prob = uncertainties[t]/avg_uncertainty*frac_human_adjusted
        sampling_prob = np.clip((1-tau)*sampling_prob + tau*frac_human_adjusted, 0, 1)
        # Inverse-probability weight: 1/p if labelled, 0 otherwise.
        weights_active[t] = bernoulli.rvs(sampling_prob)/sampling_prob

    # Initial estimate with lam=1, then re-estimate with the tuned lam.
    pointest_init = active_logistic_pointestimate(X, Y, Yhat, weights_active, lam=1)
    lam = opt_logistic_tuning(pointest_init, X, Y, Yhat, weights_active)
    pointest = active_logistic_pointestimate(X, Y, Yhat, weights_active, lam=lam)
    Sigmahat = logistic_cov(pointest, X, Y, Yhat, weights_active, lam=lam)
    half_width = z_crit*np.sqrt(np.diag(Sigmahat)/n)
    lower = pointest - half_width
    upper = pointest + half_width
    coverage = (true_effect >= lower[0])*(true_effect <= upper[0])
    temp_df.loc[0] = "confidence-driven", coverage, (true_Sigma[0,0]/Sigmahat[0,0])*n

    # --- human + LLM, uniform (non-adaptive) sampling ------------------------
    xi_unif = bernoulli.rvs([frac_human] * n)
    pointest = active_logistic_pointestimate(X, Y, Yhat, xi_unif/frac_human, lam=1)
    Sigmahat = logistic_cov(pointest, X, Y, Yhat, xi_unif/frac_human, lam=1)
    half_width = z_crit*np.sqrt(np.diag(Sigmahat)/n)
    lower = pointest - half_width
    upper = pointest + half_width
    coverage = (true_effect >= lower[0])*(true_effect <= upper[0])
    temp_df.loc[1] = "human + LLM (non-adaptive)", coverage, (true_Sigma[0,0]/Sigmahat[0,0])*n

    # --- LLM labels only, on a with-replacement resample of all rows ---------
    LLM_only_sample = np.random.choice(n, n, replace=True)
    pointest = logistic(X[LLM_only_sample], Yhat[LLM_only_sample])
    Sigmahat = logistic_cov(pointest, X[LLM_only_sample], Yhat[LLM_only_sample], Yhat[LLM_only_sample], np.ones(n), lam=0)
    half_width = z_crit*np.sqrt(np.diag(Sigmahat)/n)
    lower = pointest - half_width
    upper = pointest + half_width
    coverage = (true_effect >= lower[0])*(true_effect <= upper[0])
    # Effective sample size is recorded as 0 for this baseline.
    temp_df.loc[2] = "LLM only", coverage, 0

    results.append(temp_df.copy())

df = pd.concat(results, ignore_index=True)
# Relative gain of the effective sample size over the human-label budget.
df["ESS gain"] = df[n_effective_col]/nhuman - 1
df["coverage"] = df["coverage"].astype(bool)