### This is a non-adaptive version of the tutorial, where we collect all the human annotations in a single batch
- Texts are randomly selected for human annotations, according to the budget
- We then leverage the verbalized confidence scores and limited human annotations for valid statistical inference
- We assume that the human annotations, the LLM annotations, and the LLM confidence scores have already been collected

### Import libraries

In [1]:
import numpy as np
from scipy.stats import norm, bernoulli
import pprint
import pandas as pd
from tqdm import tqdm
from tqdm.notebook import tqdm
import time
from ppi_py.utils import bootstrap
import re
import json
from datetime import datetime, timezone
import time
import zipfile
import io
import random
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
%load_ext autoreload
%autoreload 2
from utils import inference
from utils.inference import train_sampling_rule, sampling_rule_predict, confidence_driven_inference

### Set your file parameters and needed columns

1. The file path
2. Human labels (textual labels such as "positive" or "negative", or "polite" or "impolite"; leave the value empty for texts that have no human annotation)
3. LLM labels (textual labels such as "positive" or "negative", or "polite" or "impolite")
4. LLM confidence (a number between 0 and 1)
5. A text-based feature (0 or 1 indicating presence or absence of a textual feature)
6. What class will be considered as positive (the class we are focusing on)

In [2]:
### Replace with your path and your column names

# 1. The file path (a CSV file; it is read with pd.read_csv in the cells below)
dataset = 'data/politeness_dataset_human_incomplete.csv'

# 2. The name of the column with human labels
#    (rows without a human annotation are detected later with .isna())
human_labels = 'Prediction_human'

# 3. The name of the column with LLM labels
#    (compared case-insensitively against positive_class to build a 0/1 label)
llm_labels = 'Prediction_gpt-4o'

# 4. The name of the column with LLM confidence
llm_confidence = 'Confidence_gpt-4o'

# 5. The name of the column with a text-based feature (loaded as X below)
text_based_feature = 'Feature_3'

# 6. What class will be considered as positive (the class we are focusing on)
positive_class = 'Polite'

### Set parameters for Confidence-Driven Inference (CDI)

In [3]:
# Error level for the confidence interval, i.e. a nominal 1 - alpha interval.
# NOTE(review): the confidence_driven_inference calls further down in this
# notebook do not pass `alpha`; confirm that the helper's own default matches
# the value set here, otherwise changing it has no effect.
alpha = 0.1  # desired error level for confidence interval

### Step 1: Load the texts and LLM annotations

#### We will showcase the estimation of two target statistics:
1. $mean(Y)$, i.e., the fraction of texts in the corpus that are labeled as the positive class
2. The effect of the text-based feature ($X$) on the positive-class annotation ($Y$), estimated with a logistic regression

In [4]:
# Parse the CSV a single time and reuse the resulting frame for every column
# loaded in this cell, instead of re-reading the file for each one.
corpus_raw = pd.read_csv(dataset)
N = len(corpus_raw)
data = pd.DataFrame()

# load the text-based feature
data['X'] = corpus_raw[text_based_feature].values

# load the existing LLM annotations we already collected:
# 1 when the LLM label equals the positive class (case-insensitive), 0 otherwise
positive_class_lower = positive_class.lower()
data['llm'] = corpus_raw[llm_labels].apply(lambda x: 1 if x.lower()==positive_class_lower else 0).values
data['llm_conf'] = corpus_raw[llm_confidence].values

### Step 2: Load human annotations (in a single batch)

In [5]:
# load the existing human annotations we already collected
# (select a single column so the result is 1-D, one value per text)
human_raw = pd.read_csv(dataset)[human_labels]

if pd.api.types.is_numeric_dtype(human_raw):
    # Already numeric (NaN marks texts without a human annotation): use as-is.
    data['human'] = human_raw.to_numpy()
else:
    # Textual labels: 1.0 for the positive class (case-insensitive), 0.0 for any
    # other label, and NaN where no human annotation exists. This mirrors the
    # 0/1 encoding applied to the LLM labels, and keeps the column numeric so
    # the arithmetic and np.isnan checks in later cells work.
    is_positive = human_raw.astype(str).str.lower() == positive_class.lower()
    data['human'] = is_positive.astype(float).where(human_raw.notna()).to_numpy()

# Number of texts that actually have a human annotation
n_human = int(data['human'].notna().sum())
print(f"{n_human} human datapoints collected in total.")

2000 human datapoints collected in total.


### Step 3: Based on LLM's confidence and limited human annotations, learn the weights for all the texts

In [22]:
# Boolean mask: True for texts that have a human annotation
has_human = ~data['human'].isna()

# LLM confidence as an (N, 1) column, and its restriction to annotated texts
confidence = data['llm_conf'].to_numpy().reshape((N,1))
confidence_human = confidence[has_human]

# Human labels (H) and LLM labels (Hhat), for all texts and for annotated texts
H = data['human'].to_numpy()
Hhat = data['llm'].to_numpy()
H_human = H[has_human]
Hhat_human = Hhat[has_human]

# 1 where a human label was collected, 0 otherwise
data['sampling_decisions'] = has_human.astype(int)

# Fit the sampling rule on annotated texts only:
# input is the LLM confidence, target is the squared human/LLM disagreement
squared_error_human = (H_human - Hhat_human)**2
sampling_rule = train_sampling_rule(confidence_human, squared_error_human)

# Predict for every text, take the square root, clip to [1e-4, 1],
# then divide by the total so the values sum to one
predicted_error = sampling_rule_predict(sampling_rule, confidence)
sampling_probs_unnormed = np.clip(np.sqrt(predicted_error), 1e-4, 1)
data['sampling_probs'] = sampling_probs_unnormed / sum(sampling_probs_unnormed)

### Step 4: Compute the estimate and confidence interval

In [23]:
#define the estimator function
#mask for valid labels and specify how to use weights

def mean_estimator(y, weights):
    """Weighted mean of the non-missing entries of ``y``.

    Entries where ``y`` is NaN are dropped, together with the weights at the
    same positions, before the weighted average is taken.

    Parameters:
        y: array of labels; NaN marks a missing label.
        weights: array of weights aligned with ``y``.

    Returns:
        sum(y * weights) / sum(weights) over the non-missing entries.
    """
    observed = ~np.isnan(y)
    y_obs = y[observed]
    w_obs = weights[observed]
    weighted_total = np.sum(y_obs * w_obs)
    return weighted_total / np.sum(w_obs)

def log_reg_estimator(X, y, weights):
    """Coefficient of the first feature from a weighted logistic regression.

    Rows where ``y`` is NaN are dropped from ``X``, ``y`` and ``weights``
    before fitting.

    Parameters:
        X: 2-D feature array, one row per text.
        y: array of labels; NaN marks a missing label.
        weights: per-row sample weights aligned with ``y``.

    Returns:
        ``coef_[0, 0]`` of the fitted model, i.e. the coefficient of the
        first column of ``X``.
    """
    observed = ~np.isnan(y)
    # Only the solver is set explicitly; every other option keeps the
    # scikit-learn default, which includes an L2 penalty.
    # NOTE(review): confirm that this regularisation is intended when the
    # coefficient itself is the quantity being estimated.
    model = LogisticRegression(solver="liblinear")
    model.fit(X[observed], y[observed], sample_weight=weights[observed])
    return model.coef_[0, 0]

#### $mean(Y)$ estimation

In [24]:
# Confidence interval for mean(Y): human labels where available, LLM labels
# everywhere, plus the learned sampling probabilities and sampling indicators.
# NOTE(review): `alpha` defined earlier in the notebook is not forwarded here;
# confirm that confidence_driven_inference's default error level matches it.
mean_inputs = dict(
    estimator = mean_estimator,
    Y = data['human'].values,
    Yhat = data['llm'].values,
    sampling_probs = data['sampling_probs'].values,
    sampling_decisions = data['sampling_decisions'].values,
)
lower_bound, upper_bound = confidence_driven_inference(**mean_inputs)

print("CDI estimate of the target statistic (mean(Y)):")
print(lower_bound.round(4), upper_bound.round(4))

CDI estimate of the target statistic (mean(Y)):
0.565 0.5974


#### $\beta$ estimation

In [25]:
# Confidence interval for the logistic-regression coefficient of X on Y.
# X is reshaped to a single-column 2-D array, as the estimator fits on it directly.
# NOTE(review): `alpha` defined earlier in the notebook is not forwarded here;
# confirm that confidence_driven_inference's default error level matches it.
beta_inputs = dict(
    estimator = log_reg_estimator,
    Y = data['human'].values,
    Yhat = data['llm'].values,
    X = data['X'].values.reshape(-1, 1),
    sampling_probs = data['sampling_probs'].values,
    sampling_decisions = data['sampling_decisions'].values,
)
lower_bound, upper_bound = confidence_driven_inference(**beta_inputs)

print("CDI estimate of the target statistic (β: effect of X on Y):")
print(lower_bound.round(4), upper_bound.round(4))

CDI estimate of the target statistic (β: effect of X on Y):
0.3305 0.6436
