### Import libraries

In [1]:
import numpy as np
from scipy.stats import norm, bernoulli
import pprint
import pandas as pd
from tqdm import tqdm
from tqdm.notebook import tqdm
import time
from ppi_py.utils import bootstrap
import re
import openai
import requests
import json
from datetime import datetime, timezone
import time
import zipfile
import io
import random
from sklearn.linear_model import LogisticRegression
%load_ext autoreload
%autoreload 2

In [2]:
from utils import llms, qualtrics, prolific, mturk, inference
from utils.llms import annotate_texts_with_llm, collect_llm_confidence
from utils.qualtrics import create_and_activate_surveys
from utils.prolific import run_prolific_annotation_pipeline
from utils.mturk import run_mturk_annotation_pipeline
from utils.inference import train_sampling_rule, sampling_rule_predict, confidence_driven_inference

### Set parameters for Confidence-Driven Inference (CDI)

In [3]:
# if true, we collect LLM annotations and human annotations from scratch, if false, load pre-collected ones
COLLECT_LLM = False
COLLECT_HUMAN = False

# if COLLECT_HUMAN = True, specify whether to use "Prolific" or "MTURK"
HUMAN_SOURCE = "MTURK"

alpha = 0.1  # desired error level for confidence interval
burnin_steps = 50  # we collect the first burnin_steps points to initialize sampling rule

n_batches = 4

tau = 0.1  # parameter for mixing with uniform sampling for increased stability
n_human = 200 # budget on number of human annotations (including burnin_steps)

N = 1000 # corpus size, or the size of the random subset of the corpus that will be annotated with an LLM
random_state = 42

### Set API keys (needed only if COLLECT_LLM or COLLECT_HUMAN are True)

In [4]:
# Set your Open API key
OPENAI_API_KEY = ""

# Set your Qualtrics API key and url here
QUALTRICS_API_KEY = ""

# Set your Qualtrics url here (e.g., https://stanforduniversity.qualtrics.com/API/v3)
QUALTRICS_API_URL = ""

# Set your Prolific API key here
PROLIFIC_API_KEY = ""

# Set your MTURK API key here
AWS_ACCESS_KEY_ID = ""

# Set your MTURK API secret access key here
AWS_SECRET_ACCESS_KEY = ""

### Set parameters for LLM annotation

In [5]:
prompt = """Is the following text polite? Output either A or B. Output a letter only.
A) Polite
B) Impolite

Text: """

#category numerically coded as 1
positive_class = 'Polite'

mapping_categories = {
"A": "Polite",
"B": "Impolite"
}

model = "gpt-4o-mini-2024-07-18"
sleep = 0.5
temperature = 0

### Set parameters for human annotation

In [6]:
## Task parameters
task_title = "Is the following text polite?"
annotation_instruction = "Is the following text polite?"
task_description = "Please annotate the politeness of the provided text. This is a real-time study where we're hoping to get immediate annotations."
categories = [
    "Polite",
    "Impolite"
]

## Additional Prolific settings
BASE_URL = "https://api.prolific.com/api/v1"
HEADERS = {
    "Authorization": f"Token {PROLIFIC_API_KEY}",
    "Content-Type": "application/json",
}
reward = 80
estimated_time = 1
maximum_allowed_time = 30 # How long a single annotator can take
BATCH_TIMEOUT = 30 # How long we'll wait for the resonses (in minutes) before moving on (replaces not collected annotations with np.nan)

## Additional MTURK settings
task_reward = '0.8' 
minimum_approval_rate = 99 # >98% reccommended
minimum_tasks_approved = 0 # >5000 reccommended
annotation_instructions = {"question": task_description,
    "options": set(categories)}

### Step 1: Load the texts and collect LLM annotations

#### Texts will be annotated for their politeness. We will showcase the estimation of two target statistics:
1. Prevalence of the politeness $mean(Y)$, i.e., the fraction of texts in the corpus that are polite
2. The impact of text-based feature, i.e., presence of hedging ($X$), on the perceived politeness ($Y$), estimated with a logistic regression

In [7]:
start_time_data_collection = time.time()

# Load the text corpus; we need two columns: texts and the text-based feature we will use for inference
# Text-based feature used for inference in this example is the presence of hedging, stored in "Feature_3" column
text_based_feature = 'Feature_3'

df = pd.read_csv('data/politeness_dataset.csv')[['Text',text_based_feature]]
df = df.sample(n = N, random_state = random_state)

In [8]:
n = len(df)
data = pd.DataFrame()
data['human'] = [np.nan]*(n)
data['llm'] = [np.nan]*(n)
data['llm_conf'] = [np.nan]*(n)
data['X'] = df[text_based_feature].values
data['text'] = df['Text'].values

In [9]:
if COLLECT_LLM:
    #collect annotations
    sample_texts = annotate_texts_with_llm(texts = data['text'].values,
                                       model = model,
                                       prompt = prompt,
                                       mapping_categories = mapping_categories,
                                       sleep = sleep,
                                       temperature = temperature,
                                       OPENAI_API_KEY = OPENAI_API_KEY)
    #collect verbalized confidence
    sample_texts = collect_llm_confidence(sample_texts = sample_texts,
                                       model = model,
                                       sleep = sleep,
                                       temperature = temperature,
                                       OPENAI_API_KEY = OPENAI_API_KEY)

    
    data['llm'] = sample_texts['LLM_annotation'].apply(lambda x: 1 if x.lower()==positive_class.lower() else 0).values
    data['llm_conf'] = sample_texts['confidence_in_prediction']
else:
    #load the extisting annotations we already collected
    df['Prediction_gpt-4o'] = pd.read_csv('data/politeness_dataset.csv')['Prediction_gpt-4o'].sample(n = N, random_state = random_state).values
    df['Confidence_gpt-4o'] = pd.read_csv('data/politeness_dataset.csv')['Confidence_gpt-4o'].sample(n = N, random_state = random_state).values
    data['llm'] = df['Prediction_gpt-4o'].apply(lambda x: 1 if x.lower()==positive_class.lower() else 0).values
    data['llm_conf'] = df['Confidence_gpt-4o'].values

### Step 2: Collect human annotations for burnin steps

In [10]:
if COLLECT_HUMAN:
    texts_to_annotate = list(data.loc[:burnin_steps-1,'text'].values)

    if HUMAN_SOURCE == "Prolific":
    
        # create Qualtrics annotation interface and get annotation task URLs
        survey_links = create_and_activate_surveys(
            texts_to_annotate=texts_to_annotate,
            categories=categories,
            annotation_instruction=annotation_instruction,
            QUALTRICS_API_URL=QUALTRICS_API_URL,
            QUALTRICS_API_KEY=QUALTRICS_API_KEY)
        
        # run the Prolific annotation pipeline
        annotations = run_prolific_annotation_pipeline(
            survey_links=list(survey_links.values()),
            name_prefix=task_title,
            description=task_description,
            reward = reward,  
            estimated_time=estimated_time,
            max_time = maximum_allowed_time,
            HEADERS = HEADERS,
            BASE_URL = BASE_URL,
            QUALTRICS_API_URL = QUALTRICS_API_URL,
            QUALTRICS_API_KEY = QUALTRICS_API_KEY,
            BATCH_TIMEOUT = BATCH_TIMEOUT
        )

    if HUMAN_SOURCE == "MTURK":
        annotations = run_mturk_annotation_pipeline(pd.DataFrame(texts_to_annotate, columns=['Text']),
                                            annotation_instructions = annotation_instructions,
                                            task_title = task_title,
                                            task_description = task_description,
                                            task_reward = task_reward,
                                            minimum_approval_rate = minimum_approval_rate,
                                            minimum_tasks_approved = minimum_tasks_approved,
                                            aws_access_key_id = AWS_ACCESS_KEY_ID,
                                            aws_secret_access_key = AWS_SECRET_ACCESS_KEY)
        
    data.loc[:burnin_steps-1,'human'] = pd.Series(annotations).apply(lambda x: 1 if x.lower()==positive_class.lower() else 0).values
else:
    #load the extisting annotations we already collected
    df['Prediction_human'] = pd.read_csv('data/politeness_dataset.csv').sample(n = N, random_state = random_state)['Prediction_human'].values
    df = df['Prediction_human'].reset_index()

    #initialize the first burnin_steps annotations
    data.loc[:burnin_steps-1,'human'] = df['Prediction_human'].values[:burnin_steps]

### Step 2: Initialize the sampling rule

In [11]:
confidence = data['llm_conf'].to_numpy().reshape((n,1))
confidence_burnin = confidence[:burnin_steps]
H = data['human'].to_numpy()
H_burnin = H[:burnin_steps]
Hhat = data['llm'].to_numpy()
Hhat_burnin = Hhat[:burnin_steps]
SP = np.zeros(n)
SD = np.zeros(n)
SP[:burnin_steps] = np.ones(burnin_steps)
SD[:burnin_steps] = np.ones(burnin_steps)
sampling_rule = train_sampling_rule(confidence_burnin, (H_burnin - Hhat_burnin)**2) # trains XGBoost model
sampling_probs_unnormed = np.clip(np.sqrt(sampling_rule_predict(sampling_rule, confidence)), 1e-4, 1)
avg_sampling_probs = np.mean(sampling_probs_unnormed)
frac_human_adjusted = (n_human - burnin_steps)/(n - burnin_steps) # remove burnin_steps samples from available budget

### Step 3: In batches, strategically sample texts for human annotation

In [12]:
batch_size = (n - burnin_steps)//n_batches
for b in range(n_batches):
    if b < (n_batches - 1):
        batch_inds = np.array(range(burnin_steps + b*batch_size, burnin_steps + (b+1)*batch_size))
    else:
        batch_inds = np.array(range(burnin_steps + b*batch_size, n))
        
    sampling_probs = sampling_probs_unnormed[batch_inds]/avg_sampling_probs*frac_human_adjusted
    sampling_probs = np.clip((1-tau)*sampling_probs + tau*frac_human_adjusted, 0, 1)

    if np.isnan(sampling_probs).all():
        print(f"Training the model failed at batch {b+1}/{n_batches}... Quitting.")
        break
        
    labeling_decisions = bernoulli.rvs(sampling_probs)
    indices_to_label = batch_inds[np.where(labeling_decisions)]

    print()
    print(f"Collecting batch {b+1}/{n_batches}...")
    if COLLECT_HUMAN:
        texts_to_annotate = list(data.loc[indices_to_label,'text'].values)

        if HUMAN_SOURCE == "Prolific":
            # create Qualtrics annotation interface and get annotation task URLs
            survey_links = create_and_activate_surveys(
                texts_to_annotate=texts_to_annotate,
                categories=categories,
                annotation_instruction=annotation_instruction,
                QUALTRICS_API_URL=QUALTRICS_API_URL,
                QUALTRICS_API_KEY=QUALTRICS_API_KEY)
            
            # run the Prolific annotation pipeline
            annotations = run_prolific_annotation_pipeline(
                survey_links=list(survey_links.values()),
                name_prefix=task_title,
                description=task_description,
                reward = reward,  
                estimated_time=estimated_time,
                max_time = maximum_allowed_time,
                HEADERS = HEADERS,
                BASE_URL = BASE_URL,
                QUALTRICS_API_URL = QUALTRICS_API_URL,
                QUALTRICS_API_KEY = QUALTRICS_API_KEY,
                BATCH_TIMEOUT = BATCH_TIMEOUT
            )

        if HUMAN_SOURCE == "MTURK":
            annotations = run_mturk_annotation_pipeline(pd.DataFrame(texts_to_annotate, columns=['Text']),
                                            annotation_instructions = annotation_instructions,
                                            task_title = task_title,
                                            task_description = task_description,
                                            task_reward = task_reward,
                                            minimum_approval_rate = minimum_approval_rate,
                                            minimum_tasks_approved = minimum_tasks_approved,
                                            aws_access_key_id = AWS_ACCESS_KEY_ID,
                                            aws_secret_access_key = AWS_SECRET_ACCESS_KEY)

        H[indices_to_label] = pd.Series(annotations).apply(lambda x: 1 if x.lower()==positive_class.lower() else 0).values
    else: 
        H[indices_to_label] = df['Prediction_human'].iloc[list(indices_to_label)]
        print(f"Collecting {len(df['Prediction_human'].iloc[list(indices_to_label)])} human annotations.") 

    collected_inds = np.where(labeling_decisions)

    SP[batch_inds] = sampling_probs
    SD[batch_inds] = labeling_decisions
    
    if b < (n_batches - 1):
        sampling_rule = train_sampling_rule(confidence[collected_inds], (H[collected_inds] - Hhat[collected_inds])**2)
        sampling_probs_unnormed = np.clip(np.sqrt(sampling_rule_predict(sampling_rule, confidence)), 1e-4, 1)
        avg_sampling_probs = np.mean(sampling_probs_unnormed)


Collecting batch 1/4...
Collecting 33 human annotations.

Collecting batch 2/4...
Collecting 43 human annotations.

Collecting batch 3/4...
Collecting 38 human annotations.

Collecting batch 4/4...
Collecting 41 human annotations.


In [13]:
data.loc[list(collected_inds[0]),'human'] = H[list(collected_inds)][0]
data['sampling_probs'] = SP
data['sampling_decisions'] = SD
print(f"{len(data.dropna(subset = ['human']))} human datapoints collected in total.")

205 human datapoints collected in total.


### Step 4: Compute the estimate and confidence interval
#### We showcase estimation of two target statistics:
1. $mean(Y)$: prevalence of the politeness, i.e., the fraction of texts in the corpus that are polite
2. $\beta_{hedge}$: the impact of linguistic features of hedging ($X$) on the perceived politeness ($Y$), estimated with a logistic regression

In [14]:
#define the estimator function
#mask for valid labels and specify how to use weights

def mean_estimator(y, weights):
    y, weights = y[~np.isnan(y)], weights[~np.isnan(y)]
    return np.sum(y * weights) / np.sum(weights)

def log_reg_estimator(X, y, weights):
    X, y, weights = X[~np.isnan(y)], y[~np.isnan(y)], weights[~np.isnan(y)]
    return LogisticRegression(solver="liblinear").fit(X, y, sample_weight=weights).coef_[0, 0]

#### $mean(Y)$ estimation

In [15]:
lower_bound, upper_bound = confidence_driven_inference(
    estimator = mean_estimator,
    Y = data['human'].values,
    Yhat = data['llm'].values,
    sampling_probs =  data['sampling_probs'].values,
    sampling_decisions = data['sampling_decisions'].values)

print("CDI estimate of the target statistic (mean(Y): prevalence of politeness):")
print(lower_bound.round(4), upper_bound.round(4))

CDI estimate of the target statistic (mean(Y): prevalence of politeness):
0.3993 0.5256


In [16]:
print("Ground truth mean(Y) estimate (if we had access to human annotations on the full text corpus):")
print(np.mean(pd.read_csv('data/politeness_dataset.csv').sample(n = N, random_state = random_state)['Prediction_human'].values))

Ground truth mean(Y) estimate (if we had access to human annotations on the full text corpus):
0.488


#### $\beta_{hedge}$ estimation

In [17]:
lower_bound, upper_bound = confidence_driven_inference(
    estimator = log_reg_estimator,
    Y = data['human'].values,
    Yhat = data['llm'].values,
    X = data['X'].values.reshape(-1, 1),
    sampling_probs =  data['sampling_probs'].values,
    sampling_decisions = data['sampling_decisions'].values)

print("CDI estimate of the target statistic (β: effect of hedging on politeness):")
print(lower_bound.round(4), upper_bound.round(4))

CDI estimate of the target statistic (β: effect of hedging on politeness):
-0.0749 1.1171


In [18]:
print("Ground truth β estimate (if we had access to human annotations on the full text corpus):")
y = pd.read_csv('data/politeness_dataset.csv').sample(n = N, random_state = random_state)['Prediction_human'].values
X = pd.read_csv('data/politeness_dataset.csv').sample(n = N, random_state = random_state)[text_based_feature].values

print(round(LogisticRegression(solver="liblinear").fit(X.reshape(-1, 1),y).coef_[0, 0],4))

Ground truth β estimate (if we had access to human annotations on the full text corpus):
0.2467


In [19]:
data.head()

Unnamed: 0,human,llm,llm_conf,X,text,sampling_probs,sampling_decisions
0,1.0,1,0.95,1,thank you for that. Do you think it is possibl...,1.0,1.0
1,0.0,0,0.1,0,Notability is an issue of common sense: you're...,1.0,1.0
2,0.0,0,0.2,1,Too vague - explain what you expect the conten...,1.0,1.0
3,0.0,1,0.65,0,I am the third party; <url> was the first to d...,1.0,1.0
4,1.0,1,0.95,0,Interesting problem. What have you tried so f...,1.0,1.0


In [20]:
elapsed_minutes = (time.time() - start_time_data_collection) / 60
print(f"Total data collection time: {elapsed_minutes:.2f} minutes.")

Total data collection time: 0.13 minutes.
