## Now, let's collect some fresh LLM annotations & human annotations (by annotating ourselves)!

### We need to specify the following parameters below, in code:

### Changes to the parameters are the following:
1. COLLECT_LLM = True
2. COLLECT_HUMAN = True

### We will also reduce the number of samples so that the annotation doesn't take long:
1. N = 100
2. burnin_steps = 5
3. n_batches = 2
4. n_human = 15
5. sleep = 0.1

### And, we will load the OpenAI key and the MTurk keys:
- If you don't have those set up, just follow along!
- Otherwise, fill in these three keys:
    1. OPENAI_API_KEY = ""
    2. AWS_ACCESS_KEY_ID = ""
    3. AWS_SECRET_ACCESS_KEY = ""

In [1]:
def load_credentials(file_path="credentials.txt"):
    """Read ``KEY=value`` pairs from a plain-text credentials file.

    Every line is split on its first ``=`` and the whitespace around the key
    and the value is stripped. Blank lines, lines without ``=`` and lines
    whose first non-blank character is ``#`` are skipped. One pair of matching
    single or double quotes around a value is removed, so ``KEY = "abc"`` and
    ``KEY=abc`` load the same value.

    Args:
        file_path: Path of the credentials file to read.

    Returns:
        dict mapping each key to its string value. When a key occurs more
        than once, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened (for example, it does not exist).
    """
    credentials = {}
    # "utf-8-sig" drops a leading byte-order mark, which would otherwise become
    # part of the first key and make the later lookup of that key fail.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, value = (part.strip() for part in line.split("=", 1))
            # Quotes written around a value are file syntax, not part of the secret.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                value = value[1:-1]
            if key:
                credentials[key] = value
    return credentials

# Read the KEY=value pairs from the default credentials file.
creds = load_credentials()

# Pull out the three keys used further down; dict.get yields None for any
# key that is absent from the file.
AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, OPENAI_API_KEY = map(
    creds.get,
    ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "OPENAI_API_KEY"),
)

### Import libraries

In [2]:
import numpy as np
from scipy.stats import norm, bernoulli
import pprint
import pandas as pd
from tqdm import tqdm
from tqdm.notebook import tqdm
import time
from ppi_py.utils import bootstrap
import re
import openai
import requests
import json
from datetime import datetime, timezone
import time
import zipfile
import io
import random
from sklearn.linear_model import LogisticRegression
%load_ext autoreload
%autoreload 2

In [3]:
from utils import llms, qualtrics, prolific, mturk, inference
from utils.llms import annotate_texts_with_llm, collect_llm_confidence
from utils.qualtrics import create_and_activate_surveys
from utils.prolific import run_prolific_annotation_pipeline
from utils.mturk import run_mturk_annotation_pipeline
from utils.inference import train_sampling_rule, sampling_rule_predict, confidence_driven_inference

### Set parameters for Confidence-Driven Inference (CDI)

In [4]:
# Data-collection switches: True collects fresh annotations from scratch,
# False loads the pre-collected ones instead.
COLLECT_HUMAN = True
COLLECT_LLM = True

# Crowdsourcing platform used when COLLECT_HUMAN is True: "Prolific" or "MTURK".
HUMAN_SOURCE = "MTURK"

# Corpus size, i.e. the size of the random corpus subset that is annotated
# with an LLM, and the seed used to draw that subset.
N = 100
random_state = 42

# Human-annotation budget (the warmup samples count towards it).
n_human = 15
# Number of warmup samples (initial set) used to initialize the sampling rule.
burnin_steps = 5
# Number of batches the remaining texts are split into.
n_batches = 2

# Desired error level for the confidence interval.
alpha = 0.1
# Weight of the uniform-sampling component mixed in for increased stability.
tau = 0.1

### Set parameters for LLM annotation

In [5]:
# Instruction sent to the LLM; the text to annotate is appended after "Text: ".
prompt = """Is the following text polite? Output either A or B. Output a letter only.
A) Polite
B) Impolite

Text: """

# Answer letter -> category name.
mapping_categories = dict(A="Polite", B="Impolite")

# Category that is numerically coded as 1.
positive_class = mapping_categories["A"]

# Model and request settings handed to the LLM helpers
# (sleep is presumably the pause between requests -- confirm in utils.llms).
temperature = 0
sleep = 0.1
model = "gpt-4o-mini-2024-07-18"

### Set parameters for human annotation

In [6]:
## Task parameters
task_title = "Is the following text polite?"
annotation_instruction = "Is the following text polite?"
task_description = "Please annotate the politeness of the provided text. This is a real-time study where we're hoping to get immediate annotations."
categories = [
    "Polite",
    "Impolite"
]

## API access for the Prolific route (only needed when HUMAN_SOURCE = "Prolific")
PROLIFIC_API_KEY = ""
# The Prolific branches pass these two names to the Qualtrics/Prolific helpers.
# They were not defined anywhere in the notebook, so selecting "Prolific"
# failed with a NameError; fill them in before using that route.
QUALTRICS_API_URL = ""
QUALTRICS_API_KEY = ""

## Additional Prolific settings
BASE_URL = "https://api.prolific.com/api/v1"
HEADERS = {
    "Authorization": f"Token {PROLIFIC_API_KEY}",
    "Content-Type": "application/json",
}
reward = 80
estimated_time = 1
maximum_allowed_time = 30 # How long a single annotator can take
BATCH_TIMEOUT = 30 # How long we'll wait for the responses (in minutes) before moving on (replaces not collected annotations with np.nan)

## Additional MTURK settings
task_reward = '0.8' 
minimum_approval_rate = 99 # >98% recommended
minimum_tasks_approved = 0 # >5000 recommended
# NOTE(review): "options" is a set, so the order in which the categories are
# iterated is not fixed -- confirm the MTurk helper does not depend on it.
annotation_instructions = {"question": task_description,
    "options": set(categories)}

### Step 1: Load the texts and collect LLM annotations

#### Texts will be annotated for their politeness. We will showcase the estimation of the target statistic:
Prevalence of politeness, $\mathrm{mean}(Y)$, i.e., the fraction of texts in the corpus that are polite.

In [7]:
# Wall-clock reference point for the total data-collection time reported at the end.
start_time_data_collection = time.time()

# Name of the column holding the text-based feature used for inference
# (the presence of hedging in this example).
text_based_feature = 'Feature_3'

# Keep the two columns we need -- the texts and the feature -- and draw a
# reproducible random subset of N rows from the corpus.
df = (
    pd.read_csv('data/politeness_dataset.csv')
    .loc[:, ['Text', text_based_feature]]
    .sample(n=N, random_state=random_state)
)

In [8]:
# Working table with one row per sampled text. The annotation columns start
# out as NaN and are filled in by the later steps.
n = len(df)
data = pd.DataFrame()
for _empty_column in ('human', 'llm', 'llm_conf'):
    data[_empty_column] = np.full(n, np.nan)

# Copy feature and text by position (df still carries the sampled row labels).
data['X'] = df[text_based_feature].values
data['text'] = df['Text'].values

In [9]:
if COLLECT_LLM:
    # Collect the LLM annotation for every text in the working table.
    llm_annotated = annotate_texts_with_llm(texts = data['text'].values,
                                       model = model,
                                       prompt = prompt,
                                       mapping_categories = mapping_categories,
                                       sleep = sleep,
                                       temperature = temperature,
                                       OPENAI_API_KEY = OPENAI_API_KEY)
    # Collect the verbalized confidence for those annotations.
    llm_annotated_with_confidence = collect_llm_confidence(sample_texts = llm_annotated,
                                       model = model,
                                       sleep = sleep,
                                       temperature = temperature,
                                       OPENAI_API_KEY = OPENAI_API_KEY)

    # Code the positive class as 1 and everything else as 0.
    data['llm'] = llm_annotated_with_confidence['LLM_annotation'].apply(lambda x: 1 if x.lower()==positive_class.lower() else 0).values
    # Assign by position, exactly like 'llm' above. Assigning the Series itself
    # aligns on its index and would silently insert NaN wherever the helper's
    # index differs from data's 0..n-1 index.
    data['llm_conf'] = llm_annotated_with_confidence['confidence_in_prediction'].values
else:
    # Load the existing annotations we already collected. The file is read and
    # sampled once; the same n / random_state as for df select the same rows.
    precollected = pd.read_csv('data/politeness_dataset.csv').sample(n = N, random_state = random_state)
    df['Prediction_gpt-4o'] = precollected['Prediction_gpt-4o'].values
    df['Confidence_gpt-4o'] = precollected['Confidence_gpt-4o'].values
    data['llm'] = df['Prediction_gpt-4o'].apply(lambda x: 1 if x.lower()==positive_class.lower() else 0).values
    data['llm_conf'] = df['Confidence_gpt-4o'].values

Collecting LLM annotations:   0%|          | 0/100 [00:00<?, ? annotation/s]

Collecting LLM confidence:   0%|          | 0/100 [00:00<?, ? annotation/s]

### Step 2: Collect human annotations for warmup samples (initial set)

In [10]:
if COLLECT_HUMAN:
    # The warmup (initial) set is simply the first burnin_steps texts.
    texts_to_annotate = list(data.loc[:burnin_steps-1,'text'].values)

    if HUMAN_SOURCE == "Prolific":
    
        # create Qualtrics annotation interface and get annotation task URLs
        survey_links = create_and_activate_surveys(
            texts_to_annotate=texts_to_annotate,
            categories=categories,
            annotation_instruction=annotation_instruction,
            QUALTRICS_API_URL=QUALTRICS_API_URL,
            QUALTRICS_API_KEY=QUALTRICS_API_KEY)
        
        # run the Prolific annotation pipeline
        annotations = run_prolific_annotation_pipeline(
            survey_links=list(survey_links.values()),
            name_prefix=task_title,
            description=task_description,
            reward = reward,  
            estimated_time=estimated_time,
            max_time = maximum_allowed_time,
            HEADERS = HEADERS,
            BASE_URL = BASE_URL,
            QUALTRICS_API_URL = QUALTRICS_API_URL,
            QUALTRICS_API_KEY = QUALTRICS_API_KEY,
            BATCH_TIMEOUT = BATCH_TIMEOUT
        )

    elif HUMAN_SOURCE == "MTURK":
        annotations = run_mturk_annotation_pipeline(pd.DataFrame(texts_to_annotate, columns=['Text']),
                                            annotation_instructions = annotation_instructions,
                                            task_title = task_title,
                                            task_description = task_description,
                                            task_reward = task_reward,
                                            minimum_approval_rate = minimum_approval_rate,
                                            minimum_tasks_approved = minimum_tasks_approved,
                                            aws_access_key_id = AWS_ACCESS_KEY_ID,
                                            aws_secret_access_key = AWS_SECRET_ACCESS_KEY)

    else:
        # With two independent ifs an unrecognised value fell through and only
        # surfaced as a NameError on `annotations`; fail with a clear message.
        raise ValueError(f'Unknown HUMAN_SOURCE {HUMAN_SOURCE!r}; expected "Prolific" or "MTURK".')

    # Code the positive class as 1 and everything else as 0.
    # NOTE(review): the BATCH_TIMEOUT comment says uncollected annotations are
    # replaced with np.nan; x.lower() would raise on such a value -- confirm
    # how missing annotations should be handled here.
    data.loc[:burnin_steps-1,'human'] = pd.Series(annotations).apply(lambda x: 1 if x.lower()==positive_class.lower() else 0).values
else:
    # Load the existing annotations we already collected, for the same sampled rows.
    df['Prediction_human'] = pd.read_csv('data/politeness_dataset.csv').sample(n = N, random_state = random_state)['Prediction_human'].values
    # From here on df only holds the old row label ('index') and 'Prediction_human',
    # re-indexed 0..n-1 so positions match the rows of data.
    df = df['Prediction_human'].reset_index()

    # Initialize the first warmup samples (initial set).
    data.loc[:burnin_steps-1,'human'] = df['Prediction_human'].values[:burnin_steps]

Publishing anntotation tasks:


  0%|          | 0/5 [00:00<?, ?it/s]

You can preview the hits here:
https://workersandbox.mturk.com/mturk/preview?groupId=3SFRD5IA6L8IW7JM82P5P2CPSXPFB1


Completed HITs:   0%|          | 0/5 [00:00<?, ?HIT/s]

All HITs are completed.


### Step 2: Initialize the sampling rule

In [11]:
# Per-text arrays used by the sampling procedure: LLM confidence as an (n, 1)
# column, human labels H (NaN where not collected) and LLM labels Hhat.
# NOTE(review): to_numpy() may hand back a view of the column, in which case
# later in-place writes to H also show up in data -- confirm before adding a copy.
confidence = data['llm_conf'].to_numpy().reshape((n,1))
H = data['human'].to_numpy()
Hhat = data['llm'].to_numpy()

# Warmup slices: the first burnin_steps entries of each array.
confidence_burnin, H_burnin, Hhat_burnin = (
    arr[:burnin_steps] for arr in (confidence, H, Hhat)
)

# SP holds each text's sampling probability, SD the 0/1 labeling decision.
# Warmup texts are always labeled, so both are 1 for them.
SP = np.zeros(n)
SD = np.zeros(n)
SP[:burnin_steps] = SD[:burnin_steps] = np.ones(burnin_steps)

# Fit the sampling rule on the warmup set: LLM confidence -> squared error of
# the LLM label (the original note says this trains an XGBoost model).
sampling_rule = train_sampling_rule(confidence_burnin, (H_burnin - Hhat_burnin)**2)

# Unnormalized sampling probabilities for all texts, kept inside [1e-4, 1].
sampling_probs_unnormed = np.sqrt(sampling_rule_predict(sampling_rule, confidence)).clip(1e-4, 1)
avg_sampling_probs = sampling_probs_unnormed.mean()

# Fraction of the non-warmup texts the remaining human budget can cover.
frac_human_adjusted = (n_human - burnin_steps)/(n - burnin_steps)

### Step 3: In batches, strategically sample texts for human annotation

In [12]:
# Texts after the warmup set are processed in n_batches consecutive batches.
# For each batch: turn the sampling rule's predictions into labeling
# probabilities, draw the labeling decisions, collect the human labels, and
# (except after the last batch) refit the sampling rule.
batch_size = (n - burnin_steps)//n_batches
for b in range(n_batches):
    # Corpus positions of this batch; the last batch absorbs the remainder.
    # np.arange keeps an integer dtype even when the range is empty.
    if b < (n_batches - 1):
        batch_inds = np.arange(burnin_steps + b*batch_size, burnin_steps + (b+1)*batch_size)
    else:
        batch_inds = np.arange(burnin_steps + b*batch_size, n)
        
    # Rescale so the average probability matches the remaining budget, then
    # mix with uniform sampling (weight tau) and clip to a valid probability.
    sampling_probs = sampling_probs_unnormed[batch_inds]/avg_sampling_probs*frac_human_adjusted
    sampling_probs = np.clip((1-tau)*sampling_probs + tau*frac_human_adjusted, 0, 1)

    if np.isnan(sampling_probs).all():
        print(f"Training the model failed at batch {b+1}/{n_batches}... Quitting.")
        break
        
    # One Bernoulli draw per text in the batch; map the hits back to corpus positions.
    labeling_decisions = bernoulli.rvs(sampling_probs)
    indices_to_label = batch_inds[np.where(labeling_decisions)]

    print()
    print(f"Collecting batch {b+1}/{n_batches}...")
    if COLLECT_HUMAN:
        texts_to_annotate = list(data.loc[indices_to_label,'text'].values)

        if HUMAN_SOURCE == "Prolific":
            # create Qualtrics annotation interface and get annotation task URLs
            survey_links = create_and_activate_surveys(
                texts_to_annotate=texts_to_annotate,
                categories=categories,
                annotation_instruction=annotation_instruction,
                QUALTRICS_API_URL=QUALTRICS_API_URL,
                QUALTRICS_API_KEY=QUALTRICS_API_KEY)
            
            # run the Prolific annotation pipeline
            annotations = run_prolific_annotation_pipeline(
                survey_links=list(survey_links.values()),
                name_prefix=task_title,
                description=task_description,
                reward = reward,  
                estimated_time=estimated_time,
                max_time = maximum_allowed_time,
                HEADERS = HEADERS,
                BASE_URL = BASE_URL,
                QUALTRICS_API_URL = QUALTRICS_API_URL,
                QUALTRICS_API_KEY = QUALTRICS_API_KEY,
                BATCH_TIMEOUT = BATCH_TIMEOUT
            )

        elif HUMAN_SOURCE == "MTURK":
            annotations = run_mturk_annotation_pipeline(pd.DataFrame(texts_to_annotate, columns=['Text']),
                                            annotation_instructions = annotation_instructions,
                                            task_title = task_title,
                                            task_description = task_description,
                                            task_reward = task_reward,
                                            minimum_approval_rate = minimum_approval_rate,
                                            minimum_tasks_approved = minimum_tasks_approved,
                                            aws_access_key_id = AWS_ACCESS_KEY_ID,
                                            aws_secret_access_key = AWS_SECRET_ACCESS_KEY)

        else:
            # An unrecognised value used to fall through to a NameError (or reuse
            # the previous batch's annotations); fail with a clear message.
            raise ValueError(f'Unknown HUMAN_SOURCE {HUMAN_SOURCE!r}; expected "Prolific" or "MTURK".')

        # Code the positive class as 1 and everything else as 0.
        H[indices_to_label] = pd.Series(annotations).apply(lambda x: 1 if x.lower()==positive_class.lower() else 0).values
    else: 
        # Look the labels up in the pre-collected human annotations.
        H[indices_to_label] = df['Prediction_human'].iloc[list(indices_to_label)].values
        print(f"Collecting {len(indices_to_label)} human annotations.") 

    SP[batch_inds] = sampling_probs
    SD[batch_inds] = labeling_decisions

    # Corpus positions of every text labeled so far (warmup set plus all batches
    # up to this one), in np.where's 1-tuple form. np.where(labeling_decisions)
    # yields positions *within the batch*; using those to index the corpus-wide
    # confidence / H / Hhat arrays selected the wrong, mostly unlabeled rows.
    collected_inds = np.where(SD == 1)
    
    if b < (n_batches - 1):
        # Refit the sampling rule on the labeled texts and refresh the
        # unnormalized probabilities used by the next batch.
        sampling_rule = train_sampling_rule(confidence[collected_inds], (H[collected_inds] - Hhat[collected_inds])**2)
        sampling_probs_unnormed = np.clip(np.sqrt(sampling_rule_predict(sampling_rule, confidence)), 1e-4, 1)
        avg_sampling_probs = np.mean(sampling_probs_unnormed)


Collecting batch 1/2...
Publishing anntotation tasks:


  0%|          | 0/4 [00:00<?, ?it/s]

You can preview the hits here:
https://workersandbox.mturk.com/mturk/preview?groupId=3SFRD5IA6L8IW7JM82P5P2CPSXPFB1


Completed HITs:   0%|          | 0/4 [00:00<?, ?HIT/s]

All HITs are completed.

Collecting batch 2/2...
Publishing anntotation tasks:


  0%|          | 0/5 [00:00<?, ?it/s]

You can preview the hits here:
https://workersandbox.mturk.com/mturk/preview?groupId=3SFRD5IA6L8IW7JM82P5P2CPSXPFB1


Completed HITs:   0%|          | 0/5 [00:00<?, ?HIT/s]

All HITs are completed.


In [13]:
# H holds every human label collected (warmup set and all batches) and NaN
# everywhere else, so copy the whole vector back. Writing only the positions in
# collected_inds covers the last batch alone, and those positions are relative
# to that batch rather than to the corpus; the total was then only right when
# H happened to share memory with the 'human' column.
data['human'] = H
data['sampling_probs'] = SP
data['sampling_decisions'] = SD
print(f"{len(data.dropna(subset = ['human']))} human datapoints collected in total.")

14 human datapoints collected in total.


In [14]:
# Show only the rows that received a human label.
data[data['human'].notna()]

Unnamed: 0,human,llm,llm_conf,X,text,sampling_probs,sampling_decisions
0,1.0,1,0.8,1,thank you for that. Do you think it is possibl...,1.0,1.0
1,0.0,0,0.2,0,Notability is an issue of common sense: you're...,1.0,1.0
2,0.0,0,0.3,1,Too vague - explain what you expect the conten...,1.0,1.0
3,0.0,0,0.2,0,I am the third party; <url> was the first to d...,1.0,1.0
4,1.0,1,0.9,0,Interesting problem. What have you tried so f...,1.0,1.0
10,1.0,1,0.9,1,Thanks for approving my request. Should I remo...,0.105263,1.0
39,1.0,1,0.7,0,What do you mean by MobileMail? The Mail app o...,0.105263,1.0
42,0.0,0,0.7,0,Please see our ongoing discussion <url>. Is th...,0.105263,1.0
51,0.0,0,0.7,0,"You really want off here, dont you? I've alrea...",0.105263,1.0
58,0.0,0,0.3,1,So who exactly is going to *teach* this topic ...,0.105263,1.0


### Step 4: Compute the estimate and confidence interval
#### We showcase estimation of $\mathrm{mean}(Y)$: the prevalence of politeness, i.e., the fraction of texts in the corpus that are polite

In [15]:
# Estimator handed to the inference routine: a weighted mean over the entries
# of y that are not NaN.

def mean_estimator(y, weights):
    """Return the weighted mean of the non-NaN entries of ``y``.

    Args:
        y: Array of values; NaN marks entries to leave out.
        weights: Array of per-entry weights, indexed like ``y``.

    Returns:
        ``sum(y * weights) / sum(weights)`` computed over the entries where
        ``y`` is not NaN.
    """
    observed = np.logical_not(np.isnan(y))
    kept_values = y[observed]
    kept_weights = weights[observed]
    return np.sum(kept_values * kept_weights) / np.sum(kept_weights)

In [16]:
# Run confidence-driven inference with the weighted-mean estimator on the
# human labels, LLM labels, sampling probabilities and labeling decisions.
# It returns the point estimate and the bounds of the confidence interval.
estimate, (lower_bound, upper_bound) = confidence_driven_inference(
    estimator=mean_estimator,
    Y=data['human'].to_numpy(),
    Yhat=data['llm'].to_numpy(),
    sampling_probs=data['sampling_probs'].to_numpy(),
    sampling_decisions=data['sampling_decisions'].to_numpy(),
    alpha=alpha,
)

# Report the results rounded to four decimals.
print("CDI estimate of the target statistic (mean(Y): prevalence of politeness):")
print('point estimate:', estimate.round(4))
print('confidence intervals:', lower_bound.round(4), upper_bound.round(4))

CDI estimate of the target statistic (mean(Y): prevalence of politeness):
point estimate: 0.5793
confidence intervals: 0.4963 0.6624


In [17]:
# For comparison: the mean of the pre-collected human labels over the same
# N sampled rows (same n and random_state as the corpus subset).
print("Ground truth mean(Y) estimate (if we had access to human annotations on the full text corpus):")
print(
    pd.read_csv('data/politeness_dataset.csv')
    .sample(n=N, random_state=random_state)['Prediction_human']
    .values
    .mean()
)

Ground truth mean(Y) estimate (if we had access to human annotations on the full text corpus):
0.58


In [18]:
# Peek at the first five rows of the working table.
data.head(n=5)

Unnamed: 0,human,llm,llm_conf,X,text,sampling_probs,sampling_decisions
0,1.0,1,0.8,1,thank you for that. Do you think it is possibl...,1.0,1.0
1,0.0,0,0.2,0,Notability is an issue of common sense: you're...,1.0,1.0
2,0.0,0,0.3,1,Too vague - explain what you expect the conten...,1.0,1.0
3,0.0,0,0.2,0,I am the third party; <url> was the first to d...,1.0,1.0
4,1.0,1,0.9,0,Interesting problem. What have you tried so f...,1.0,1.0


In [19]:
# Seconds elapsed since the timer was started in the data-loading cell,
# converted to minutes for the report.
elapsed_minutes = time.time() - start_time_data_collection
elapsed_minutes /= 60
print(f"Total data collection time: {elapsed_minutes:.2f} minutes.")

Total data collection time: 6.09 minutes.
