### Import libraries

In [1]:
import numpy as np
from scipy.stats import norm, bernoulli
import pprint
import pandas as pd
from tqdm import tqdm
from tqdm.notebook import tqdm
import time
from ppi_py.utils import bootstrap
import re
import openai
import requests
import json
from datetime import datetime, timezone
import time
import zipfile
import io
import random
from sklearn.linear_model import LogisticRegression
%load_ext autoreload
%autoreload 2
    
from utils import llms, qualtrics, prolific, mturk, inference
from utils.llms import annotate_texts_with_llm, collect_llm_confidence, get_llm_annotations
from utils.qualtrics import create_and_activate_surveys
from utils.prolific import run_prolific_annotation_pipeline
from utils.mturk import run_mturk_annotation_pipeline
from utils.inference import train_sampling_rule, sampling_rule_predict, confidence_driven_inference, collect_initial_human_annotations, run_adaptive_sampling

### Setup credentials

In [11]:
def load_credentials(file_path="credentials.txt"):
    """Read ``KEY=VALUE`` pairs from a plain-text credentials file.

    Each line is split on its first ``=``; surrounding whitespace is removed
    from both the key and the value. Blank lines, lines without ``=``,
    comment lines starting with ``#`` and lines with an empty key are ignored.
    If a key appears more than once, the last occurrence wins.

    Args:
        file_path: Path to the credentials file.

    Returns:
        dict mapping each key (str) to its value (str).

    Raises:
        OSError: If the file cannot be opened (e.g. FileNotFoundError).
    """
    credentials = {}
    with open(file_path, "r") as f:
        for raw_line in f:
            line = raw_line.strip()
            # Skip blanks and '#' comments so that a commented-out
            # "KEY=value" line is not loaded as a credential.
            if not line or line.startswith("#"):
                continue
            # Split on the first '=' only: values may themselves contain '='.
            key, separator, value = line.partition("=")
            key = key.strip()
            if not separator or not key:
                continue
            credentials[key] = value.strip()
    return credentials

# Read the credentials file once and expose every secret as a module-level
# name; a key that is absent from the file ends up as None.
# (The keys can also be assigned to these names directly in plain text,
# but avoid sharing a notebook that contains them.)
creds = load_credentials()
(
    AWS_ACCESS_KEY_ID,
    AWS_SECRET_ACCESS_KEY,
    OPENAI_API_KEY,
    QUALTRICS_API_KEY,
    QUALTRICS_API_URL,
    PROLIFIC_API_KEY,
) = (
    creds.get(credential_name)
    for credential_name in (
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "OPENAI_API_KEY",
        "QUALTRICS_API_KEY",
        "QUALTRICS_API_URL",
        "PROLIFIC_API_KEY",
    )
)

### Set parameters for Confidence-Driven Inference (CDI)

In [3]:
# --- Data collection switches ---
# True: collect LLM / human annotations from scratch.
# False: load the pre-collected ones.
COLLECT_LLM = COLLECT_HUMAN = False

# Only relevant when COLLECT_HUMAN = True: which platform to use,
# "Prolific" or "MTURK".
HUMAN_SOURCE = "MTURK"

# --- Inference settings ---
# Desired error level for the confidence interval.
alpha = 0.1
# Number of initial points collected to initialize the sampling rule.
burnin_steps = 5
# Number of batches passed to the adaptive sampling step.
n_batches = 2
# Mixing parameter with uniform sampling, for increased stability.
tau = 0.1
# Budget on the number of human annotations (burnin_steps included).
n_human = 15

# --- Corpus settings ---
# Corpus size, or the size of the random subset of the corpus that will be
# annotated with an LLM.
N = 100
random_state = 42

# Define the estimator function: it masks out missing labels and specifies
# how the weights enter the estimate.
def mean_estimator(y, weights):
    """Weighted mean of ``y``, ignoring entries where ``y`` is NaN.

    Args:
        y: Array-like of numeric labels (ndarray, list, pandas Series, ...);
            NaN marks a missing label.
        weights: Array-like of weights, positionally aligned with ``y``.

    Returns:
        ``sum(y * weights) / sum(weights)`` computed over the positions where
        ``y`` is not NaN. The result is NaN when no valid label remains or the
        remaining weights sum to zero.
    """
    # Coerce to float arrays so lists and other array-likes support the
    # boolean-mask indexing below.
    y = np.asarray(y, dtype=float)
    weights = np.asarray(weights, dtype=float)

    # Build the mask once from the original labels and apply it to both arrays.
    valid = ~np.isnan(y)
    y, weights = y[valid], weights[valid]
    return np.sum(y * weights) / np.sum(weights)

# The corpus must provide two columns: the texts and the text-based feature
# used for inference. In this example the feature is the presence of hedging,
# stored in the "Feature_3" column.
text_based_feature = 'Feature_3'

# Read the corpus, keep the two needed columns and draw a reproducible random
# subset of N rows.
df = (
    pd.read_csv('data/politeness_dataset.csv')[['Text', text_based_feature]]
    .sample(n=N, random_state=random_state)
)

### Set parameters for LLM annotation

In [4]:
# Prompt for the LLM; it ends with "Text: " (presumably the text to annotate
# is appended by the annotation helper -- confirm in utils.llms).
prompt = (
    "Is the following text polite? Output either A or B. Output a letter only.\n"
    "A) Polite\n"
    "B) Impolite\n"
    "\n"
    "Text: "
)

# Category that is numerically coded as 1.
positive_class = 'Polite'

# Answer letter -> category name.
mapping_categories = dict(A="Polite", B="Impolite")

# LLM request settings.
model = "gpt-4o-mini-2024-07-18"
sleep = 0.1
temperature = 0

### Set parameters for human annotation

In [5]:
## Task parameters
task_title = "Is the following text polite?"
annotation_instruction = "Is the following text polite?"
task_description = "Please annotate the politeness of the provided text. This is a real-time study where we're hoping to get immediate annotations."
categories = ["Polite", "Impolite"]

## Additional Prolific settings
BASE_URL = "https://api.prolific.com/api/v1"
HEADERS = {"Authorization": f"Token {PROLIFIC_API_KEY}", "Content-Type": "application/json"}
reward = 80
estimated_time = 1
# How long a single annotator can take.
maximum_allowed_time = 30
# How long we wait for the responses (in minutes) before moving on; annotations
# not collected by then are replaced with np.nan.
BATCH_TIMEOUT = 30

## Additional MTURK settings
task_reward = '0.8'
minimum_approval_rate = 99  # >98% recommended
minimum_tasks_approved = 0  # >5000 recommended
# NOTE(review): "options" is a set, so the order of the categories is not
# guaranteed -- confirm that the MTurk helper does not rely on it.
annotation_instructions = dict(question=task_description, options=set(categories))

# Everything the human-annotation helpers need, bundled in a single dict.
human_annotation_parameters = dict(
    categories=categories,
    annotation_instruction=annotation_instruction,
    annotation_instructions=annotation_instructions,
    QUALTRICS_API_URL=QUALTRICS_API_URL,
    QUALTRICS_API_KEY=QUALTRICS_API_KEY,
    task_title=task_title,
    task_description=task_description,
    reward=reward,
    estimated_time=estimated_time,
    maximum_allowed_time=maximum_allowed_time,
    HEADERS=HEADERS,
    BASE_URL=BASE_URL,
    BATCH_TIMEOUT=BATCH_TIMEOUT,
    task_reward=task_reward,
    minimum_approval_rate=minimum_approval_rate,
    minimum_tasks_approved=minimum_tasks_approved,
    AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID,
    AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY,
    positive_class=positive_class,
)

### Step 1: Collect LLM annotations for all the texts

In [6]:
# LLM annotations for the sampled corpus: collected from scratch when
# COLLECT_LLM is True, loaded from pre-collected ones otherwise.
data = get_llm_annotations(
    # corpus
    df=df, text_based_feature=text_based_feature,
    N=N, random_state=random_state,
    # LLM request
    COLLECT_LLM=COLLECT_LLM, OPENAI_API_KEY=OPENAI_API_KEY,
    model=model, prompt=prompt,
    sleep=sleep, temperature=temperature,
    # label coding
    mapping_categories=mapping_categories, positive_class=positive_class,
)

### Step 2: Collect warmup human annotations (initial set)

In [7]:
# Warm-up: the first `burnin_steps` human annotations, used to initialize the
# sampling rule.
data = collect_initial_human_annotations(
    data=data, df=df,
    burnin_steps=burnin_steps, N=N, random_state=random_state,
    COLLECT_HUMAN=COLLECT_HUMAN, HUMAN_SOURCE=HUMAN_SOURCE,
    human_annotation_parameters=human_annotation_parameters,
)

### Step 3: Strategically collect human annotations

In [8]:
# Adaptive collection of the remaining human annotations, in `n_batches`
# batches, within the overall `n_human` budget.
data = run_adaptive_sampling(
    data=data, df=df, n=len(df),
    burnin_steps=burnin_steps, n_human=n_human,
    n_batches=n_batches, tau=tau,
    COLLECT_HUMAN=COLLECT_HUMAN, HUMAN_SOURCE=HUMAN_SOURCE,
    human_annotation_parameters=human_annotation_parameters,
)


Collecting batch 1/2...
Collecting 2 human annotations.

Collecting batch 2/2...
Collecting 4 human annotations.
11 human datapoints collected in total.


### Step 4: Compute the CDI estimate and confidence intervals

We showcase estimation of $\mathrm{mean}(Y)$: the prevalence of politeness, i.e., the fraction of texts in the corpus that are polite.

In [9]:
# Combine the human labels, the LLM labels and the sampling information into
# the CDI point estimate and its confidence interval at error level alpha.
estimate, (lower_bound, upper_bound) = confidence_driven_inference(
    estimator=mean_estimator,
    Y=data['human'].values,
    Yhat=data['llm'].values,
    sampling_probs=data['sampling_probs'].values,
    sampling_decisions=data['sampling_decisions'].values,
    alpha=alpha,
)

print("CDI estimate of the target statistic (mean(Y): prevalence of politeness):")
print('point estimate:', estimate.round(4))
print('confidence intervals:', lower_bound.round(4), upper_bound.round(4))

# Reference value: re-draw the same random subset (same N and random_state)
# and average the human labels stored in the dataset.
print("Ground truth mean(Y) estimate (if we had access to human annotations on the full text corpus):")
print(
    np.mean(
        pd.read_csv('data/politeness_dataset.csv')
        .sample(n=N, random_state=random_state)['Prediction_human']
        .values
    )
)

CDI estimate of the target statistic (mean(Y): prevalence of politeness):
point estimate: 0.6238
confidence intervals: 0.2732 0.8975
Ground truth mean(Y) estimate (if we had access to human annotations on the full text corpus):
0.58
