### This is a non-adaptive version of the tutorial, where we collect all the human annotations in a single batch
- Texts are randomly selected for human annotations, according to the budget
- We then leverage the verbalized confidence scores and limited human annotations for valid statistical inference
- We assume that the human annotations, LLM annotations, and LLM confidence scores have already been collected

### Import libraries

In [1]:
import numpy as np
import pandas as pd
import time
from ppi_py.utils import bootstrap
import re
import random
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
%load_ext autoreload
%autoreload 2
    
from utils import inference
from utils.inference import confidence_driven_inference

### Set your file parameters and needed columns

1. The file path
2. Human labels (textual labels such as "positive" or "negative", or "polite" or "impolite")
3. LLM labels (textual labels such as "positive" or "negative", or "polite" or "impolite")
4. A text-based feature (a binary column: 0 or 1, indicating absence or presence of a textual feature)
5. What class will be considered as positive (the class we are focusing on)

In [2]:
### Point these settings at your own file and column names

# Path of the CSV file that is read with pandas in the cells below
dataset = "data/politeness_dataset_human_incomplete.csv"

# Column holding the human labels
human_labels = "Prediction_human"

# Column holding the LLM labels
llm_labels = "Prediction_gpt-4o"

# Column holding the text-based feature
text_based_feature = "Feature_3"

# Label value treated as the positive class (the class we are focusing on);
# it is compared case-insensitively against the LLM labels when loading
positive_class = "Polite"

### Set parameters for Confidence-Driven Inference (CDI)

In [3]:
# Error level shared by both CDI calls below (passed to them as `alpha`).
# NOTE(review): presumably 0.1 yields 90% confidence intervals — confirm against utils.inference.
alpha = 0.1  # desired error level for the confidence interval

### Step 1: Load the texts and LLM annotations

#### We will showcase the estimation of two target statistics:
1. $\mathrm{mean}(H)$, i.e., the fraction of texts in the corpus that are labeled as the positive class
2. The impact of a text-based feature ($X$) on the positive-class annotation ($H$), estimated with a logistic regression

In [4]:
# Parse the CSV a single time and reuse the frame for every column we need
# (previously the file was re-read from disk for each column).
dataset_df = pd.read_csv(dataset)

# Total number of rows in the dataset
N = len(dataset_df)

# Lower-case the positive class once instead of once per row
positive_label = positive_class.lower()

data = pd.DataFrame({
    # load the text-based feature, copied as-is
    'X': dataset_df[text_based_feature].values,
    # load the existing LLM annotations we already collected, binarized:
    # 1 if the label equals the positive class (case-insensitive), else 0.
    # A non-string entry (e.g. a missing label) still raises, as before,
    # rather than being silently counted as the negative class.
    'llm': dataset_df[llm_labels].apply(
        lambda label: 1 if label.lower() == positive_label else 0
    ).values,
})

### Step 2: Load human annotations (in a single batch)

In [5]:
# Load the existing human annotations we already collected.
# Only the label column is parsed, and it is selected as a 1-D Series
# (the previous double-bracket selection produced an (N, 1) 2-D array).
# NOTE(review): the column is used as-is, and the estimators defined below call
# np.isnan on it, so it must be numeric with NaN marking texts that have no
# human label — confirm this matches your file.
data['human'] = pd.read_csv(dataset, usecols=[human_labels])[human_labels].values

# Boolean mask of the texts that received a human annotation; computed once
# and reused for the count and for the sampling indicator.
has_human_label = ~data['human'].isna()

n_human = int(has_human_label.sum())
print(f"{n_human} human datapoints collected in total.")

# 1 where a human label was collected, 0 otherwise
data['sampling_decisions'] = has_human_label.astype(int)

2000 human datapoints collected in total.


### Step 3: Compute the estimate and confidence interval

In [6]:
#define the estimator function
#mask for valid labels and specify how to use weights

def mean_estimator(y, weights):
    """Return the weighted mean of the non-NaN entries of ``y``.

    Positions where ``y`` is NaN are dropped from both ``y`` and ``weights``
    before the weighted average ``sum(y * w) / sum(w)`` is computed.
    """
    labeled = ~np.isnan(y)
    kept_values = y[labeled]
    kept_weights = weights[labeled]
    weighted_total = np.sum(kept_values * kept_weights)
    return weighted_total / np.sum(kept_weights)

def log_reg_estimator(X, y, weights):
    """Fit a weighted logistic regression of ``y`` on ``X`` and return ``coef_[0, 0]``.

    Rows where ``y`` is NaN are removed from ``X``, ``y`` and ``weights``
    before fitting; the remaining weights are passed as ``sample_weight``.
    """
    labeled = ~np.isnan(y)
    model = LogisticRegression(solver="liblinear")
    model.fit(X[labeled], y[labeled], sample_weight=weights[labeled])
    # coefficient of the first feature column
    return model.coef_[0, 0]

#### $\mathrm{mean}(H)$ estimation

In [7]:
# Run CDI with the weighted-mean estimator.
# Every text is assigned the same sampling probability, 1 / len(data).
# NOTE(review): confirm that 1 / len(data) (rather than the labeled fraction)
# is the scale confidence_driven_inference expects for sampling_probs.
estimate, (lower_bound, upper_bound) = confidence_driven_inference(
    estimator=mean_estimator,
    Y=data['human'].to_numpy(),
    Yhat=data['llm'].to_numpy(),
    sampling_probs=np.ones(len(data)) / len(data),
    sampling_decisions=data['sampling_decisions'].to_numpy(),
    alpha=alpha,
)

# Report the point estimate and interval bounds, rounded to 4 decimals
print("CDI estimate of the target statistic (mean(H)):")
print('point estimate:', estimate.round(4))
print('confidence intervals:', lower_bound.round(4), upper_bound.round(4))

CDI estimate of the target statistic (mean(H)):
point estimate: 0.5133
confidence intervals: 0.4954 0.5303


#### $\beta$ estimation

In [8]:
# Run CDI with the logistic-regression estimator.
# X is reshaped into a single-column 2-D array; every text is assigned the
# same sampling probability, 1 / len(data).
# NOTE(review): confirm that 1 / len(data) (rather than the labeled fraction)
# is the scale confidence_driven_inference expects for sampling_probs.
estimate, (lower_bound, upper_bound) = confidence_driven_inference(
    estimator=log_reg_estimator,
    Y=data['human'].to_numpy(),
    Yhat=data['llm'].to_numpy(),
    X=data['X'].to_numpy().reshape(-1, 1),
    sampling_probs=np.ones(len(data)) / len(data),
    sampling_decisions=data['sampling_decisions'].to_numpy(),
    alpha=alpha,
)

# Report the point estimate and interval bounds, rounded to 4 decimals
print("CDI estimate of the target statistic (β: effect of X on H):")
print('point estimate:', estimate.round(4))
print('confidence intervals:', lower_bound.round(4), upper_bound.round(4))

CDI estimate of the target statistic (β: effect of X on H):
point estimate: 0.4433
confidence intervals: 0.2734 0.5991
