# Two-Agent LLM Classification for Societal Text Data

This notebook implements a two-agent LLM text classification pipeline where a **high-level controller agent** adaptively designs and refines the classification prompt and evaluation plan, and a **worker agent** labels individual texts row-by-row based on the classification prompt. Both agents are powered by the OpenAI API, and this notebook allows users to choose which OpenAI models to use for the controller and the worker.

The pipeline is demonstrated on:
- a Chinese news dataset with labels Challenge vs Source (stance toward foreign news outlets)
- the **EZ-STANCE** dataset for English tweet stance detection (favor, against, none)



## 1. Setup and Configuration

In [None]:
import os
import json
import random
import requests
import time
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
import io
import contextlib
import traceback
import getpass

In [None]:
#data path configuration

#the google.colab package is expected to be available in a Colab runtime
from google.colab import drive
#mount Google Drive under /content/drive so the project folder below is reachable
drive.mount('/content/drive')
#base folder (inside the mounted drive) that holds the project's data files
data_path = '/content/drive/MyDrive/Agentic Labeling System/'

In [None]:
#Cloudflare proxy settings for calling the LLM API:
#reuse a non-empty value already stored in the environment,
#otherwise ask for it with a hidden-input prompt and store it back

WORKER_URL = os.environ.get("CF_WORKER_URL") or getpass.getpass("Enter Cloudflare worker URL (input hidden): ")
os.environ["CF_WORKER_URL"] = WORKER_URL

PROXY_TOKEN = os.environ.get("CF_PROXY_TOKEN") or getpass.getpass("Enter Cloudflare proxy token (input hidden): ")
os.environ["CF_PROXY_TOKEN"] = PROXY_TOKEN

## 2. Two-Agent Labeling Pipeline Implementation

In [None]:
###---Helper Functions (API Call)---###

#call OpenAI chat completions through the Cloudflare worker
def openai_chat_via_cf(
    model,            #the GPT model
    messages,         #input messages
    temperature=0,
    max_retries=3,    #the maximum number of attempts when encountering transient errors
    retry_backoff_seconds=5, #base delay time for backoff between retries
    **extra_params,   #additional parameters to send along in the JSON
):
    """Send a chat-completions request through the Cloudflare worker.

    The request is POSTed to WORKER_URL with PROXY_TOKEN in the
    "x-proxy-token" header. Transient failures (HTTP 429, HTTP 5xx and
    requests-level exceptions such as timeouts) are retried with a linearly
    growing delay (retry_backoff_seconds * attempt). Any other HTTP error
    status is raised immediately, because repeating the same request would
    fail the same way.

    Returns:
        The decoded JSON body of the successful response.

    Raises:
        ValueError: if max_retries is smaller than 1.
        requests.RequestException: when a non-retryable HTTP error is received
            or when the last attempt fails.
    """
    if max_retries < 1:
        #with zero attempts the loop below would never run and the function
        #would silently return None
        raise ValueError("max_retries must be at least 1.")

    #base payload for OpenAI Chat API
    payload = {
        "model": model,
        "messages": messages,
    }

    #for GPT 5 family, temperature and token limit parameters are not allowed
    gpt5_family = {"gpt-5", "gpt-5-mini", "gpt-5-nano"}
    if model not in gpt5_family:
        payload["temperature"] = temperature
        payload.update(extra_params)
    else:
        dropped_params = {"temperature", "max_tokens", "max_output_tokens", "max_completion_tokens"}
        payload.update({k: v for k, v in extra_params.items() if k not in dropped_params})

    def is_transient(status_code):
        #429 (rate limit) and 5xx (server side) are worth retrying
        return status_code == 429 or 500 <= status_code < 600

    #retry loop
    for attempt in range(1, max_retries + 1):
        try:
            #for each attempt, send an HTTP POST to WORKER_URL
            resp = requests.post(
                WORKER_URL,
                headers={
                    "x-proxy-token": PROXY_TOKEN,
                    "Content-Type": "application/json",
                },
                json=payload,
                timeout=120,
            )

            if is_transient(resp.status_code):
                print(f"[openai_chat_via_cf] Transient error (status {resp.status_code}) on attempt {attempt}/{max_retries}")
                try:
                    print("  Payload of error:", resp.json())
                except Exception:
                    print("  Raw error text:", resp.text)

            #raises requests.HTTPError for every 4xx/5xx status
            resp.raise_for_status()
            return resp.json()

        except requests.RequestException as e:
            #HTTPError is a RequestException, so non-transient HTTP errors also
            #land here; they must not be retried
            failed_resp = getattr(e, "response", None)
            if (
                isinstance(e, requests.HTTPError)
                and failed_resp is not None
                and not is_transient(failed_resp.status_code)
            ):
                print(f"[openai_chat_via_cf] Non-retryable HTTP error (status {failed_resp.status_code}); not retrying.")
                raise

            print(f"[openai_chat_via_cf] Request exception on attempt {attempt}/{max_retries}: {e}")
            if attempt >= max_retries:
                #out of attempts: surface the last error to the caller
                raise
            sleep_for = retry_backoff_seconds * attempt
            print(f"  Sleeping {sleep_for} seconds before retrying...")
            time.sleep(sleep_for)


In [None]:
###---Helper Functions (Basic Utilities)---###

#Given a dataframe with ground-truth labels and LLM predicted labels,
#build a clean metrics dictionary that can log or feed back to the agent.
def build_metrics_summary(df,         #dataframe
                          label_col,  #the column of ground-truth labels
                          pred_col,   #the column of LLM predicted labels
                          labels_list #the list of candidate labels
                          ):
    """Summarize classification quality as a plain, JSON-friendly dictionary.

    Returns a dict with two entries:
      - "overall": number of evaluated rows, accuracy and unweighted macro F1
      - "per_label": precision, recall, F1 and support for each label in labels_list
    """
    gold = df[label_col].values
    predicted = df[pred_col].values

    #share of rows whose prediction equals the gold label
    overall_accuracy = accuracy_score(gold, predicted)

    #one value per entry of labels_list; zero_division=0 keeps labels that were
    #never predicted from raising or warning
    prec_arr, rec_arr, f1_arr, support_arr = precision_recall_fscore_support(
        gold,
        predicted,
        labels=labels_list,
        zero_division=0,
    )

    per_label_stats = {
        lab: {
            "precision": float(p),
            "recall": float(r),
            "f1": float(f),
            "support": int(s),
        }
        for lab, p, r, f, s in zip(labels_list, prec_arr, rec_arr, f1_arr, support_arr)
    }

    #macro F1 = plain mean of the per-label F1 values
    if len(f1_arr) > 0:
        macro_f1 = float(sum(f1_arr) / len(f1_arr))
    else:
        macro_f1 = 0

    return {
        "overall": {
            "num_eval_rows": int(len(df)),
            "accuracy": float(overall_accuracy),
            "macro_f1": macro_f1,
        },
        "per_label": per_label_stats,
    }

#if more than the text column is used as input for LLM classification,
#use the user-supplied function to turn the columns of a row into one input string
def model_input_from_row(row,             #one row of the dataframe
                         TEXT_COL,        #main text column
                         row_to_text_fn): #an optional function provided to build a custom string from the row
    """Return the string that is fed to the model for one row.

    When row_to_text_fn is given, its result for the row is used; otherwise
    the value stored under TEXT_COL is used. Either way the value is passed
    through str().
    """
    raw_value = row[TEXT_COL] if row_to_text_fn is None else row_to_text_fn(row)
    return str(raw_value)

In [None]:
###---Helper Functions (Tools the agent can request)---###

#give the agent global information about the dataset
def tool_get_summary(df_train, labels, label_counts):
    """Bundle dataset-level facts (row count, label names, label counts) for the agent."""
    summary = dict(
        num_rows=int(len(df_train)),  #size of the training set
        labels=labels,                #list of label names, passed through unchanged
        label_counts=label_counts,    #rows per label, passed through unchanged
    )
    return summary

#give a small preview of the first n rows for the agent
def tool_get_head(df_train, TEXT_COL, LABEL_COL, n=10, row_to_text_fn=None):
    """Return the first n rows of df_train as a list of record dictionaries.

    Each record holds the TEXT_COL and LABEL_COL values. When row_to_text_fn
    is given, a "model_input" entry with str(row_to_text_fn(row)) is added.

    n is supplied by the agent, so it is clamped to the range
    [0, len(df_train)]. A negative n would otherwise make DataFrame.head
    return all rows except the last |n|, i.e. almost the whole dataset.
    """
    n = max(0, min(int(n), len(df_train)))
    subset = df_train.head(n)

    #convert the texts and labels into a list of dictionaries
    records = subset[[TEXT_COL, LABEL_COL]].to_dict(orient="records")

    #add combined information if extra columns are used as LLM classification input
    if row_to_text_fn is not None:
        for record, (_, row) in zip(records, subset.iterrows()):
            record["model_input"] = str(row_to_text_fn(row))

    return records

#give N random examples for each label
def tool_sample_per_label(df_train, TEXT_COL, LABEL_COL, per_label_n, random_state, row_to_text_fn=None):
    """Draw a random sample of rows for each requested label.

    per_label_n maps a label to the number of rows wanted. Labels with a
    non-positive request or with no rows in df_train are left out of the
    result. At most all rows of a label are returned. The result maps each
    label to a list of record dictionaries (TEXT_COL, LABEL_COL and, when
    row_to_text_fn is given, "model_input").
    """
    samples_by_label = {}

    for label_name, requested in per_label_n.items():
        requested = int(requested)
        if requested <= 0:
            continue

        candidates = df_train[df_train[LABEL_COL] == label_name]
        available = len(candidates)
        if available == 0:
            continue

        #never ask pandas for more rows than the label has
        drawn = candidates.sample(
            n=min(requested, available),
            random_state=random_state,
        ).copy()

        rows_out = drawn[[TEXT_COL, LABEL_COL]].to_dict(orient="records")

        if row_to_text_fn is not None:
            for position, (_, row) in enumerate(drawn.iterrows()):
                rows_out[position]["model_input"] = str(row_to_text_fn(row))

        samples_by_label[label_name] = rows_out

    return samples_by_label

In [None]:
###---Helper Functions (Safe sandbox for executing agent-provided Python code)---###
#Important Note: This function is intended for experimental use only.
 #- While it restricts access to unsafe operations and limits available built-in functions,
 # it is not guaranteed to be fully secure.
 # I'm using it here with caution.

#define the set of built-in functions that are safe to execute
SAFE_BUILTINS = {
    "print": print,
    "len": len,
    "range": range,
    "min": min,
    "max": max,
    "sum": sum,
    "sorted": sorted,
    "enumerate": enumerate,
    "list": list,
    "dict": dict,
    "set": set,
    "tuple": tuple,
    "any": any,
    "all": all,
    "abs": abs,
    "round": round,
    "str": str,
    "int": int,
    "float": float,
}

#define the set of substrings that are not allowed to appear in the code
DISALLOWED_TOKENS = [
    "import ",
    "import\t",
    "import\n",
    "from ",
    "open(",
    "os.",
    "sys.",
    "subprocess",
    "socket",
    "requests",
    "http.",
    "urllib",
]

#sandbox executor for LLM agent-written code
#It executes agent-provided Python code in a restricted environment with:
 #- no imports allowed
 #- no access to os, sys, requests, network, etc.
 #- only safe builtins
 #- only three objects exposed (df_train, TEXT_COL, LABEL_COL).
def run_python_view_sandbox(
    code_str,
    df_train,
    TEXT_COL,
    LABEL_COL,
    max_output_chars=4000,
):
    """Run an agent-written Python snippet and capture what it prints.

    SECURITY NOTE: this uses exec() on model-generated code. The substring
    blocklist and the reduced builtins are a best-effort guard only, not a
    real security boundary; do not run code from untrusted sources with it.
    The snippet also receives the df_train object itself, not a copy.

    Returns:
        A dict with "status" ("ok" or "error"), "stdout" and "stderr"
        (each truncated to max_output_chars). When the snippet is rejected
        by the blocklist, the dict additionally has an "error" message and
        nothing is executed.
    """
    #Before executing anything,
    #reject the snippet if any forbidden substring appears in it.
    for token in DISALLOWED_TOKENS:
        if token in code_str:
            return {
                "status": "error",
                "error": f"Disallowed token detected in code: {token.strip()}",
                "stdout": "",
                "stderr": "",
            }

    #prepare restricted environment
    env = {
        "__builtins__": SAFE_BUILTINS,
        "df_train": df_train,
        "TEXT_COL": TEXT_COL,
        "LABEL_COL": LABEL_COL,
    }

    #in-memory string buffers
    stdout_buf = io.StringIO() #capture anything that the code prints to standard output
    stderr_buf = io.StringIO() #capture errors or anything printed to standard error

    try:
        #temporarily redirect sys.stdout and sys.stderr to the buffers for the duration of the with block
        with contextlib.redirect_stdout(stdout_buf), contextlib.redirect_stderr(stderr_buf):
            #A single namespace is used on purpose: with separate globals and
            #locals dicts, names assigned at the top level of the snippet are
            #not visible inside lambdas, nested functions or (before Python
            #3.12) comprehensions, so ordinary snippets fail with NameError.
            exec(code_str, env)
        status = "ok"
    except Exception:
        #report the failure through the result instead of raising
        traceback.print_exc(file=stderr_buf) #capture full traceback into stderr
        status = "error"

    stdout_val = stdout_buf.getvalue()
    stderr_val = stderr_buf.getvalue()

    #truncate outputs to avoid excessively long outputs
    #(keeps string size manageable for logs and for feeding back to the agent)
    if len(stdout_val) > max_output_chars:
        stdout_val = stdout_val[:max_output_chars] + "\n...[stdout truncated]"

    if len(stderr_val) > max_output_chars:
        stderr_val = stderr_val[:max_output_chars] + "\n...[stderr truncated]"

    #return a dictionary that summarizes the execution result
    return {
        "status": status,
        "stdout": stdout_val,
        "stderr": stderr_val,
    }

In [None]:
###---Helper Function (Agent decides how to look at data)---###

def agent_choose_data_view(
    user_description_for_agent, #user message
    TEXT_COL,                   #the column of main texts
    LABEL_COL,                  #the column of ground-truth labels
    labels,                     #list of candidate labels
    label_counts,               #mapping from each label to its count
    MODEL_AGENT,                #LLM model used as the agent
):
    """Ask the controller agent which single data-inspection tool it wants to run.

    The prompt describes the dataset (user description, labels, label counts)
    and the four available tools: get_summary, get_head, sample_per_label and
    python_view. The request is sent with openai_chat_via_cf, the raw reply is
    printed, and the reply is parsed with json.loads.

    Returns:
        The parsed reply; the prompt asks for an object with the keys
        "tool" and "params".

    Raises:
        json.JSONDecodeError: if the reply is not valid JSON.

    Note: TEMPERATURE_AGENT is read from module scope; it is not defined in
    this function.
    """
    #description of the tools, embedded into the user message below
    tools_description = f"""
You have access to the following tools over the labeled dataset.
Each row has at least two columns:
- '{TEXT_COL}': the main text field to be classified.
- '{LABEL_COL}': the human-provided (gold standard) label for that row.

Available tools:

1. get_summary
   - Description: returns a JSON summary with:
       - num_rows (int)
       - labels (list of label names)
       - label_counts (mapping label -> count)
   - Params: {{ }}

2. get_head
   - Description: returns the first N rows as a list of objects with keys '{TEXT_COL}' and '{LABEL_COL}'.
   - Params: {{"n": <int, number of rows, e.g. 10 or 20>}}

3. sample_per_label
   - Description: returns, for each label, a sample of up to N[label] rows, as an object:
       {{
         "<label_name>": [{{"{TEXT_COL}": ..., "{LABEL_COL}": ...}}, ...],
         ...
       }}
   - Params: {{"per_label_n": {{"<label_name>": <int>, ... }}}}
     For example: {{"per_label_n": {{"<label_A>": 5, "<label_B>": 10}}}}

4. python_view
   - Description: executes a SHORT Python snippet to inspect df_train.
     Objects you can use:
       - df_train: a pandas DataFrame with all training rows.
       - TEXT_COL: the name of the text column (string).
       - LABEL_COL: the name of the label column (string).
     The environment is RESTRICTED:
       - No network access.
       - No imports (import/from are blocked).
       - No os/sys/requests/etc.
       - Only safe builtins and pandas methods on df_train.
   - Params: {{"code": "<your Python code as a single string>"}}
"""

    #role description for the agent
    system_msg = (
        "You are an expert in qualitative content analysis and prompt engineering. "
        "You will first decide which view of the labeled data to inspect, "
        "then later design a classification prompt and validation plan."
    )

    #task description: dataset facts, the tool list and the required JSON reply format
    user_msg = f"""
The user describes the labeled dataset as:

{user_description_for_agent}

The label column is '{LABEL_COL}', and the labels currently present in the data are:
{labels}

Approximate label counts:
{label_counts}

You may request ONE tool call to inspect the labeled data before designing a prompt.
{tools_description}

Return ONLY a JSON object of the form:

{{
  "tool": "get_summary" | "get_head" | "sample_per_label" | "python_view",
  "params": {{ ... }}
}}

For python_view, put your code in params.code, for example:
{{
  "tool": "python_view",
  "params": {{
    "code": "print('Shape:', df_train.shape)"
  }}
}}

Be mindful of token limits: do not request extremely large samples.
Prefer a modest sample size (for example, 10–30 rows or small per-label samples) for initial inspection.
Keep python_view code SHORT and focused on inspecting df_train.
"""

    resp = openai_chat_via_cf(
        model=MODEL_AGENT,
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ],
        temperature=TEMPERATURE_AGENT,
    )

    #text of the first choice in the response
    content = resp["choices"][0]["message"]["content"]
    print("=== Agent data-view request (raw) ===")
    print(content)
    #the reply must be pure JSON; anything else raises here
    cmd = json.loads(content)
    return cmd

In [None]:
###---Helper Function (Execute the data viewing code the agent requests)---###

def execute_data_view_command(
    df_train,
    TEXT_COL,
    LABEL_COL,
    labels,
    label_counts,
    cmd,           #the parsed command from the agent
    RANDOM_SEED,
    row_to_text_fn=None,
):
    """Run the data-inspection tool named in cmd and return its result.

    cmd is expected to carry a "tool" name and an optional "params" mapping.

    Returns:
        A tuple (tool, params, result).

    Raises:
        ValueError: for an unknown tool name, or when python_view is requested
            without a non-empty code string.
    """
    tool = cmd.get("tool")
    #a missing or null "params" entry is treated as "no parameters"
    params = cmd.get("params", {}) or {}

    if tool == "get_summary":
        return tool, params, tool_get_summary(df_train, labels, label_counts)

    if tool == "get_head":
        head_rows = tool_get_head(
            df_train,
            TEXT_COL,
            LABEL_COL,
            n=params.get("n", 10),
            row_to_text_fn=row_to_text_fn,
        )
        return tool, params, head_rows

    if tool == "sample_per_label":
        samples = tool_sample_per_label(
            df_train,
            TEXT_COL,
            LABEL_COL,
            params.get("per_label_n", {}),
            random_state=RANDOM_SEED,
            row_to_text_fn=row_to_text_fn,
        )
        return tool, params, samples

    if tool == "python_view":
        snippet = params.get("code", "")
        #the snippet has to be a string with some non-whitespace content
        snippet_is_usable = isinstance(snippet, str) and bool(snippet.strip())
        if not snippet_is_usable:
            raise ValueError("python_view tool requires a non-empty string in params['code'].")
        print("\n=== Agent-proposed python_view code ===")
        print(snippet)
        sandbox_result = run_python_view_sandbox(
            code_str=snippet,
            df_train=df_train,
            TEXT_COL=TEXT_COL,
            LABEL_COL=LABEL_COL,
        )
        return tool, params, sandbox_result

    raise ValueError(f"Unknown tool: {tool}")

In [None]:
###---Helper Function (Agent builds initial prompt and validation plan)---###

def agent_build_prompt_and_plan(
    user_description_for_agent,
    TEXT_COL,
    LABEL_COL,
    labels,
    label_counts,
    tool_name,
    tool_params,
    tool_result,
    total_examples,
    MODEL_AGENT,
    has_row_to_text_fn, #boolean flag indicating whether a custom row_to_text_fn is used to construct input text
):
    """Ask the controller agent for a classifier system prompt and a first validation plan.

    The prompt contains the user's dataset description, the label list and
    frequencies, and the result of the data-inspection tool call the agent
    requested earlier. The request is sent with openai_chat_via_cf, the raw
    reply is printed, and the reply is parsed with json.loads.

    Returns:
        The parsed reply; the prompt asks for an object with the keys
        "classification_system_prompt", "per_label_batch_sizes" and
        "justification".

    Raises:
        json.JSONDecodeError: if the reply is not valid JSON.

    Note: TEMPERATURE_AGENT is read from module scope; it is not defined in
    this function.
    """

    #convert results from data viewing into JSON strings
    #(ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX escapes;
    #the tool parameters are dumped the same way as the tool result)
    tool_result_str = json.dumps(tool_result, ensure_ascii=False)
    tool_params_str = json.dumps(tool_params, ensure_ascii=False)

    #multi-line string with label frequencies
    #letting the agent know how balanced/imbalanced the labels are
    label_counts_text = "\n".join(
        f"- {lab}: {count} rows" for lab, count in label_counts.items()
    )

    if not has_row_to_text_fn:
        classifier_input_desc = (
            f"The classifier model will receive as input the raw value from the '{TEXT_COL}' column."
        )
        input_requirement_sentence = (
            f"- State that the input will be a single text value taken from the '{TEXT_COL}' column."
        )
    else:
        #the first sentence ends with a period so the two sentences do not run together
        classifier_input_desc = (
            "The classifier model will receive as input a single combined text string "
            "constructed from multiple columns in each row. "
            "The main text column is "
            f"'{TEXT_COL}'."
        )
        input_requirement_sentence = (
            "- State that the input will be a single combined text string derived from the row "
            f"(including '{TEXT_COL}' and any additional context the user has chosen to include)."
        )

    system_msg = (
        "You are an expert in qualitative content analysis and LLM prompt design. "
        "You will now design a classification system prompt and a validation batch plan."
    )

    user_msg = f"""
The user provided this description of their labeled dataset:

{user_description_for_agent}

Columns:
- Text column: '{TEXT_COL}'
- Label column: '{LABEL_COL}'

{classifier_input_desc}

The set of labels present in the data is:
{labels}

The full dataset has {total_examples} rows.
Label frequencies in the full dataset are:
{label_counts_text}

Earlier, you requested a tool call to inspect the data:
- Tool name: {tool_name}
- Parameters: {tool_params_str}

Here is the JSON result of that tool call:
{tool_result_str}

Your tasks now:

1. Draft a clear, concise **system prompt** (in English) for a classifier model that will be used on one row at a time.
   The system prompt should:
   - Instruct the classification task in general terms, using the user's description.
   {input_requirement_sentence}
   - State that the model must assign exactly ONE label from the list: {labels}.
   - Require strict JSON output of the form:
       {{"label": "<one of: {', '.join(labels)}>"}}
   - Be independent of any specific row values (i.e., no references to particular examples by content).

2. Propose how many examples of each label to include in the **first validation batch**.
   - You MUST respect the true label frequencies above, not exceeding the label's maximum available count.
   - Be mindful of cost: the total validation batch should be informative but not unnecessarily large.

Return ONLY a JSON object of the form:

{{
  "classification_system_prompt": "<English system prompt for the classifier>",
  "per_label_batch_sizes": {{
    "<label_1>": <int>,
    "<label_2>": <int>,
    ...
  }},
  "justification": "<brief English justification of your plan>"
}}

Make sure that:
- The keys in per_label_batch_sizes match the labels exactly: {labels}.
- For every label L, per_label_batch_sizes[L] is an integer between 0 and label_counts[L], inclusive.
"""

    resp = openai_chat_via_cf(
        model=MODEL_AGENT,
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ],
        temperature=TEMPERATURE_AGENT,
    )

    content = resp["choices"][0]["message"]["content"]
    print("\n=== Agent prompt + validation plan (raw) ===")
    print(content)

    #the reply must be pure JSON; anything else raises here
    plan = json.loads(content)
    return plan

In [None]:
###---Helper Function (validation batch building)---###

#for the first validation round
def build_validation_batch(df,
                           label_col,
                           labels_list,
                           per_label_batch_sizes,
                           random_state):
    """Assemble a shuffled validation batch with the requested rows per label.

    For every label in labels_list, up to per_label_batch_sizes[label] rows
    are sampled from df. Labels with a non-positive (or missing) request, or
    with no rows, contribute nothing. The combined rows are shuffled and
    re-indexed from 0.

    Raises:
        ValueError: if no rows were selected at all.
    """
    per_label_samples = []

    for lab in labels_list:
        #how many rows were requested for this label (missing -> 0)
        wanted = int(per_label_batch_sizes.get(lab, 0))
        if wanted > 0:
            rows_for_label = df[df[label_col] == lab]
            available = len(rows_for_label)
            if available > 0:
                #cap the request at what the label actually has
                per_label_samples.append(
                    rows_for_label.sample(n=min(wanted, available), random_state=random_state)
                )

    if len(per_label_samples) == 0:
        raise ValueError("No data selected for validation batch. Check agent plan.")

    #stack the per-label samples, then shuffle so labels are interleaved
    combined = pd.concat(per_label_samples, axis=0)
    shuffled = combined.sample(frac=1.0, random_state=random_state)
    return shuffled.reset_index(drop=True)

#more flexible validation batch sampling for 2nd+ validation rounds
def build_validation_batch_generic(
    df,
    evaluated_ids,
    label_col,
    labels_list,
    per_label_batch_sizes_val,
    sample_from_val,
    random_state,
):
    """Assemble a shuffled validation batch, optionally excluding already-evaluated rows.

    With sample_from_val == "remaining_only", rows whose "row_id" is in
    evaluated_ids are removed from the sampling pool; any other value samples
    from the whole of df. Per label, up to the requested number of rows is
    drawn; the combined rows are shuffled and re-indexed from 0.

    Raises:
        ValueError: if the sampling pool is empty, or if no rows were selected.
    """
    restrict_to_unseen = sample_from_val == "remaining_only"
    if restrict_to_unseen:
        already_seen = df["row_id"].isin(evaluated_ids)
        pool_df = df[~already_seen]
    else:
        pool_df = df

    if pool_df.empty:
        raise ValueError("No rows available to sample from.")

    picked = []
    for lab in labels_list:
        wanted = int(per_label_batch_sizes_val.get(lab, 0))
        if wanted > 0:
            rows_for_label = pool_df[pool_df[label_col] == lab]
            available = len(rows_for_label)
            if available > 0:
                picked.append(
                    rows_for_label.sample(n=min(wanted, available), random_state=random_state)
                )

    if len(picked) == 0:
        raise ValueError("No data selected for new validation batch.")

    combined = pd.concat(picked, axis=0)
    shuffled = combined.sample(frac=1.0, random_state=random_state)
    return shuffled.reset_index(drop=True)

In [None]:
###---Helper Function (classification worker)---###

def classify_one(
    row,
    system_prompt,
    labels_list,
    majority_label_val,
    MODEL_WORKER,
    TEXT_COL,
    row_to_text_fn,
    temperature=0,
):
    """Label a single row with the worker model.

    The row is turned into text with model_input_from_row, sent to the worker
    together with system_prompt, and the reply is interpreted in three steps:
      1. parse the reply as JSON and read its "label" field;
      2. if that fails or the label is not in labels_list, use the first
         label of labels_list that occurs as a substring of the reply;
      3. if no label is found, fall back to majority_label_val (a notice is
         printed so that fallbacks are visible in the run log).

    Returns:
        One label (a member of labels_list, or majority_label_val).
    """

    #construct the exact text that will be fed to the worker LLM
    text_value = model_input_from_row(row, TEXT_COL, row_to_text_fn)
    has_row_to_text_fn = row_to_text_fn is not None

    if not has_row_to_text_fn:
        input_source_line = f"Input text (raw value from column '{TEXT_COL}'):"
    else:
        input_source_line = (
            "Input text: a single combined string built from the row "
            f"(including '{TEXT_COL}' and any other context columns the user chose)."
        )

    #build a small instruction in the user message
    user_msg = f"""
You will be given a text (and possible additional contexts).

Your task:
- Based on the system instructions, decide which label best applies to this text.
- You must choose exactly ONE label from this set:
  {labels_list}

{input_source_line}
{text_value}

Output format:
Return ONLY a single JSON object with one field "label", whose value is exactly one of:
{labels_list}

For example:
  {{"label": "{labels_list[0]}"}}   (if that is the best label)
No extra text, no explanation.
"""

    #call LLM API
    resp = openai_chat_via_cf(
        model=MODEL_WORKER,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_msg},
        ],
        temperature=temperature,
    )

    #extract the worker's output from the response;
    #a null content is treated as an empty reply so the fallbacks below
    #cannot fail on None
    content = resp["choices"][0]["message"]["content"] or ""
    try:
        obj = json.loads(content)
        label = obj.get("label", "").strip()
        if label not in labels_list:
            raise ValueError("Invalid label")
    except Exception:
        #deliberate best effort: any parsing problem leads to the substring search
        label = next((lab for lab in labels_list if lab in content), None)
        if label is None:
            print(
                "[classify_one] No valid label found in worker reply; "
                f"falling back to majority label '{majority_label_val}'."
            )
            label = majority_label_val
    return label

#loop over a batch of rows, call classify_one for each
def classify_batch(
    df_batch,
    system_prompt,
    labels_list,
    majority_label_val,
    worker_config,
    MODEL_WORKER,
    TEXT_COL,
    row_to_text_fn,
):
    """Classify every row of df_batch and return a copy with a "pred_label" column.

    worker_config may carry a "temperature" entry for the worker model. When
    worker_config is None, has no "temperature" entry, or the entry is None
    (for example a JSON null from the agent), the module-level
    TEMPERATURE_WORKER is used instead (it is not defined in this function).
    df_batch itself is not modified.
    """
    requested_temp = (worker_config or {}).get("temperature")
    if requested_temp is None:
        requested_temp = TEMPERATURE_WORKER
    temp = float(requested_temp)

    preds = [
        classify_one(
            row,
            system_prompt,
            labels_list,
            majority_label_val,
            MODEL_WORKER,
            TEXT_COL,
            row_to_text_fn,
            temperature=temp,
        )
        for _, row in df_batch.iterrows()
    ]

    #attach predictions as a new column, return the labeled batch
    df_batch_local = df_batch.copy()
    df_batch_local["pred_label"] = preds
    return df_batch_local


In [None]:
###---Helper Function (Agent decides the next step)---###

def agent_decide_next_step(
    user_description_for_agent,
    TEXT_COL,
    LABEL_COL,
    labels_list,
    label_counts_val,
    used_counts_val,
    remaining_counts_val,
    classification_system_prompt_val,
    metrics_summary_val,
    mis_examples_val,
    total_examples,
    num_evaluated_val,
    MODEL_AGENT,
    has_row_to_text_fn,
):
    """Ask the controller agent whether to accept the prompt, validate more, or revise it.

    The prompt contains the dataset description, label frequencies, how many
    rows per label were already evaluated / remain, the current classifier
    system prompt, the evaluation metrics and misclassified examples. The
    request is sent with openai_chat_via_cf, the raw reply is printed, and the
    reply is parsed with json.loads.

    Returns:
        The parsed reply; the prompt asks for an object with the keys
        "decision", "validation_mode", "sample_from", "worker_config",
        "new_system_prompt", "per_label_batch_sizes" and "justification".

    Raises:
        json.JSONDecodeError: if the reply is not valid JSON.

    Note: TEMPERATURE_AGENT is read from module scope; it is not defined in
    this function.
    """

    #convert classification metrics and misclassified examples to strings
    metrics_text = json.dumps(metrics_summary_val, ensure_ascii=False)
    mis_text = json.dumps(mis_examples_val, ensure_ascii=False)

    #basic summaries of the data
    label_counts_text = "\n".join(
        f"- {lab}: {count} total rows" for lab, count in label_counts_val.items()
    )
    #labels missing from the used/remaining mappings are reported as 0
    used_counts_text = "\n".join(
        f"- {lab}: {used_counts_val.get(lab, 0)} already evaluated" for lab in labels_list
    )
    remaining_counts_text = "\n".join(
        f"- {lab}: {remaining_counts_val.get(lab, 0)} remaining" for lab in labels_list
    )

    #tell the agent what kind of input the classifier sees
    if not has_row_to_text_fn:
        input_summary_for_agent = (
            f"The classifier currently receives as input only the value of '{TEXT_COL}'."
        )
    else:
        input_summary_for_agent = (
            "The classifier currently receives as input a combined string built from each row "
            f"(including '{TEXT_COL}' and possibly other columns that the user has chosen)."
        )

    system_msg = (
        "You are an expert in qualitative content analysis and prompt design. "
        "You are acting as an agent that can iteratively improve or accept a prompt."
    )

    #task description: current state, the three possible decisions and the required JSON reply format
    user_msg = f"""
The user provided this description of their labeled dataset:

{user_description_for_agent}

Columns:
- Text column: '{TEXT_COL}'
- Label column: '{LABEL_COL}'

{input_summary_for_agent}

Labels present in the data:
{labels_list}

Label frequencies in the full dataset:
{label_counts_text}

Evaluation status so far:
- Total rows evaluated so far: {num_evaluated_val} out of {total_examples}
- Per-label evaluated counts:
{used_counts_text}
- Per-label remaining counts:
{remaining_counts_text}

Current system prompt for the classifier:

--- BEGIN CURRENT SYSTEM PROMPT ---
{classification_system_prompt_val}
--- END CURRENT SYSTEM PROMPT ---

Evaluation summary over all evaluated rows (JSON):
{metrics_text}

Some misclassified examples (if any), truncated:
{mis_text}

You must now decide what to do next. You have three high-level options:

1. accept_prompt
   - Keep the current system prompt as the final prompt.
   - Do not request further validation in this step.

2. validate_more
   - Keep the current system prompt.
   - Request another validation round.
   - You will specify:
       (a) validation_mode:
           - "reuse_last_batch": re-evaluate the SAME rows as the last validation batch.
           - "new_subset": evaluate a NEW subset of rows.
       (b) If validation_mode = "new_subset", also specify sample_from:
           - "remaining_only": sample only from rows not yet evaluated.
           - "full_dataset": sample from the entire dataset (you may re-use some rows).
       (c) per_label_batch_sizes: for each label, how many rows to include in this next validation round.

3. revise_prompt
   - Propose an improved system prompt (in English).
   - Also request another validation round, with the same controls as in (2):
       validation_mode, sample_from, per_label_batch_sizes.

Cost & sample-size considerations (important):
- Each additional evaluated row increases cost roughly linearly.
- Often, relatively small but well-chosen batches (for example, dozens of examples per label, not hundreds) are enough to see whether a prompt revision helped.
- When performance is clearly low (e.g., low macro-F1), prioritize:
    * understanding errors and revising the prompt, and
    * testing revised prompts on modest, targeted batches,
  instead of repeatedly requesting much larger batches.

You may also optionally specify worker_config, such as temperature, for the classifier model.

Return ONLY a JSON object of the form:

{{
  "decision": "accept_prompt" | "validate_more" | "revise_prompt",
  "validation_mode": "reuse_last_batch" | "new_subset",
  "sample_from": "remaining_only" | "full_dataset",
  "worker_config": {{
    "temperature": <float between 0.0 and 1.0, or omit to use default>
  }},
  "new_system_prompt": "<string or null>",
  "per_label_batch_sizes": {{
    "<label_1>": <int>,
    "<label_2>": <int>,
    ...
  }},
  "justification": "<brief English justification>"
}}

Conventions:
- If you choose "accept_prompt", set "validation_mode" to "reuse_last_batch",
  "sample_from" can be null or "remaining_only", worker_config can be empty, and
  per_label_batch_sizes can be empty. The prompt will be considered final.
- For "validate_more" or "revise_prompt":
  - validation_mode and sample_from MUST be specified.
  - Keys of per_label_batch_sizes must match the labels exactly: {labels_list}.
  - If sample_from = "remaining_only", you should respect remaining_counts[label]
    when choosing batch sizes.
"""

    resp = openai_chat_via_cf(
        model=MODEL_AGENT,
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ],
        temperature=TEMPERATURE_AGENT,
    )

    #text of the first choice in the response
    content = resp["choices"][0]["message"]["content"]
    print("\n=== Agent next-step decision (raw) ===")
    print(content)

    #the reply must be pure JSON; anything else raises here
    decision_plan_val = json.loads(content)
    return decision_plan_val


## 3. Final Pipeline Wrap-Up Function

In [None]:
###---Helper Function (final wrap-up)---###

def _misclassified_examples_for_agent(eval_df, text_col, label_col, limit=10):
    """Return up to `limit` misclassified rows of `eval_df` as plain dicts.

    Each dict has the keys "text", "true_label" and "pred_label", which is the
    payload handed to the controller agent for error analysis.
    """
    # NOTE(review): "text" is always the raw `text_col` value, even when the
    # worker input is composed by row_to_text_fn from several columns --
    # confirm the agent does not need the composed input here.
    wrong = eval_df[eval_df[label_col] != eval_df["pred_label"]]
    return [
        {
            "text": row[text_col],
            "true_label": row[label_col],
            "pred_label": row["pred_label"],
        }
        for _, row in wrong.head(limit).iterrows()
    ]


def _label_usage_counts(eval_df, label_col, labels, label_counts):
    """Return (used_counts, remaining_counts) per label for the training pool.

    A row evaluated more than once (possible when the agent samples from the
    full dataset) is counted a single time, so remaining counts never go
    negative.
    """
    unique_rows = eval_df.drop_duplicates(subset="row_id")
    used = unique_rows[label_col].value_counts().to_dict()
    remaining = {
        lab: max(label_counts.get(lab, 0) - used.get(lab, 0), 0) for lab in labels
    }
    return used, remaining


def run_agentic_labeling_pipeline(
    df_train,
    df_test,
    TEXT_COL="text",
    LABEL_COL="human_label",
    MODEL_AGENT="gpt-5.1",
    MODEL_WORKER="gpt-5-mini",
    RANDOM_SEED=2025,
    user_description_for_agent="",
    max_rounds=3,
    row_to_text_fn=None,   #optionally, build input string from multiple columns
):
    """Run the full two-agent labeling pipeline and evaluate it on a test set.

    Stages:
      0. Copy and clean the training data, attach a `row_id` column.
      1. The controller agent picks a data view, which is then executed.
      2. The agent drafts the classification prompt and a validation plan.
      3. A first validation batch is drawn from the training data.
      4. The worker labels that batch.
      5. For up to `max_rounds` iterations the agent accepts the prompt,
         revises it, or asks for more validation.
      6. The final prompt is applied row by row to the test set.

    Parameters:
        df_train: labeled DataFrame used to design and validate the prompt.
            Must contain `TEXT_COL` and `LABEL_COL`. Its index values become
            `row_id` and are assumed unique.
        df_test: labeled DataFrame used only for the final evaluation.
        TEXT_COL: name of the text column.
        LABEL_COL: name of the gold-label column.
        MODEL_AGENT: model name forwarded to the controller-agent helpers.
        MODEL_WORKER: model name forwarded to the worker helpers.
        RANDOM_SEED: seeds `random` and the batch samplers (later rounds use
            RANDOM_SEED + round index).
        user_description_for_agent: free-text task description for the agent.
        max_rounds: maximum number of agent decision iterations in stage 5.
        row_to_text_fn: optional callable forwarded to the worker helpers to
            build the worker input from a row.

    Returns:
        dict with keys "final_prompt", "train_eval_df", "test_df",
        "train_metrics" and "test_metrics".

    Raises:
        ValueError: if `df_train` has no rows.
    """
    random.seed(RANDOM_SEED)

    # === Stage 0: prepare training data ===
    if len(df_train) == 0:
        raise ValueError("df_train is empty; cannot build a labeling prompt.")

    #make a copy of the training df so the original object won't be accidentally modified
    df_train = df_train.copy()
    # Naming the index first guarantees the new column is called "row_id" even
    # when the incoming index already has a name or an "index" column exists.
    df_train = df_train.rename_axis("row_id").reset_index()
    #ensure labels are clean
    df_train[LABEL_COL] = df_train[LABEL_COL].astype(str).str.strip()

    labels = sorted(df_train[LABEL_COL].unique().tolist())
    label_counts = df_train[LABEL_COL].value_counts().to_dict()
    # Passed to the worker helpers (presumably as a fallback prediction --
    # confirm against classify_one).
    majority_label = max(label_counts.items(), key=lambda kv: kv[1])[0]

    print("=== Stage 0: Prepare training data ===")
    print(f"  Labels found: {labels}")
    print(f"  Label counts: {label_counts}")
    print(f"  Total train rows: {len(df_train)}\n")

    has_row_to_text_fn = row_to_text_fn is not None

    # === Stage 1: agent chooses data view ===
    print("=== Stage 1: Agent chooses data view on training set ===")
    view_cmd = agent_choose_data_view(
        user_description_for_agent,
        TEXT_COL,
        LABEL_COL,
        labels,
        label_counts,
        MODEL_AGENT,
    )

    tool_name, tool_params, tool_result = execute_data_view_command(
        df_train,
        TEXT_COL,
        LABEL_COL,
        labels,
        label_counts,
        view_cmd,
        RANDOM_SEED,
        row_to_text_fn=row_to_text_fn,
    )

    print("\n=== Tool executed ===")
    print("  Tool:", tool_name)
    print("  Params:", tool_params)

    if tool_name == "python_view":
        print("  Status:", tool_result.get("status"))
        stdout_full = tool_result.get("stdout") or ""
        stderr_full = tool_result.get("stderr") or ""
        stdout_preview = stdout_full[:1500]
        stderr_preview = stderr_full[:1500]

        if stdout_preview:
            print("  Stdout preview:\n", stdout_preview + ("..." if len(stdout_full) > 1500 else ""))
        if stderr_preview.strip():
            print("\n  Stderr preview:\n", stderr_preview + ("..." if len(stderr_full) > 1500 else ""))
    else:
        # Serialize once and slice, instead of dumping the result twice.
        full_str = json.dumps(tool_result, ensure_ascii=False)
        print("  Result preview:", full_str[:1500] + ("..." if len(full_str) > 1500 else ""))

    # === Stage 2: agent builds initial prompt + plan ===
    print("\n=== Stage 2: Agent builds initial prompt + validation plan ===")
    plan = agent_build_prompt_and_plan(
        user_description_for_agent=user_description_for_agent,
        TEXT_COL=TEXT_COL,
        LABEL_COL=LABEL_COL,
        labels=labels,
        label_counts=label_counts,
        tool_name=tool_name,
        tool_params=tool_params,
        tool_result=tool_result,
        total_examples=len(df_train),
        MODEL_AGENT=MODEL_AGENT,
        has_row_to_text_fn=has_row_to_text_fn,
    )

    classification_system_prompt = plan["classification_system_prompt"]
    per_label_batch_sizes = plan["per_label_batch_sizes"]

    print("\n=== Agent-chosen per_label_batch_sizes ===")
    print(per_label_batch_sizes)
    print("\n=== Initial classification system prompt ===\n")
    print(classification_system_prompt)

    # === Stage 3: build initial validation batch ===
    print("\n=== Stage 3: Build initial validation batch ===")
    for lab in labels:
        # `or 0` also covers an explicit JSON null for a label.
        requested = int(per_label_batch_sizes.get(lab) or 0)
        available = label_counts.get(lab, 0)
        if requested > available:
            print(
                f"  Warning: agent requested {requested} rows for label '{lab}' "
                f"but only {available} exist; capping at {available}."
            )

    batch_df = build_validation_batch(
        df_train,
        label_col=LABEL_COL,
        labels_list=labels,
        per_label_batch_sizes=per_label_batch_sizes,
        random_state=RANDOM_SEED,
    )

    print(
        f"  Validation batch size: {len(batch_df)} "
        + ", ".join(f"{lab}: {(batch_df[LABEL_COL] == lab).sum()}" for lab in labels)
    )

    # === Stage 4: first validation round ===
    print("\n=== Stage 4: First validation round ===")
    worker_config = {}
    batch_df = classify_batch(
        batch_df,
        classification_system_prompt,
        labels,
        majority_label,
        worker_config,
        MODEL_WORKER,
        TEXT_COL,
        row_to_text_fn,
    )

    eval_df = batch_df.copy()
    evaluated_ids = set(eval_df["row_id"].tolist())
    # Number of predictions the metrics are based on (repeats included).
    num_evaluated_so_far = len(eval_df)

    metrics_summary = build_metrics_summary(eval_df, LABEL_COL, "pred_label", labels)
    used_counts, remaining_counts = _label_usage_counts(
        eval_df, LABEL_COL, labels, label_counts
    )
    mis_examples_for_agent = _misclassified_examples_for_agent(
        eval_df, TEXT_COL, LABEL_COL
    )

    print("\n--- Classification report on initial validation batch ---")
    print(
        classification_report(
            eval_df[LABEL_COL],
            eval_df["pred_label"],
            labels=labels,
            digits=3,
        )
    )

    # === Stage 5: iterative agent loop ===
    print("\n=== Stage 5: Iterative agent loop ===")
    final_prompt = None
    last_batch_df = batch_df

    for round_idx in range(1, max_rounds + 1):
        print(f"\n----- Agent iteration {round_idx} -----")
        decision_plan = agent_decide_next_step(
            user_description_for_agent=user_description_for_agent,
            TEXT_COL=TEXT_COL,
            LABEL_COL=LABEL_COL,
            labels_list=labels,
            label_counts_val=label_counts,
            used_counts_val=used_counts,
            remaining_counts_val=remaining_counts,
            classification_system_prompt_val=classification_system_prompt,
            metrics_summary_val=metrics_summary,
            mis_examples_val=mis_examples_for_agent,
            total_examples=len(df_train),
            num_evaluated_val=num_evaluated_so_far,
            MODEL_AGENT=MODEL_AGENT,
            has_row_to_text_fn=has_row_to_text_fn,
        )

        print("\n=== Parsed agent decision plan ===")
        print(decision_plan)

        decision = decision_plan["decision"]
        # `or <default>` treats an explicit JSON null like a missing key, so
        # None is never forwarded to the batch builder.
        validation_mode = decision_plan.get("validation_mode") or "new_subset"
        sample_from = decision_plan.get("sample_from") or "remaining_only"
        worker_config = decision_plan.get("worker_config") or {}
        new_system_prompt = decision_plan.get("new_system_prompt")
        next_per_label_batch_sizes = decision_plan.get("per_label_batch_sizes") or {}

        if decision == "accept_prompt":
            final_prompt = classification_system_prompt
            print("\nAgent chose to ACCEPT the current prompt as final.")
            break

        if decision == "revise_prompt" and new_system_prompt:
            classification_system_prompt = new_system_prompt
            print("\nAgent chose to REVISE the prompt.")
            print("Revised system prompt:\n", classification_system_prompt)
        else:
            print("\nAgent chose to VALIDATE MORE with the same prompt.")

        print("  validation_mode:", validation_mode)
        print("  sample_from:", sample_from)
        print("  worker_config:", worker_config)
        print("  Next per_label_batch_sizes:", next_per_label_batch_sizes)

        if validation_mode == "reuse_last_batch":
            rerun_df = classify_batch(
                last_batch_df,
                classification_system_prompt,
                labels,
                majority_label,
                worker_config,
                MODEL_WORKER,
                TEXT_COL,
                row_to_text_fn,
            )
            # Write the new predictions back by row_id rather than by
            # position: eval_df may hold a row more than once (full-dataset
            # sampling) or in a different order than the re-run batch.
            pred_by_row_id = dict(zip(rerun_df["row_id"], rerun_df["pred_label"]))
            eval_df = eval_df.copy()
            rerun_mask = eval_df["row_id"].isin(list(pred_by_row_id))
            eval_df.loc[rerun_mask, "pred_label"] = (
                eval_df.loc[rerun_mask, "row_id"].map(pred_by_row_id)
            )
            last_batch_df = rerun_df
        else:
            next_batch_df = build_validation_batch_generic(
                df_train,
                evaluated_ids=evaluated_ids,
                label_col=LABEL_COL,
                labels_list=labels,
                per_label_batch_sizes_val=next_per_label_batch_sizes,
                sample_from_val=sample_from,
                # Vary the seed per round so each round draws a different sample.
                random_state=RANDOM_SEED + round_idx,
            )
            next_batch_df = classify_batch(
                next_batch_df,
                classification_system_prompt,
                labels,
                majority_label,
                worker_config,
                MODEL_WORKER,
                TEXT_COL,
                row_to_text_fn,
            )

            eval_df = pd.concat([eval_df, next_batch_df], ignore_index=True)
            evaluated_ids.update(next_batch_df["row_id"].tolist())
            last_batch_df = next_batch_df

        num_evaluated_so_far = len(eval_df)
        metrics_summary = build_metrics_summary(eval_df, LABEL_COL, "pred_label", labels)
        used_counts, remaining_counts = _label_usage_counts(
            eval_df, LABEL_COL, labels, label_counts
        )
        mis_examples_for_agent = _misclassified_examples_for_agent(
            eval_df, TEXT_COL, LABEL_COL
        )

        print("\n--- Updated classification report on all evaluated rows ---")
        print(
            classification_report(
                eval_df[LABEL_COL],
                eval_df["pred_label"],
                labels=labels,
                digits=3,
            )
        )

    if final_prompt is None:
        final_prompt = classification_system_prompt
        print("\nMax rounds reached; using last prompt as final.")

    # === Stage 6: final evaluation on TEST set ===
    print("\n=== Stage 6: Final evaluation on TEST set ===")
    df_test = df_test.copy()
    df_test[LABEL_COL] = df_test[LABEL_COL].astype(str).str.strip()

    print(f"  Test size: {len(df_test)}")
    print("  Test label counts:")
    print(df_test[LABEL_COL].value_counts())

    # NOTE(review): classify_one is called without the agent-chosen
    # worker_config, unlike classify_batch above -- confirm this is intended.
    test_preds = [
        classify_one(
            row,
            final_prompt,
            labels,
            majority_label,
            MODEL_WORKER,
            TEXT_COL,
            row_to_text_fn,
        )
        for _, row in df_test.iterrows()
    ]
    df_test["pred_label"] = test_preds

    print("\n--- Classification report on TEST set ---")
    print(
        classification_report(
            df_test[LABEL_COL],
            df_test["pred_label"],
            labels=labels,
            digits=3,
        )
    )

    mis_test = df_test[df_test[LABEL_COL] != df_test["pred_label"]]
    print("\nSome misclassified TEST examples:")
    for _, row in mis_test.head(10).iterrows():
        print("----")
        print(f"Text from '{TEXT_COL}':", row[TEXT_COL])
        print("Gold:", row[LABEL_COL], " Pred:", row["pred_label"])

    metrics_test = build_metrics_summary(df_test, LABEL_COL, "pred_label", labels)

    return {
        "final_prompt": final_prompt,
        "train_eval_df": eval_df,
        "test_df": df_test,
        "train_metrics": metrics_summary,
        "test_metrics": metrics_test,
    }


## 4. Experiment 1: Chinese News Articles (Challenge vs. Source)
In this experiment, the two-agent pipeline classifies whether a Chinese news article excerpt that mentions a foreign news outlet is **challenging** that outlet (i.e., questioning its credibility or criticizing it) or citing it as an authoritative **source**.

Data Source:
Waight, Hannah et al. (Mar. 18, 2025). “The decade-long growth of government-authored news media in China.” In: Proceedings of the National Academy of Sciences 122.11. Publisher: Proceedings of the National Academy of Sciences, e2408260122. DOI: 10.1073/pnas.2408260122. URL: https://www.pnas.org/doi/10.1073/pnas.2408260122.

Due to legal restrictions, this dataset is not publicly available.

In [None]:
###---User Setting---###
# Human-labeled CSV files: one feeds the pipeline's training stage, the other
# is held out for the final test evaluation.
CSV_PATH_train = f"{data_path}data/human_labeled_sample.csv"
CSV_PATH_test = f"{data_path}data/human_labeled_validation_sample.csv"

df_train, df_test = (pd.read_csv(csv_path) for csv_path in (CSV_PATH_train, CSV_PATH_test))

# Column names (both equal the pipeline defaults) and model choices.
TEXT_COL, LABEL_COL = "text", "human_label"
MODEL_AGENT, MODEL_WORKER = "gpt-5.1", "gpt-5-mini"
RANDOM_SEED = 2025
# TEMPERATURE_AGENT is read as a module-level global by the agent helpers;
# TEMPERATURE_WORKER is presumably read the same way by the worker helpers.
TEMPERATURE_AGENT, TEMPERATURE_WORKER = 0.3, 0

# Task description handed verbatim to the controller agent.
user_description_for_agent = """This is a dataset of Chinese news article excerpts.
There are two labels:
Challenge: This Chinese news article excerpt explicitly challenges, criticizes, or disputes the foreign newspaper it mentions.
Source: The editor mainly uses the foreign newspaper as a source of information or reporting."""


In [None]:
# Experiment 1: run the pipeline with the values defined in the user-setting cell.
result = run_agentic_labeling_pipeline(
    # data
    df_train=df_train,
    df_test=df_test,
    TEXT_COL=TEXT_COL,
    LABEL_COL=LABEL_COL,
    row_to_text_fn=None,  # the single text column is the worker input
    # task description and models
    user_description_for_agent=user_description_for_agent,
    MODEL_AGENT=MODEL_AGENT,
    MODEL_WORKER=MODEL_WORKER,
    # run control
    RANDOM_SEED=RANDOM_SEED,
    max_rounds=3,
)

## 5. Experiment 2: EZ-STANCE Stance Detection

The two agents perform the task of labeling the **stance** expressed in English tweets toward a given target. The goal is to determine the author's position or attitude (i.e., whether they *favor*, *oppose*, or remain *neutral* toward the subject mentioned).

Open-Source Data:
Chenye Zhao and Cornelia Caragea. 2024. EZ-STANCE: A Large Dataset for English Zero-Shot Stance Detection. In Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 15697-15714, Bangkok, Thailand. Association for Computational Linguistics.


In [None]:
# EZ-STANCE subtask A ("mixed") train/test CSV files under the shared data folder.
EZ_TRAIN_PATH = f"{data_path}EZ-STANCE data/ezstance/subtaskA/mixed/raw_train_all_onecol.csv"
EZ_TEST_PATH = f"{data_path}EZ-STANCE data/ezstance/subtaskA/mixed/raw_test_all_onecol.csv"

In [None]:
#load raw EZ-STANCE files
df_ezstance_train = pd.read_csv(EZ_TRAIN_PATH)
df_ezstance_test = pd.read_csv(EZ_TEST_PATH)

print("Train columns:", df_ezstance_train.columns.tolist())
print("Test columns :", df_ezstance_test.columns.tolist())
# A bare `.head()` in the middle of a cell is evaluated and then discarded;
# print it so the preview actually shows up in the cell output.
print(df_ezstance_train.head())

#cap the test set at TEST_N rows; RANDOM_SEED comes from the Experiment 1
#settings cell, which must have been run first
TEST_N = 500
df_ezstance_test = (
    df_ezstance_test
    .sample(
        n=min(TEST_N, len(df_ezstance_test)),
        random_state=RANDOM_SEED
    )
    .reset_index(drop=True)
)

print("Subsampled test size:", len(df_ezstance_test))

In [None]:
#combining the target and post columns as the input
def ezstance_row_to_text(row):
    """Build the worker input for one EZ-STANCE row.

    Reads the 'Target 1' and 'Ori Text' fields of `row` and returns them as
    two lines: "Target: <target>" followed by "Post: <post>".
    """
    target = row["Target 1"]
    post = row["Ori Text"]
    return "Target: {}\nPost: {}".format(target, post)

In [None]:
# Experiment 2: EZ-STANCE run; the worker input combines target and post via
# ezstance_row_to_text, while "Ori Text" stays the nominal text column.
result_ez = run_agentic_labeling_pipeline(
    # data
    df_train=df_ezstance_train,
    df_test=df_ezstance_test,
    TEXT_COL="Ori Text",
    LABEL_COL="Stance 1",
    row_to_text_fn=ezstance_row_to_text,
    # task description and models
    user_description_for_agent="""
This is a dataset of English social media posts with stance labels toward a target.
Labels: favor, against, none.
""",
    MODEL_AGENT="gpt-5.1",
    MODEL_WORKER="gpt-5-mini",
    # run control
    RANDOM_SEED=2025,
    max_rounds=3,
)