In [1]:
# Minimal, one-sample evaluation with Claude + LangChain that you can try yourself.
# Goal of evaluation: show BOTH reference-free (helpfulness) and reference-aware (correctness) scoring.

from langchain_anthropic import ChatAnthropic
from langchain.evaluation import load_evaluator

# 1) Model configuration: a dated (pinned) Claude snapshot, so re-runs target the same model version.
#    temperature=0 is requested to reduce run-to-run variation in the output.
llm = ChatAnthropic(
    temperature=0,
    model="claude-sonnet-4-5-20250929",
)

# 2) The evaluation set is a single gold example (question + reference answer),
#    which keeps the snippet short enough to read in the paper.
item = dict(
    question="Define TTFT.",
    reference="Time-to-first-token: latency from request start to first token.",
)

# 3) System under test: one Claude call per question, reusing the temperature-0 `llm` client.
def predict(q: str) -> str:
    """Answer question ``q`` with the model under test.

    Sends a fixed system instruction ("Answer concisely.") followed by ``q``
    as the human turn, and returns the ``content`` of the model's reply.
    """
    # NOTE(review): assumes the reply's `content` is plain text — confirm if the
    # client is ever configured in a way that yields structured content.
    chat_turns = [
        ("system", "Answer concisely."),
        ("human", q),
    ]
    reply = llm.invoke(chat_turns)
    return reply.content


pred = predict(item["question"])

# 4) Two LLM-as-judge evaluators, both graded by the same `llm` that produced the answer:
#    - "criteria": reference-free; judges UX-style qualities such as helpfulness.
#    - "labeled_criteria": reference-aware; checks the answer against the gold reference.
helpfulness_rubric = {"helpfulness": "Is the answer practically useful and clear?"}
correctness_rubric = {"correctness": "Is the answer correct given the reference?"}

crit_eval = load_evaluator("criteria", criteria=helpfulness_rubric, llm=llm)
lab_eval = load_evaluator("labeled_criteria", criteria=correctness_rubric, llm=llm)

# 5) Run both evaluators on the same prediction, then pull out score + rationale.
def _rationale(result: dict):
    """Return the judge's written justification from an evaluator result.

    Prefers the "reasoning" entry and falls back to "explanation" when the
    former is missing or empty.
    """
    return result.get("reasoning") or result.get("explanation")


# Reference-free check: only the question and the prediction are supplied.
res_help = crit_eval.evaluate_strings(
    input=item["question"],
    prediction=pred,
)
# Reference-aware check: the gold reference is passed alongside the prediction.
res_corr = lab_eval.evaluate_strings(
    input=item["question"],
    prediction=pred,
    reference=item["reference"],
)

# `.get` is used so a missing key yields None instead of raising.
help_score, help_note = res_help.get("score"), _rationale(res_help)
corr_score, corr_note = res_corr.get("score"), _rationale(res_corr)

# 6) Human-readable report: question, model answer, then each score with its rationale.
divider = "=" * 64
report_lines = [
    "",  # leading blank line before the divider
    divider,
    f"Q: {item['question']}",
    "",
    "Prediction:",
    f"{pred}",
    "",
    f"Helpfulness: {help_score} — {help_note}",
    f"Correctness: {corr_score} — {corr_note}",
    "",  # trailing empty entry keeps the final newline of the report
]
print("\n".join(report_lines))




Q: Define TTFT.

Prediction:
**TTFT** stands for **Time To First Token**.

It's a performance metric that measures the latency between when a user submits a request to a language model (LLM) or AI system and when the first token of the response is generated and returned to the user.

TTFT is important because:
- It affects perceived responsiveness and user experience
- Lower TTFT means users see output starting sooner
- It's particularly critical for streaming responses where users want immediate feedback

TTFT is influenced by factors like model size, prompt length, server load, and infrastructure efficiency.

Helpfulness: 1 — Let me analyze whether this submission meets the helpfulness criterion by evaluating if it is practically useful and clear.

**Step-by-step reasoning:**

1. **Does it define the term clearly?**
   - Yes, it explicitly states "TTFT stands for Time To First Token"
   - The definition is straightforward and unambiguous

2. **Does it explain what the term means in 