# Chapter 4: LLM-as-a-Judge

Hands-on implementation of LLM-based evaluation systems.

In [None]:
import re

import numpy as np
from scipy.stats import spearmanr, kendalltau
from sklearn.metrics import cohen_kappa_score

## Measuring Evaluator Agreement

Before trusting any evaluator (human or LLM), we need statistical tools to measure agreement.

In [None]:
# Example from the book: comparing human scores with metric scores
human_scores = [4, 2, 5, 3, 1, 4, 3, 5, 2, 1]
metric_scores = [0.7, 0.3, 0.9, 0.5, 0.1, 0.8, 0.4, 0.85, 0.25, 0.15]

# Spearman's rank correlation
corr, p_value = spearmanr(human_scores, metric_scores)
print(f"Spearman's ρ: {corr:.3f}")
print(f"p-value: {p_value:.4f}")

# Bootstrap confidence intervals: resample (human, metric) pairs with
# replacement and recompute the correlation on each resample.
n_bootstraps = 1000
rng = np.random.default_rng(42)  # fixed seed so the reported CI is reproducible
human_arr = np.asarray(human_scores)
metric_arr = np.asarray(metric_scores)
bootstrap_corrs = []
for _ in range(n_bootstraps):
    # The same indices are used for both arrays so pairs stay aligned.
    indices = rng.integers(0, len(human_arr), size=len(human_arr))
    resampled_human = human_arr[indices]
    resampled_metric = metric_arr[indices]
    boot_corr, _ = spearmanr(resampled_human, resampled_metric)
    bootstrap_corrs.append(boot_corr)

# A resample in which one variable is constant has an undefined correlation
# (NaN). nanpercentile skips those draws instead of turning the whole CI
# into NaN.
lower = np.nanpercentile(bootstrap_corrs, 2.5)
upper = np.nanpercentile(bootstrap_corrs, 97.5)
print(f"95% CI: [{lower:.3f}, {upper:.3f}]")

## Kendall's Tau: Pairwise Agreement

Kendall's τ counts concordant vs discordant pairs: τ = (concordant − discordant) / total pairs. With no ties, a τ of 0.8 means 90% of pairs agree on ordering and 10% disagree.

In [None]:
# Example from the book
human_ranks = [1, 2, 3, 4, 5]
metric_ranks = [1, 3, 2, 5, 4]

tau, p_value = kendalltau(human_ranks, metric_ranks)
print(f"Kendall's τ: {tau:.3f}")
print(f"p-value: {p_value:.4f}")

# Reproduce the statistic by hand. For every unordered pair of items,
# multiply the human rank difference by the metric rank difference:
# a positive product is a concordant pair, a negative one is discordant,
# and zero (a tie) counts as neither.
n = len(human_ranks)
pair_products = [
    (human_ranks[a] - human_ranks[b]) * (metric_ranks[a] - metric_ranks[b])
    for a in range(n)
    for b in range(a + 1, n)
]
concordant = sum(1 for product in pair_products if product > 0)
discordant = sum(1 for product in pair_products if product < 0)

print(f"\nConcordant pairs: {concordant}")
print(f"Discordant pairs: {discordant}")
manual_tau = (concordant - discordant) / (concordant + discordant)
print(f"Manual τ: {manual_tau:.3f}")

## Cohen's Kappa: Categorical Agreement

For binary/categorical judgments (pass/fail, safe/unsafe), Cohen's κ corrects for chance agreement.

In [None]:
# Example: Two raters labeling responses as acceptable (1) or not (0)
rater1 = [1, 1, 0, 1, 1, 0, 1, 0, 1, 1]
rater2 = [1, 0, 0, 1, 1, 1, 1, 0, 1, 1]

kappa = cohen_kappa_score(rater1, rater2)

# Compute manually to understand
# Observed agreement: fraction of items on which the two raters match.
observed_agreement = sum(r1 == r2 for r1, r2 in zip(rater1, rater2)) / len(rater1)

# Expected agreement by chance: both say "yes" or both say "no" if each
# rater labelled independently at their own base rate.
p1_yes = sum(rater1) / len(rater1)
p2_yes = sum(rater2) / len(rater2)
expected_agreement = p1_yes * p2_yes + (1 - p1_yes) * (1 - p2_yes)

# κ is undefined when chance agreement is already 100% (both raters always
# give the same single label); report NaN rather than dividing by zero.
if expected_agreement < 1:
    manual_kappa = (observed_agreement - expected_agreement) / (1 - expected_agreement)
else:
    manual_kappa = float("nan")

print(f"Observed agreement: {observed_agreement:.0%}")
print(f"Expected (chance) agreement: {expected_agreement:.0%}")
print(f"Cohen's κ: {kappa:.3f}")
# Show the hand-computed value next to the library value so they can be compared.
print(f"Manual κ: {manual_kappa:.3f}")
print("\nInterpretation (Landis & Koch):")
print("  0.81-1.00: Almost perfect")
print("  0.61-0.80: Substantial")
print("  0.41-0.60: Moderate")
print("  0.21-0.40: Fair")
print("  0.00-0.20: Slight")
print("  < 0.00: Poor (worse than chance)")

## G-Eval: Systematic LLM-based Evaluation

G-Eval's three components: structured prompts, auto-generated CoT, and probability-weighted scoring.

In [None]:
# G-Eval prompt template for coherence evaluation.
# Fill the {document} and {summary} placeholders with str.format. The template
# ends immediately after "- Coherence: ", leaving the score as the next thing
# for the model to write.
GEVAL_COHERENCE_PROMPT = """You will be given one summary written for a news article.

Your task is to rate the summary on one metric.

Please make sure you read and understand these instructions carefully.

Evaluation Criteria:

Coherence (1-5) - the collective quality of all sentences. The summary should 
be well-structured and well-organized. The summary should not just be a heap 
of related information, but should build from sentence to sentence into a 
coherent body of information about a topic.

Evaluation Steps:

1. Read the news article carefully and identify the main topic and key points.
2. Read the summary and compare it to the news article. Check if the summary 
   covers the main topic and key points, and if it presents them in a clear 
   and logical order.
3. Assign a score for coherence on a scale of 1 to 5, where 1 is the lowest 
   and 5 is the highest, based on the Evaluation Criteria.

Source Text:
{document}

Summary:
{summary}

Evaluation Form (scores ONLY):
- Coherence: """

# Worked example: a short source text and a candidate summary of it.
document = """The quarterly earnings report showed a 15% increase in revenue, 
driven primarily by strong performance in the cloud services division. 
However, operating costs rose by 8%, partially offsetting gains."""

summary = """The company's revenue grew substantially due to cloud services' 
success, though higher operating expenses moderated the overall financial 
improvement."""

# Render the template once, then display it under a header and a rule.
rendered_prompt = GEVAL_COHERENCE_PROMPT.format(document=document, summary=summary)
separator = "-" * 50
print("G-Eval Coherence Prompt:")
print(separator)
print(rendered_prompt)

## Probability-Weighted Scoring

Instead of discrete scores, use token probabilities to compute expected values.

In [None]:
def probability_weighted_score(score_probs: dict[int, float]) -> float:
    """
    Return the expected value of a score distribution.

    score_probs maps each score to its probability,
    e.g. {1: 0.1, 2: 0.2, 3: 0.5, 4: 0.15, 5: 0.05}.
    The result is the sum of score * probability over all entries;
    an empty mapping yields 0.
    """
    expected = 0
    for score in score_probs:
        expected += score * score_probs[score]
    return expected

# Example from the book: two summaries with same discrete score but different distributions
summary_a_probs = {1: 0.0, 2: 0.05, 3: 0.70, 4: 0.20, 5: 0.05}  # Confident 3
summary_b_probs = {1: 0.0, 2: 0.35, 3: 0.55, 4: 0.08, 5: 0.02}  # Uncertain 2-3

# Score both distributions with the same helper.
score_a, score_b = (
    probability_weighted_score(distribution)
    for distribution in (summary_a_probs, summary_b_probs)
)

print("Both would receive discrete score of 3, but:")
for label, value in (("A", score_a), ("B", score_b)):
    print(f"  Summary {label} weighted score: {value:.2f}")
print("\n^ Probability weighting correctly ranks A > B")

## Implementing LLM-as-a-Judge

Using OpenAI's API with logprobs for probability-weighted scoring.

In [None]:
from openai import OpenAI

# Module-level OpenAI client shared by the judge functions in this notebook.
client = OpenAI()

def geval_score(document: str, summary: str, model: str = "gpt-4o-mini") -> dict:
    """
    G-Eval style scoring with probability weighting.

    Sends the coherence prompt, limits the reply to a single token, and uses
    the top-5 log-probabilities of that token to build a probability
    distribution over the scores 1-5.

    Args:
        document: Source text the summary was written for.
        summary: Summary to rate.
        model: Chat model name passed to the completions API.

    Returns:
        dict with keys:
            "discrete_score": the score the model emitted; if the reply is not
                one of "1".."5", the most probable score from the logprobs.
            "weighted_score": probability-weighted expected score; equals the
                discrete score when no score token appears in the logprobs.
            "probabilities": {score: probability} for scores 1-5, normalized
                to sum to 1 (all zeros when no score token was found).

    Raises:
        ValueError: if neither the reply text nor the logprobs contain a
            score between 1 and 5.
    """
    valid_tokens = {"1", "2", "3", "4", "5"}
    prompt = GEVAL_COHERENCE_PROMPT.format(document=document, summary=summary)

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=1,
        logprobs=True,
        top_logprobs=5
    )

    choice = response.choices[0]

    # Build probability distribution over valid scores
    score_probs = {i: 0.0 for i in range(1, 6)}
    if choice.logprobs and choice.logprobs.content:
        for item in choice.logprobs.content[0].top_logprobs:
            token = item.token.strip()
            if token in valid_tokens:
                # Accumulate rather than assign: variants such as "3" and " 3"
                # are separate tokens that strip to the same score.
                score_probs[int(token)] += float(np.exp(item.logprob))

    # Normalize probabilities over the score tokens that were observed
    total = sum(score_probs.values())
    if total > 0:
        score_probs = {k: v / total for k, v in score_probs.items()}

    # The reply may be empty or a non-score token; fall back to the most
    # probable score instead of crashing inside int().
    reply = (choice.message.content or "").strip()
    if reply in valid_tokens:
        discrete_score = int(reply)
    elif total > 0:
        discrete_score = max(score_probs, key=score_probs.get)
    else:
        raise ValueError(f"Model reply {reply!r} is not a score between 1 and 5")

    # With no usable logprobs the expected value would be 0.0, which is not a
    # valid point on the 1-5 scale; report the discrete score instead.
    if total > 0:
        weighted_score = probability_weighted_score(score_probs)
    else:
        weighted_score = float(discrete_score)

    return {
        "discrete_score": discrete_score,
        "weighted_score": weighted_score,
        "probabilities": score_probs
    }

In [None]:
# Score the example document/summary pair with the G-Eval judge
result = geval_score(document, summary)

# Show only the first 60 characters of each input
doc_preview = document[:60]
summary_preview = summary[:60]
print(f"Document: {doc_preview}...")
print(f"Summary: {summary_preview}...")
print(f"\nDiscrete score: {result['discrete_score']}")
print(f"Weighted score: {result['weighted_score']:.2f}")

# One row per score: percentage plus a text bar (20 blocks = 100%)
print("\nProbability distribution:")
distribution = result["probabilities"]
for score in distribution:
    prob = distribution[score]
    bar = int(prob * 20) * "█"
    print(f"  {score}: {prob:.2%} {bar}")

## Pairwise Comparison

For higher correlation with human preferences, compare two responses directly.

In [None]:
# Prompt template for pairwise judging of two assistant answers.
# Placeholders: {question}, {answer_a}, {answer_b}.
# The judge is told to end with one of the markers "[[A]]", "[[B]]" or "[[C]]"
# (tie); the judge functions in this notebook look for those markers in the reply.
PAIRWISE_PROMPT = """Please act as an impartial judge and evaluate the quality of the 
responses provided by two AI assistants to the user's question.

Your evaluation should consider correctness, helpfulness, and relevance.

Avoid any position biases and ensure that the order in which the responses 
were presented does not influence your decision.

[User Question]
{question}

[Assistant A's Answer]
{answer_a}

[Assistant B's Answer]
{answer_b}

After providing your explanation, output your final verdict by strictly 
following this format: "[[A]]" if assistant A is better, "[[B]]" if 
assistant B is better, and "[[C]]" for a tie."""

def pairwise_judge(
    question: str, 
    answer_a: str, 
    answer_b: str,
    model: str = "gpt-4o-mini"
) -> dict:
    """
    Judge which response is better using pairwise comparison.

    Args:
        question: The user question both answers respond to.
        answer_a: Answer shown to the judge as "Assistant A".
        answer_b: Answer shown to the judge as "Assistant B".
        model: Chat model name passed to the completions API.

    Returns:
        dict with keys:
            "verdict": "A", "B", "tie", or "unknown" when the reply contains
                no verdict marker (for example if it was cut off by the
                token limit).
            "reasoning": the judge's full reply text ("" if it was empty).
    """
    prompt = PAIRWISE_PROMPT.format(
        question=question,
        answer_a=answer_a,
        answer_b=answer_b
    )

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=500
    )

    # The API may return None for content; normalize to an empty string.
    content = response.choices[0].message.content or ""

    # Extract verdict. The prompt asks for the verdict AFTER the explanation,
    # so the last marker in the reply is the final answer. Checking for
    # "[[A]]" first would misread replies that mention another marker earlier
    # in the reasoning.
    markers = re.findall(r"\[\[([ABC])\]\]", content)
    if markers:
        verdict = {"A": "A", "B": "B", "C": "tie"}[markers[-1]]
    else:
        verdict = "unknown"

    return {"verdict": verdict, "reasoning": content}

In [None]:
# Try the pairwise judge on a short and a longer correct answer
question = "What is the capital of France?"
answer_a = "The capital of France is Paris."
answer_b = "Paris is the capital city of France, known for the Eiffel Tower and rich cultural heritage."

result = pairwise_judge(question, answer_a, answer_b)

# Assemble the report first, then print it line by line
report_lines = [
    f"Question: {question}",
    f"\nAssistant A: {answer_a}",
    f"Assistant B: {answer_b}",
    f"\nVerdict: {result['verdict']}",
    f"\nReasoning:\n{result['reasoning']}",
]
for report_line in report_lines:
    print(report_line)

## Mitigating Position Bias

Run comparisons twice with swapped positions to detect and correct for position bias.

In [None]:
def pairwise_judge_debiased(
    question: str,
    answer_a: str,
    answer_b: str,
    model: str = "gpt-4o-mini"
) -> dict:
    """
    Run pairwise comparison twice with swapped positions.
    Only return a verdict if both runs agree.

    Args:
        question: The user question both answers respond to.
        answer_a: First candidate answer.
        answer_b: Second candidate answer.
        model: Model name forwarded to pairwise_judge.

    Returns:
        dict with keys:
            "verdict": "A", "B" or "tie" (in terms of the caller's answer_a /
                answer_b) when both runs agree on a parsed verdict,
                otherwise "inconclusive".
            "confident": True only when both runs agree on a parsed verdict.
            "original_order": verdict of the run with answer_a shown first.
            "swapped_order": raw verdict of the run with answer_b shown
                first, expressed in that run's own A/B labels (not mapped back).
    """
    # Original order: A first, B second
    result1 = pairwise_judge(question, answer_a, answer_b, model)

    # Swapped order: B first, A second
    result2 = pairwise_judge(question, answer_b, answer_a, model)

    # Map swapped result back: "A" in the swapped run refers to answer_b
    swapped_verdict = {"A": "B", "B": "A", "tie": "tie"}.get(result2["verdict"], "unknown")

    # Check agreement. Two unparseable replies also "agree" with each other,
    # so an "unknown" verdict must never count as a confident result.
    runs_agree = result1["verdict"] == swapped_verdict
    confident = runs_agree and result1["verdict"] != "unknown"
    final_verdict = result1["verdict"] if confident else "inconclusive"

    return {
        "verdict": final_verdict,
        "confident": confident,
        "original_order": result1["verdict"],
        "swapped_order": result2["verdict"]
    }

# Test debiased comparison
result = pairwise_judge_debiased(question, answer_a, answer_b)
print(f"Original order verdict: {result['original_order']}")
# 'swapped_order' holds the raw label from the swapped run, where "A" refers
# to answer_b; it has not been mapped back, so the label must say so.
print(f"Swapped order verdict: {result['swapped_order']} (raw label, before mapping back)")
print(f"Final verdict: {result['verdict']}")
print(f"Confident: {result['confident']}")

## Reference-Guided Grading

For math/reasoning tasks, generate the reference answer separately to avoid context contamination.

In [None]:
# Prompt template for pairwise judging against a reference answer.
# Placeholders: {question}, {reference}, {answer_a}, {answer_b}.
# The judge is asked to finish with "[[A]]", "[[B]]" or "[[C]]" (tie).
REFERENCE_GUIDED_PROMPT = """Please act as an impartial judge and evaluate the quality 
of the responses provided by two AI assistants to the user's question.

Your evaluation should consider correctness and helpfulness. You will be given 
a reference answer, assistant A's answer, and assistant B's answer.

Your job is to evaluate which assistant's answer is better.

Begin your evaluation by comparing both assistants' answers with the reference 
answer. Identify and correct any mistakes.

[User Question]
{question}

[Reference Answer]
{reference}

[Assistant A's Answer]
{answer_a}

[Assistant B's Answer]
{answer_b}

After providing your explanation, output your final verdict: "[[A]]" if 
assistant A is better, "[[B]]" if assistant B is better, "[[C]]" for a tie."""

def reference_guided_judge(
    question: str,
    answer_a: str,
    answer_b: str,
    model: str = "gpt-4o-mini"
) -> dict:
    """
    Two-phase evaluation:
    1. Generate reference answer in clean context
    2. Compare candidates against reference

    Args:
        question: The user question both answers respond to.
        answer_a: Answer shown to the judge as "Assistant A".
        answer_b: Answer shown to the judge as "Assistant B".
        model: Chat model name used for both phases.

    Returns:
        dict with keys:
            "verdict": "A", "B", "tie", or "unknown" when the reply contains
                no verdict marker.
            "reference": the reference answer generated in phase 1.
            "reasoning": the judge's full reply text from phase 2.
    """
    # Phase 1: Generate reference answer. Only the question is sent, so the
    # candidate answers cannot influence the reference.
    ref_response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": question}],
        max_tokens=500
    )
    # The API may return None for content; normalize to an empty string.
    reference = ref_response.choices[0].message.content or ""

    # Phase 2: Compare with reference
    prompt = REFERENCE_GUIDED_PROMPT.format(
        question=question,
        reference=reference,
        answer_a=answer_a,
        answer_b=answer_b
    )

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=500
    )

    content = response.choices[0].message.content or ""

    # The verdict is requested after the explanation, so the last marker in
    # the reply is the final answer; earlier markers may be incidental
    # mentions inside the reasoning.
    markers = re.findall(r"\[\[([ABC])\]\]", content)
    if markers:
        verdict = {"A": "A", "B": "B", "C": "tie"}[markers[-1]]
    else:
        verdict = "unknown"

    return {
        "verdict": verdict,
        "reference": reference,
        "reasoning": content
    }

In [None]:
# Try reference-guided grading on a percentage problem, with the correct
# answer in slot A and an incorrect one in slot B
math_question = "What is 15% of 80?"
answer_correct = "15% of 80 is 12."
answer_wrong = "15% of 80 is 15."  # Common mistake

result = reference_guided_judge(math_question, answer_correct, answer_wrong)

# Assemble the report first, then print it line by line
summary_lines = [
    f"Question: {math_question}",
    f"\nAssistant A: {answer_correct}",
    f"Assistant B: {answer_wrong}",
    f"\nReference: {result['reference']}",
    f"\nVerdict: {result['verdict']}",
]
for summary_line in summary_lines:
    print(summary_line)

## Structured Output for Evaluation

Use JSON schemas to ensure consistent, parseable outputs. Note: reasoning must come before score.

In [None]:
from pydantic import BaseModel

class EvaluationResult(BaseModel):
    """Schema for structured evaluation output."""
    # Declared first so the reasoning is written before the score is chosen.
    reasoning_steps: list[str]  # Must come BEFORE score (autoregressive ordering)
    # Integer rating. The schema itself does not restrict it to the 1-5 range
    # that the evaluation prompt asks for.
    score: int

# Prompt template for structured coherence scoring.
# Placeholders: {document}, {summary}. Asks for step-by-step reasoning
# followed by a 1-5 score.
STRUCTURED_PROMPT = """Evaluate this summary for coherence on a scale of 1-5.

Coherence: The summary should be well-structured, well-organized, and build 
from sentence to sentence into a coherent body of information.

Document: {document}
Summary: {summary}

Provide step-by-step reasoning, then assign a score."""

def structured_eval(document: str, summary: str, model: str = "gpt-4o-mini") -> dict:
    """
    Structured evaluation with reasoning before score.

    Args:
        document: Source text the summary was written for.
        summary: Summary to rate.
        model: Chat model name passed to the completions API.

    Returns:
        dict with keys:
            "reasoning": list of reasoning steps produced by the model.
            "score": the integer score produced by the model.

    Raises:
        ValueError: if the response carries no parsed EvaluationResult
            (for example when the model refuses to answer).
    """
    response = client.beta.chat.completions.parse(
        model=model,
        messages=[{
            "role": "user",
            "content": STRUCTURED_PROMPT.format(document=document, summary=summary)
        }],
        response_format=EvaluationResult
    )

    message = response.choices[0].message
    result = message.parsed
    if result is None:
        # Without this check the attribute access below would fail with an
        # opaque AttributeError on None; surface the refusal text if present.
        reason = getattr(message, "refusal", None) or "no parsed output returned"
        raise ValueError(f"Structured evaluation failed: {reason}")

    return {
        "reasoning": result.reasoning_steps,
        "score": result.score
    }

# Run the structured judge on the example pair and list its reasoning
result = structured_eval(document, summary)

print("Reasoning steps:")
for number, step in enumerate(result["reasoning"], start=1):
    print(f"  {number}. {step}")

final_score = result["score"]
print(f"\nScore: {final_score}")

## Using Anthropic's Claude as a Judge

Claude can also serve as an LLM judge with similar prompting patterns.

In [None]:
from anthropic import Anthropic

# Module-level Anthropic client used by claude_judge.
anthropic_client = Anthropic()

def claude_judge(
    question: str,
    answer_a: str,
    answer_b: str,
    model: str = "claude-sonnet-4-20250514"
) -> dict:
    """
    Pairwise comparison using Claude.

    Args:
        question: The user question both answers respond to.
        answer_a: Answer shown to the judge as "Assistant A".
        answer_b: Answer shown to the judge as "Assistant B".
        model: Claude model name passed to the Messages API.

    Returns:
        dict with keys:
            "verdict": "A", "B", "tie", or "unknown" when the reply contains
                no verdict marker.
            "reasoning": the judge's full reply text ("" if it had no text).
    """
    prompt = PAIRWISE_PROMPT.format(
        question=question,
        answer_a=answer_a,
        answer_b=answer_b
    )

    response = anthropic_client.messages.create(
        model=model,
        max_tokens=500,
        messages=[{"role": "user", "content": prompt}]
    )

    # response.content is a list of content blocks; join the text blocks
    # instead of assuming the list is non-empty and starts with text.
    content = "".join(
        block.text for block in response.content if block.type == "text"
    )

    # The verdict is requested after the explanation, so the last marker in
    # the reply is the final answer; earlier markers may be incidental
    # mentions inside the reasoning.
    markers = re.findall(r"\[\[([ABC])\]\]", content)
    if markers:
        verdict = {"A": "A", "B": "B", "C": "tie"}[markers[-1]]
    else:
        verdict = "unknown"

    return {"verdict": verdict, "reasoning": content}

# Ask Claude to judge the same question/answer pair used earlier
result = claude_judge(question=question, answer_a=answer_a, answer_b=answer_b)
claude_verdict = result["verdict"]
print(f"Claude's verdict: {claude_verdict}")

## Exercises

1. Compute Spearman's ρ and Kendall's τ between two human annotators' scores. When do they disagree?

2. Implement a verbosity bias test: create two responses with identical content but different lengths. Does the judge prefer the longer one?

3. Build a multi-dimensional rubric that evaluates responses on helpfulness, accuracy, and tone. Use geometric mean to combine scores.

In [None]:
# Exercise 2: Verbosity bias test
# Both answers state the same fact; the second adds several extra sentences.
concise_answer = "The capital of France is Paris."
verbose_answer = """The capital of France is Paris. Paris is a beautiful city 
located in the north-central part of France. It is known for many famous 
landmarks including the Eiffel Tower, the Louvre Museum, and Notre-Dame 
Cathedral. The city has been the capital since the 10th century and remains 
the political, economic, and cultural center of France today."""

# The concise answer goes in slot A and the verbose one in slot B.
verbosity_question = "What is the capital of France?"
result = pairwise_judge(
    question=verbosity_question,
    answer_a=concise_answer,
    answer_b=verbose_answer,
)

verbose_preview = verbose_answer[:50]
print(f"Concise: {concise_answer}")
print(f"Verbose: {verbose_preview}...")
print(f"\nVerdict: {result['verdict']}")
print("^ Does the judge exhibit verbosity bias?")