# Chapter 2: BLEU and ROUGE

Hands-on implementation of n-gram metrics.

In [None]:
from collections import Counter
import math

## N-grams from Scratch

An n-gram is a contiguous sequence of n tokens.

In [None]:
def get_ngrams(tokens: list[str], n: int) -> list[tuple]:
    """Extract all contiguous n-grams from a list of tokens.

    Args:
        tokens: Token sequence to slide the window over.
        n: Window size; must be at least 1.

    Returns:
        The n-grams in order of appearance, each as a tuple of ``n`` tokens.
        The list is empty when ``tokens`` has fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # Without this guard n == 0 yields len(tokens) + 1 empty tuples and a
        # negative n yields ill-formed slices, both of which corrupt counts.
        raise ValueError(f"n must be a positive integer, got {n}")
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

def tokenize(text: str) -> list[str]:
    """Lowercase ``text`` and split it on runs of whitespace.

    Punctuation is not separated, so it stays attached to the adjacent word.
    """
    lowered = text.lower()
    return lowered.split()

# Example sentence from the book
sentence = "The cat is on the mat"
tokens = tokenize(sentence)

# Build each n-gram list up front, then report them in increasing order of n.
unigrams = get_ngrams(tokens, 1)
bigrams = get_ngrams(tokens, 2)
trigrams = get_ngrams(tokens, 3)

print(f"Tokens: {tokens}")
print(f"Unigrams (n=1): {unigrams}")
print(f"Bigrams (n=2): {bigrams}")
print(f"Trigrams (n=3): {trigrams}")

## The Gaming Problem: Why Modified Precision?

Naive precision can be gamed with repetition.

In [None]:
# Example 2 from the book: a candidate that only repeats one common word
candidate = "the the the the the the the"
reference1 = "The cat is on the mat."
reference2 = "There is a cat on the mat."

cand_tokens, ref1_tokens, ref2_tokens = (
    tokenize(text) for text in (candidate, reference1, reference2)
)

# Naive precision: share of candidate words found anywhere in the references.
ref_words = set(ref1_tokens).union(ref2_tokens)
matches = len([w for w in cand_tokens if w in ref_words])
naive_precision = matches / len(cand_tokens)

print(f"Candidate: '{candidate}'")
print(f"Naive precision: {matches}/{len(cand_tokens)} = {naive_precision:.0%}")
print("\n^ This is nonsense but scores 100%!")

## Modified N-gram Precision

BLEU clips each candidate n-gram's count at the maximum number of times that n-gram occurs in any single reference.

In [None]:
def modified_precision(candidate: str, references: list[str], n: int = 1) -> float:
    """
    Compute BLEU's modified (clipped) n-gram precision.

    Each candidate n-gram is credited at most as many times as it occurs in
    the single reference where it is most frequent. The result is 0 when the
    candidate yields no n-grams of order ``n``.
    """
    candidate_ngrams = get_ngrams(tokenize(candidate), n)

    # Counter union (|) keeps the per-key maximum: the clipping ceiling.
    ceiling = Counter()
    for reference in references:
        ceiling |= Counter(get_ngrams(tokenize(reference), n))

    total = len(candidate_ngrams)
    if total <= 0:
        return 0

    # Counter intersection (&) keeps the per-key minimum: the clipped counts.
    clipped = Counter(candidate_ngrams) & ceiling
    return sum(clipped.values()) / total

# Re-score the repeated-word candidate with clipping applied
references = [reference1, reference2]
mod_precision = modified_precision(candidate, references, n=1)

# Hand-computed value shown next to the function's result for comparison.
expected_ratio = 2 / 7

print(f"Modified precision: {mod_precision:.2f}")
print("'the' appears max 2 times in any reference")
print(f"So: 2/7 = {expected_ratio:.2f}")

## BLEU on the Book's Translation Example

Chinese: 「它是保证军队永远听党指挥的行动指南。」

In [None]:
# Example 1 from the book: two candidate translations, three references
candidate1 = "It is a guide to action which ensures that the military always obeys the commands of the party."
candidate2 = "It is to ensure the troops forever hearing the activity guidebook that party direct."

ref1 = "It is a guide to action that ensures that the military will forever heed Party commands."
ref2 = "It is the guiding principle which guarantees the military forces always being under the command of the Party."
ref3 = "It is the practical guide for the army always to heed the directions of the party."

references = [ref1, ref2, ref3]

# Report modified precision for n-gram orders 1 through 4, one row per order.
print("N-gram precisions:")
print("-" * 40)
for n in (1, 2, 3, 4):
    p1, p2 = (
        modified_precision(cand, references, n)
        for cand in (candidate1, candidate2)
    )
    print(f"{n}-gram: Candidate1={p1:.3f}, Candidate2={p2:.3f}")

## Geometric vs Arithmetic Mean

Why geometric mean? It has the **zero-product property**: if any single n-gram precision is zero, the whole product, and therefore the mean, is zero.

In [None]:
def arithmetic_mean(values):
    """Return the sum of ``values`` divided by how many there are."""
    total = sum(values)
    count = len(values)
    return total / count

def geometric_mean(values):
    """Return the k-th root of the product of the k items in ``values``."""
    exponent = 1 / len(values)
    return math.prod(values) ** exponent

# System A: precision falls off gradually with n-gram order
system_a = [0.80, 0.60, 0.40, 0.20]

# System B: only unigrams ever match
system_b = [0.80, 0.00, 0.00, 0.00]

# Compute both means per system before printing the comparison.
a_arith, a_geo = arithmetic_mean(system_a), geometric_mean(system_a)
b_arith, b_geo = arithmetic_mean(system_b), geometric_mean(system_b)

print("System A (balanced): p1=0.80, p2=0.60, p3=0.40, p4=0.20")
print(f"  Arithmetic mean: {a_arith:.3f}")
print(f"  Geometric mean:  {a_geo:.3f}")

print("\nSystem B (degenerate): p1=0.80, p2=0.00, p3=0.00, p4=0.00")
print(f"  Arithmetic mean: {b_arith:.3f}")
print(f"  Geometric mean:  {b_geo:.3f}")
print("\n^ Geometric mean = 0 if ANY component fails!")

## Brevity Penalty

The brevity penalty stops a system from gaming precision by outputting only the few words it is most confident about.

In [None]:
def brevity_penalty(candidate_len: int, reference_len: int) -> float:
    """Calculate the BLEU brevity penalty.

    Args:
        candidate_len: Number of tokens in the candidate.
        reference_len: Number of tokens in the reference it is compared to.

    Returns:
        1.0 when the candidate is at least as long as the reference,
        0.0 for an empty candidate against a non-empty reference, and
        ``exp(1 - reference_len / candidate_len)`` otherwise.
    """
    if candidate_len >= reference_len:
        return 1.0
    if candidate_len <= 0:
        # The ratio below is undefined for an empty candidate (division by
        # zero); the penalty tends to 0 as the candidate shrinks, so use 0.
        return 0.0
    return math.exp(1 - reference_len / candidate_len)

# Gaming example: a three-word candidate scored against a 16-word reference
candidate_short = "the military party"
reference = ref1

c_len, r_len = (len(tokenize(text)) for text in (candidate_short, reference))

# Precision alone rewards the short candidate; the penalty scales it down.
precision = modified_precision(candidate_short, [reference], n=1)
bp = brevity_penalty(c_len, r_len)
adjusted = precision * bp

print(f"Candidate: '{candidate_short}' ({c_len} words)")
print(f"Reference length: {r_len} words")
print(f"\nUnigram precision: {precision:.2f} (looks perfect!)")
print(f"Brevity penalty: {bp:.4f}")
print(f"Adjusted score: {adjusted:.4f}")

## Complete BLEU Implementation

In [None]:
def bleu_score(candidate: str, references: list[str], max_n: int = 4) -> dict:
    """
    Calculate a sentence-level BLEU score with a breakdown of its parts.

    Args:
        candidate: Candidate text to score.
        references: One or more reference texts.
        max_n: Highest n-gram order included in the geometric mean.

    Returns:
        A dict with the keys "bleu" (brevity penalty times geometric mean),
        "brevity_penalty", "precisions" (modified precisions for orders
        1..max_n) and "geometric_mean".

    Raises:
        ValueError: If ``references`` is empty or ``max_n`` is below 1.
    """
    if not references:
        raise ValueError("references must contain at least one reference")
    if max_n < 1:
        raise ValueError(f"max_n must be a positive integer, got {max_n}")

    cand_len = len(tokenize(candidate))

    # Closest reference length. When two references are equally close, prefer
    # the shorter one (the convention in common BLEU implementations such as
    # NLTK) so the result does not depend on the order of the references.
    ref_lens = [len(tokenize(ref)) for ref in references]
    closest_ref_len = min(ref_lens, key=lambda r: (abs(r - cand_len), r))

    precisions = [
        modified_precision(candidate, references, n)
        for n in range(1, max_n + 1)
    ]

    # Geometric mean in log space; zeros are floored at epsilon so log() is
    # defined, which drives the mean close to (but not exactly) zero.
    epsilon = 1e-10
    log_precisions = [math.log(max(p, epsilon)) for p in precisions]
    geo_mean = math.exp(sum(log_precisions) / len(log_precisions))

    if cand_len == 0 and closest_ref_len > 0:
        # An empty candidate has no usable length ratio; score it as zero
        # instead of dividing by zero.
        bp = 0.0
    else:
        bp = brevity_penalty(cand_len, closest_ref_len)

    return {
        "bleu": bp * geo_mean,
        "brevity_penalty": bp,
        "precisions": precisions,
        "geometric_mean": geo_mean,
    }

# Score both candidates against the same set of references
result1, result2 = (
    bleu_score(cand, references) for cand in (candidate1, candidate2)
)

# Pre-format the per-order precisions to three decimals for display.
formatted1 = [f'{p:.3f}' for p in result1['precisions']]
formatted2 = [f'{p:.3f}' for p in result2['precisions']]

print("Candidate 1 (good translation):")
print(f"  Precisions: {formatted1}")
print(f"  BLEU: {result1['bleu']:.3f}")

print("\nCandidate 2 (poor translation):")
print(f"  Precisions: {formatted2}")
print(f"  BLEU: {result2['bleu']:.3f}")

## ROUGE: Recall-Oriented Evaluation

BLEU asks: "What fraction of the candidate is correct?" (precision)

ROUGE asks: "What fraction of the reference is covered?" (recall)

In [None]:
def rouge_n(candidate: str, reference: str, n: int = 1) -> dict:
    """
    Compute ROUGE-N for one candidate/reference pair.

    Returns a dict with "recall" (overlap over reference n-grams),
    "precision" (overlap over candidate n-grams) and their harmonic mean
    "f1". Each value is 0 when its denominator would be zero.
    """
    cand_counts = Counter(get_ngrams(tokenize(candidate), n))
    ref_counts = Counter(get_ngrams(tokenize(reference), n))

    # Counter intersection keeps, per shared n-gram, the smaller of the two
    # counts, which is exactly the clipped match count.
    overlap = sum((cand_counts & ref_counts).values())

    ref_total = sum(ref_counts.values())
    cand_total = sum(cand_counts.values())
    recall = overlap / ref_total if ref_total else 0
    precision = overlap / cand_total if cand_total else 0

    denominator = precision + recall
    if denominator > 0:
        f1 = 2 * precision * recall / denominator
    else:
        f1 = 0

    return {"recall": recall, "precision": precision, "f1": f1}

# Summary example from the book
reference = "Google announced new AI features for search."
candidate = "Google revealed AI search capabilities."

# Unigram and bigram ROUGE for the same pair.
r1, r2 = (rouge_n(candidate, reference, n=order) for order in (1, 2))

print(f"Reference: '{reference}'")
print(f"Candidate: '{candidate}'")
print(f"\nROUGE-1: recall={r1['recall']:.3f}, precision={r1['precision']:.3f}, F1={r1['f1']:.3f}")
print(f"ROUGE-2: recall={r2['recall']:.3f}, precision={r2['precision']:.3f}, F1={r2['f1']:.3f}")

## ROUGE-L: Longest Common Subsequence

Matches don't need to be contiguous—just in the same order.

In [None]:
def lcs_length(x: list, y: list) -> int:
    """Return the length of the longest common subsequence of ``x`` and ``y``."""
    # Only the previous row of the DP table is ever read, so keep two rows.
    prev_row = [0] * (len(y) + 1)

    for x_item in x:
        curr_row = [0]
        for col, y_item in enumerate(y, start=1):
            if x_item == y_item:
                # Extend the subsequence that ended just before both items.
                curr_row.append(prev_row[col - 1] + 1)
            else:
                # Otherwise carry over the better of dropping either item.
                curr_row.append(max(prev_row[col], curr_row[col - 1]))
        prev_row = curr_row

    return prev_row[-1]

def rouge_l(candidate: str, reference: str) -> dict:
    """Score ``candidate`` against ``reference`` with LCS-based ROUGE-L.

    Returns a dict with the LCS length ("lcs"), "recall" (LCS over reference
    length), "precision" (LCS over candidate length) and their harmonic mean
    "f1". Each ratio is 0 when its denominator would be zero.
    """
    hyp, ref = tokenize(candidate), tokenize(reference)
    common = lcs_length(ref, hyp)

    recall = common / len(ref) if len(ref) > 0 else 0
    precision = common / len(hyp) if len(hyp) > 0 else 0

    denominator = precision + recall
    if denominator > 0:
        f1 = 2 * precision * recall / denominator
    else:
        f1 = 0

    return {"lcs": common, "recall": recall, "precision": precision, "f1": f1}

# Reordering example from the book: same content, different word order
reference = "The company announced strong quarterly earnings"
candidate = "Strong earnings were announced by the company"

result = rouge_l(candidate, reference)
lcs_found, recall_found = result['lcs'], result['recall']

print(f"Reference: '{reference}'")
print(f"Candidate: '{candidate}'")
print(f"\nLCS length: {lcs_found}")
print(f"ROUGE-L recall: {recall_found:.3f}")

## Using the `evaluate` Library

In practice, use established implementations.

In [None]:
import evaluate

bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

# BLEU input: a list of prediction strings, and for each prediction a list of
# reference strings.
bleu_predictions = [candidate1]
bleu_references = [[ref1, ref2, ref3]]
bleu_result = bleu.compute(
    predictions=bleu_predictions,
    references=bleu_references,
)

# ROUGE input: parallel lists of prediction and reference strings.
rouge_predictions = ["Google revealed AI search capabilities."]
rouge_references = ["Google announced new AI features for search."]
rouge_result = rouge.compute(
    predictions=rouge_predictions,
    references=rouge_references,
)

bleu_precisions = [f'{p:.3f}' for p in bleu_result['precisions']]

print("BLEU (evaluate library):")
print(f"  Score: {bleu_result['bleu']:.3f}")
print(f"  Precisions: {bleu_precisions}")

print("\nROUGE (evaluate library):")
for key in rouge_result:
    value = rouge_result[key]
    print(f"  {key}: {value:.3f}")

## Exercises

1. Calculate BLEU for this gaming attempt: Candidate="party" vs Reference="The military follows party commands"

2. Why does ROUGE-2 often return 0 for short paraphrased texts?

3. Implement ROUGE-S (skip-bigram) for the sentence pair in the ROUGE-L example.

In [None]:
# Exercise 1: a one-word candidate scored against a five-word reference
result = bleu_score(
    candidate="party",
    references=["The military follows party commands"],
)
bleu_value, penalty = result['bleu'], result['brevity_penalty']

print(f"BLEU: {bleu_value:.4f}")
print(f"Brevity penalty crushes the score: {penalty:.4f}")