# Chapter 3: BERTScore and COMET

Hands-on implementation of semantic evaluation metrics.

In [None]:
import string

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

## Why Lexical Metrics Fail

BLEU/ROUGE treat words as atomic symbols. Synonyms score as poorly as random words.

In [None]:
# Example from the book
# One reference sentence and two candidates: candidate1 reuses two of the
# reference's words ("people", "like") but is about something else, while
# candidate2 restates the reference using mostly different words.
reference = "people like foreign cars"
candidate1 = "People like visiting places abroad."  # Different topic!
candidate2 = "Consumers prefer imported cars."       # Same meaning!

def word_overlap(cand, ref):
    """Return the fraction of distinct reference words that also occur in the candidate.

    Both texts are lower-cased and split on whitespace, and each word has its
    leading/trailing punctuation stripped so that "cars." and "cars" count as
    the same word.

    Args:
        cand: Candidate text.
        ref: Reference text.

    Returns:
        Number of distinct reference words found in the candidate divided by
        the number of distinct reference words (a float in [0, 1]). Returns
        0.0 when the reference contains no words, instead of dividing by zero.
    """
    def _words(text):
        # Strip punctuation from the word edges only; drop tokens that were
        # nothing but punctuation.
        stripped = (tok.strip(string.punctuation) for tok in text.lower().split())
        return {tok for tok in stripped if tok}

    ref_words = _words(ref)
    if not ref_words:
        return 0.0
    return len(_words(cand) & ref_words) / len(ref_words)

# Score both candidates against the reference up front, then report them.
overlap_c1 = word_overlap(candidate1, reference)
overlap_c2 = word_overlap(candidate2, reference)

print(f"Reference: '{reference}'")
print(f"\nCandidate 1 (wrong topic): '{candidate1}'")
print(f"  Word overlap: {overlap_c1:.0%}")
print(f"\nCandidate 2 (same meaning): '{candidate2}'")
print(f"  Word overlap: {overlap_c2:.0%}")
print("\n^ Lexical metrics penalize valid paraphrases!")

## Embeddings: Words as Vectors

In embedding space, "attorney" and "lawyer" are neighbors, not strangers.

In [None]:
# Load a pretrained encoder model
# The tokenizer and the encoder are loaded from the same checkpoint name; the
# embedding helpers defined in later cells read both as notebook-level globals.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()  # evaluation mode (PyTorch turns dropout off), so embeddings are repeatable

print(f"Loaded {model_name}")
print(f"Embedding dimension: {model.config.hidden_size}")

In [None]:
def get_embeddings(text: str) -> torch.Tensor:
    """Encode ``text`` into one sentence-level vector by mean pooling.

    The text is run through the notebook-level ``tokenizer`` and ``model``.
    The final-layer token vectors are then averaged, weighting each position
    by the attention mask so that padded positions contribute nothing.

    Args:
        text: Text to embed.

    Returns:
        The mask-weighted mean of ``last_hidden_state`` taken over dim 1 (the
        token axis), i.e. one pooled vector per encoded sequence.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Inference only: no gradient tracking needed.
    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state

    # Stretch the mask across the hidden dimension so it lines up with `hidden`.
    weights = encoded["attention_mask"].unsqueeze(-1).expand(hidden.size()).float()

    weighted_total = torch.sum(hidden * weights, dim=1)
    # The clamp keeps the division safe should a mask ever sum to zero.
    n_real_tokens = torch.clamp(weights.sum(dim=1), min=1e-9)

    return weighted_total / n_real_tokens

def cosine_similarity(emb1: torch.Tensor, emb2: torch.Tensor) -> float:
    """Return the cosine similarity of two embeddings as a plain Python float.

    Args:
        emb1: First embedding; the similarity is taken along dim 1.
        emb2: Second embedding, same layout as ``emb1``.

    Returns:
        The similarity as a float. ``.item()`` is used for the conversion, so
        the comparison must yield exactly one value (a single pair of vectors).
    """
    similarity = F.cosine_similarity(emb1, emb2, dim=1)
    return similarity.item()

# Test with book examples
# Embed the reference and both candidates in one pass over the three sentences.
emb_ref, emb_c1, emb_c2 = map(get_embeddings, (reference, candidate1, candidate2))

# Compare each candidate with the reference before printing anything.
sim_c1 = cosine_similarity(emb_ref, emb_c1)
sim_c2 = cosine_similarity(emb_ref, emb_c2)

print(f"Cosine similarity (semantic):")
print(f"  Candidate 1 (wrong topic): {sim_c1:.3f}")
print(f"  Candidate 2 (same meaning): {sim_c2:.3f}")
print("\n^ Semantic metrics recognize paraphrases!")

## BERTScore: Token-Level Greedy Matching

For each token in one sentence, find its best semantic match in the other.

In [None]:
def get_token_embeddings(text: str) -> tuple[torch.Tensor, list[str]]:
    """Return one contextual embedding per token of ``text``, plus the tokens.

    Uses the notebook-level ``tokenizer`` and ``model``. The first and last
    positions of the encoded sequence (the [CLS] and [SEP] special tokens) are
    dropped from both outputs, so embeddings and tokens stay aligned.

    Args:
        text: Text to tokenize and embed.

    Returns:
        ``(embeddings, tokens)``: the final-layer vectors of the first
        sequence without its first and last positions, and the matching
        token strings.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True)

    # Inference only: no gradient tracking needed.
    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state

    # One slice shared by tokens and vectors: everything between [CLS] and [SEP].
    inner = slice(1, -1)
    token_ids = encoded["input_ids"][0]
    tokens = tokenizer.convert_ids_to_tokens(token_ids)[inner]
    embeddings = hidden[0, inner, :]

    return embeddings, tokens

# Example from the book
# Dedicated names keep the global `reference` from the earlier lexical-overlap
# and cosine-similarity cells intact, so those cells still print the same
# results when they are re-run after this one.
weather_reference = "The weather is cold today."
weather_candidate = "It is freezing today."

# Token-level embeddings and token strings, reused by the following cells.
ref_emb, ref_tokens = get_token_embeddings(weather_reference)
cand_emb, cand_tokens = get_token_embeddings(weather_candidate)

print(f"Reference tokens: {ref_tokens}")
print(f"Candidate tokens: {cand_tokens}")

In [None]:
def pairwise_cosine_similarity(emb1: torch.Tensor, emb2: torch.Tensor) -> torch.Tensor:
    """Build the matrix of cosine similarities between two sets of row vectors.

    Args:
        emb1: 2-D tensor, one vector per row.
        emb2: 2-D tensor, one vector per row, same vector width as ``emb1``.

    Returns:
        Tensor whose entry ``[i, j]`` is the cosine similarity between row
        ``i`` of ``emb1`` and row ``j`` of ``emb2``.
    """
    # After L2-normalising every row, a plain dot product is the cosine.
    unit_rows_1, unit_rows_2 = (F.normalize(rows, p=2, dim=1) for rows in (emb1, emb2))
    return unit_rows_1.mm(unit_rows_2.t())

# Compute similarity matrix
# Rows follow the reference tokens, columns the candidate tokens.
sim_matrix = pairwise_cosine_similarity(ref_emb, cand_emb)

print("Pairwise similarity matrix (reference × candidate):")
print(f"Shape: {sim_matrix.shape} ({len(ref_tokens)} ref × {len(cand_tokens)} cand)\n")

# Display as table
# pandas (`pd`) comes from the notebook's import cell at the top, so this cell
# no longer carries its own import.
df = pd.DataFrame(
    sim_matrix.numpy(),
    index=ref_tokens,
    columns=cand_tokens
)
print(df.round(3).to_string())

In [None]:
def bertscore_components(ref_emb: torch.Tensor, cand_emb: torch.Tensor) -> dict:
    """
    Calculate BERTScore precision, recall, and F1 from token embeddings.

    Recall: For each reference token, find best match in candidate.
    Precision: For each candidate token, find best match in reference.
    F1: Harmonic mean of the averaged precision and recall.

    Args:
        ref_emb: Reference token embeddings, one row per token.
        cand_emb: Candidate token embeddings, one row per token.

    Returns:
        dict with float entries "precision", "recall" and "f1", plus the
        per-token tensors "recall_scores" (one value per reference token) and
        "precision_scores" (one value per candidate token).
        If either side has no tokens, nothing can be matched: the three scores
        are 0.0 and the per-token tensors are all zeros.
    """
    n_ref, n_cand = ref_emb.shape[0], cand_emb.shape[0]
    if n_ref == 0 or n_cand == 0:
        # Taking a max over an empty axis raises, so report "no match" instead.
        return {
            "precision": 0.0,
            "recall": 0.0,
            "f1": 0.0,
            "recall_scores": ref_emb.new_zeros(n_ref),
            "precision_scores": cand_emb.new_zeros(n_cand),
        }

    sim_matrix = pairwise_cosine_similarity(ref_emb, cand_emb)

    # Recall: max over candidate for each reference token
    recall_scores = sim_matrix.max(dim=1).values
    recall = recall_scores.mean().item()

    # Precision: max over reference for each candidate token
    precision_scores = sim_matrix.max(dim=0).values
    precision = precision_scores.mean().item()

    # F1 (0.0 rather than 0 in the fallback so the value is always a float)
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "recall_scores": recall_scores,
        "precision_scores": precision_scores
    }

result = bertscore_components(ref_emb, cand_emb)

# One print call for the summary; sep="\n" puts each part on its own line.
print(
    "BERTScore (unscaled):",
    f"  Precision: {result['precision']:.3f}",
    f"  Recall: {result['recall']:.3f}",
    f"  F1: {result['f1']:.3f}",
    sep="\n",
)

print("\nRecall breakdown (best match for each reference token):")
# Convert the per-token scores to Python floats once, then pair them with tokens.
per_token_recall = result['recall_scores'].tolist()
for token, score in zip(ref_tokens, per_token_recall):
    print(f"  {token:12} -> {score:.3f}")

## Greedy Matching Visualization

Which candidate token best matches each reference token?

In [None]:
# Find best matches
# For every reference row, the column index of its highest similarity.
best_matches = sim_matrix.argmax(dim=1)

print("Greedy matching (reference -> best candidate match):")
print("-" * 45)
# Plain Python ints make the column index usable for both list and tensor lookups.
for row, (ref_tok, col) in enumerate(zip(ref_tokens, best_matches.tolist())):
    cand_tok = cand_tokens[col]
    score = sim_matrix[row, col].item()
    print(f"  '{ref_tok}' -> '{cand_tok}' (similarity: {score:.3f})")

print("\n^ 'cold' matches 'freezing' despite no lexical overlap!")

## Using the `bert_score` Library

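In practice, use the optimized `bert_score` implementation, which also supports IDF weighting and baseline rescaling. Both are off by default and are enabled with `idf=True` and `rescale_with_baseline=True`.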

In [None]:
from bert_score import score as bert_score

# Book examples
references = [
    "The weather is cold today.",
    "people like foreign cars"
]
candidates = [
    "It is freezing today.",
    "Consumers prefer imported cars."
]

# IDF weighting and baseline rescaling are both off by default in bert_score;
# switch them on explicitly so the scores match the heading printed below.
P, R, F1 = bert_score(
    candidates,
    references,
    lang="en",
    idf=True,
    rescale_with_baseline=True,
    verbose=False,
)

print("BERTScore (with IDF weighting & rescaling):")
print("-" * 50)
# P, R and F1 hold one score per candidate/reference pair, in input order.
for i, (cand, ref) in enumerate(zip(candidates, references)):
    print(f"Reference: '{ref}'")
    print(f"Candidate: '{cand}'")
    print(f"  P={P[i]:.3f}, R={R[i]:.3f}, F1={F1[i]:.3f}\n")

## COMET: Learned Evaluation from Human Judgments

COMET uses three inputs: source, candidate, and reference. Unlike BERTScore, which uses pretrained embeddings directly, COMET is **trained on human judgment data** from WMT translation tasks.

> **Note:** COMET (`unbabel-comet`) requires `transformers<5.0`. If you're using Python 3.14+ with newer transformers, install COMET in a separate environment:
> ```bash
> uv venv comet-env --python 3.12
> source comet-env/bin/activate
> uv pip install unbabel-comet
> ```

In [None]:
# COMET requires transformers<5.0 - this cell may fail in newer environments
# Set the fallbacks first so the next cell can always test COMET_AVAILABLE,
# whichever step below fails.
COMET_AVAILABLE = False
comet_model = None
try:
    from comet import download_model, load_from_checkpoint

    # Download and load COMET model
    model_path = download_model("Unbabel/wmt22-comet-da")
    comet_model = load_from_checkpoint(model_path)
    COMET_AVAILABLE = True
except ImportError:
    print("COMET not available. Install with: pip install unbabel-comet")
    print("Requires transformers<5.0 and Python<3.14")
except Exception as exc:
    # The package imported but the model could not be fetched or loaded
    # (e.g. no network, incompatible library versions). Report the reason and
    # carry on so the explanatory fallback in the next cell still runs.
    print(f"COMET could not be loaded: {type(exc).__name__}: {exc}")
    print("Continuing without COMET scores.")

In [None]:
# Example from the book: German bank translation
# Same source and reference twice; only the machine translation differs.
data = [
    dict(
        src="Die Bank war voller Kunden.",
        mt="The bank was full of customers.",
        ref="The financial institution was crowded with clients.",
    ),
    dict(
        src="Die Bank war voller Kunden.",
        mt="The riverside was full of customers.",  # Wrong sense of "Bank"!
        ref="The financial institution was crowded with clients.",
    ),
]


def _show_sample(sample):
    """Print the source, machine translation, and reference of one example."""
    print(f"Source: {sample['src']}")
    print(f"MT: {sample['mt']}")
    print(f"Ref: {sample['ref']}")


if not COMET_AVAILABLE:
    # No model loaded: list the examples and describe the expected outcome.
    print("COMET Example (requires separate environment):")
    print("-" * 50)
    for sample in data:
        _show_sample(sample)
        print()
    print("^ COMET would score the first translation higher")
    print("  because it correctly translates 'Bank' as 'bank' (financial)")
    print("  while the second mistranslates it as 'riverside'.")
else:
    output = comet_model.predict(data, batch_size=2, gpus=0)

    print("COMET scores (trained on human judgments):")
    print("-" * 50)
    # output.scores is paired with the samples in input order.
    for sample, score in zip(data, output.scores):
        _show_sample(sample)
        print(f"COMET Score: {score:.3f}\n")

## Comparing All Metrics

In [None]:
import evaluate

bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

# Test case: a paraphrase whose content words all differ (only "the" and the
# final period are shared). Dedicated names keep the global `reference` used by
# earlier cells from being overwritten here.
paraphrase_reference = "The attorney filed the legal brief."
paraphrase_candidate = "The lawyer submitted the court document."

print(f"Reference: '{paraphrase_reference}'")
print(f"Candidate: '{paraphrase_candidate}'")
print("\nMetric comparison:")
print("-" * 40)

# BLEU (references is a list of reference lists: one list per prediction)
bleu_result = bleu.compute(
    predictions=[paraphrase_candidate],
    references=[[paraphrase_reference]]
)
print(f"BLEU:      {bleu_result['bleu']:.3f}")

# ROUGE
rouge_result = rouge.compute(
    predictions=[paraphrase_candidate],
    references=[paraphrase_reference]
)
print(f"ROUGE-L:   {rouge_result['rougeL']:.3f}")

# BERTScore (returns one F1 per prediction; index 0 is the single pair)
bert_result = bertscore.compute(
    predictions=[paraphrase_candidate],
    references=[paraphrase_reference],
    lang="en"
)
print(f"BERTScore: {bert_result['f1'][0]:.3f}")

print("\n^ Semantic metrics capture paraphrase equivalence!")

## Exercises

1. Compute BERTScore for: "The committee approved the proposal yesterday after extensive debate." vs "Yesterday, following lengthy discussions, the proposal received committee approval."

2. Why might COMET give different scores than BERTScore for the same translation?

3. When would you prefer lexical metrics (BLEU/ROUGE) over semantic metrics?

In [None]:
# Exercise 1
# Scores one candidate/reference pair with the `bert_score` function imported in
# the "Using the `bert_score` Library" cell, so that cell must be run first.
ref = "The committee approved the proposal yesterday after extensive debate."
cand = "Yesterday, following lengthy discussions, the proposal received committee approval."

# Inputs are wrapped in single-element lists; index 0 picks out the only score.
P, R, F1 = bert_score([cand], [ref], lang="en", verbose=False)
print(f"BERTScore F1: {F1[0]:.3f}")
print("High score despite completely different word order!")