In [1]:
from nltk.translate.bleu_score import sentence_bleu
from collections import Counter
import math

def ngram_counts(tokens, n):
    """Tally every contiguous n-gram of order ``n`` found in ``tokens``.

    Each n-gram is stored as a tuple key and mapped to the number of times
    it occurs. When ``tokens`` holds fewer than ``n`` items the returned
    Counter is empty.
    """
    tally = Counter()
    last_start = len(tokens) - n
    for start in range(last_start + 1):
        gram = tuple(tokens[start:start + n])
        tally[gram] += 1
    return tally

def modified_precision(candidate, references, n):
    """
    Modified (clipped) n-gram precision p_n of a candidate translation.

    candidate: list of tokens for the candidate translation
    references: list of lists of tokens for each reference translation
    n: order of n-gram

    Every candidate n-gram is credited at most as many times as it occurs
    in the single reference that contains it most often. Returns 0.0 when
    the candidate has no n-grams of this order.
    """
    hyp_ngrams = ngram_counts(candidate, n)
    if len(hyp_ngrams) == 0:
        return 0.0

    # One n-gram table per reference, built once up front.
    ref_tables = [ngram_counts(ref, n) for ref in references]

    matched = 0
    for gram, hyp_count in hyp_ngrams.items():
        # Clip ceiling: best count of this n-gram across all references
        # (0 when no reference contains it, or when there are no references).
        ceiling = max((table.get(gram, 0) for table in ref_tables), default=0)
        matched += min(hyp_count, ceiling)

    return matched / sum(hyp_ngrams.values())

def brevity_penalty(candidate, references):
    """
    Compute brevity penalty BP.

    candidate: list of tokens for candidate
    references: list of lists of tokens for each reference

    The effective reference length is the reference length closest to the
    candidate length (ties go to the shorter reference). Returns 1.0 when
    the candidate is at least that long, 0.0 for an empty candidate that is
    shorter than the closest reference, and exp(1 - r/c) otherwise.

    Prints the candidate length and the reference lengths as a side effect.
    Raises ValueError (from ``min``) when ``references`` is empty.
    """
    c = len(candidate)
    print(f"len(c) = {c}")
    # find reference length closest to c (break ties by choosing shorter)
    ref_lens = [len(ref) for ref in references]
    print(f"len(refs) = {ref_lens}")

    closest = min(ref_lens, key=lambda r: (abs(r - c), r))
    if c >= closest:
        return 1.0
    if c == 0:
        # Empty candidate against a non-empty reference: exp(1 - r/c) tends
        # to 0 as c -> 0, so return the limit rather than dividing by zero.
        return 0.0
    return math.exp(1 - closest / c)

def compute_bleu(candidate, references, weights):
    """
    Sentence-level BLEU: brevity penalty times the weighted geometric mean
    of the modified n-gram precisions.

    candidate: list of tokens for candidate translation
    references: list of lists of tokens for each reference translation
    weights: list of lambda_n weights (e.g. [0.5, 0.5, 0, 0]); the i-th
             weight applies to the (i+1)-gram precision

    Prints every p_n and the brevity penalty as a side effect. Orders whose
    weight is zero are ignored in the mean; if any order with a non-zero
    weight has zero precision the score is 0.0.
    """
    # Step 1: one modified precision per weight slot, orders 1..len(weights).
    precisions = [modified_precision(candidate, references, order)
                  for order, _ in enumerate(weights, start=1)]
    report = "\n".join(f"p_{order} = {value}"
                       for order, value in enumerate(precisions, start=1))
    print(report)

    # Step 2: length penalty.
    bp = brevity_penalty(candidate, references)
    print(f"BP = {bp}")

    # Step 3: keep only the orders that actually carry weight.
    weighted = [(p, w) for p, w in zip(precisions, weights) if w != 0]

    # A zero precision would give log(0) = -inf, i.e. a geometric mean of 0.
    if any(p == 0 for p, _ in weighted):
        return 0.0

    log_total = sum(w * math.log(p) for p, w in weighted)

    # Step 4: combine.
    return bp * math.exp(log_total)

In [2]:
# Two candidate translations scored against the same pair of references.
candidate_1 = "there is a need for adequate and predictable resources".split()
candidate_2 = "resources be sufficient and predictable to".split()
references = [
    "resources have to be sufficient and they have to be predictable".split(),
    "adequate and predictable resources are required".split()
]
weights = [0.5, 0.5, 0.0, 0.0]  # only 1- and 2-grams

# compute_bleu prints its intermediate p_n / BP values, so score both
# candidates first and print the summary afterwards.
c1_bleu_score = compute_bleu(candidate_1, references, weights)
c2_bleu_score = compute_bleu(candidate_2, references, weights)

print("BLEU using my implementation:")
print(f"c1 BLEU = {c1_bleu_score:.3f}\nc2 BLEU = {c2_bleu_score:.3f}")

p_1 = 0.4444444444444444
p_2 = 0.375
p_3 = 0.2857142857142857
p_4 = 0.16666666666666666
len(c) = 9
len(refs) = [11, 6]
BP = 0.800737402916808
p_1 = 1.0
p_2 = 0.6
p_3 = 0.25
p_4 = 0.0
len(c) = 6
len(refs) = [11, 6]
BP = 1.0
BLUE using my implementation:
c1 BLEU = 0.327
c2 BLEU = 0.775


In [3]:
# Cross-check: NLTK's sentence_bleu on the same candidates, references and weights.
print("BLEU using NLTK:")
for label, hypothesis in (("c1", candidate_1), ("c2", candidate_2)):
    print(f"{label} BLEU = {sentence_bleu(references, hypothesis, weights):.3f}")

BLEU using NLTK:
c1 BLEU = 0.327
c2 BLEU = 0.775


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [5]:
# Repeat the comparison with only the second reference available.
print("BLEU with respect to reference 2 only:")

print("\n\nMy implementation:")
for label, hypothesis in (("c1", candidate_1), ("c2", candidate_2)):
    print(f"{label} BLEU = {compute_bleu(hypothesis, [references[1]], weights):.3f}")

print("\n\nNLTK:")
for label, hypothesis in (("c1", candidate_1), ("c2", candidate_2)):
    print(f"{label} BLEU = {sentence_bleu([references[1]], hypothesis, weights):.3f}")

BLEU with respect to reference 2 only:


My implementation:
p_1 = 0.4444444444444444
p_2 = 0.375
p_3 = 0.2857142857142857
p_4 = 0.16666666666666666
len(c) = 9
len(refs) = [6]
BP = 1.0
c1 BLEU = 0.408
p_1 = 0.5
p_2 = 0.2
p_3 = 0.0
p_4 = 0.0
len(c) = 6
len(refs) = [6]
BP = 1.0
c2 BLEU = 0.316


NLTK:
c1 BLEU = 0.408
c2 BLEU = 0.316
