In [122]:
from nltk.util import ngrams
from collections import Counter
from fractions import Fraction
from nltk.translate.bleu_score import sentence_bleu, closest_ref_length, brevity_penalty
import numpy as np
import math

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# BLEU score

## example from the original paper

### data

First, let's take the example data from the original BLEU paper (Papineni et al., 2002): three reference translations and two candidate translations of the same sentence.

In [4]:
# Reference translations of the example sentence, as raw (untokenized) strings.
rt_raw = [
    'It is a guide to action that ensures that the military will forever heed Party commands',
    'It is the guiding principle which guarantees the military forces always being under the command of the Party',
    'It is the practical guide for the army always to heed the directions of the party'
]

In [13]:
# Candidate translations to be scored against the references, as raw strings.
ct_raw = [
    'It is a guide to action which ensures that the military always obeys the commands of the party',
    'It is to insure the troops forever hearing the activity guidebook that party direct'
]

In [8]:
def process_trans(t):
    """Tokenize a translation: lowercase the string, then split it on whitespace.

    Returns the list of lowercase tokens (empty list for an empty or
    whitespace-only string).
    """
    lowered = t.lower()
    tokens = lowered.split()
    return tokens

In [9]:
# Tokenized reference translations (one token list per reference).
rt = list(map(process_trans, rt_raw))

In [14]:
# Tokenized candidate translations (one token list per candidate).
ct = list(map(process_trans, ct_raw))

In [16]:
# Short names for the first and second tokenized candidates.
c1, c2 = ct[0], ct[1]

### modified unigram precision

In [35]:
def get_unigram_count_clip(u, c_count, rt_counts):
    """Clip the candidate count of unigram `u` by its maximum reference count.

    Parameters
    ----------
    u : hashable
        Key looked up in the counters (callers in this notebook pass the
        1-tuples produced by `ngrams(..., 1)`).
    c_count : Counter
        Unigram counts of the candidate translation.
    rt_counts : list of Counter
        Unigram counts, one counter per reference translation.

    Returns
    -------
    int
        `min(count of u in the candidate, largest count of u in any single
        reference)`; `0` when `rt_counts` is empty.
    """
    # default=0: with no references nothing can match, so the clipped count is
    # zero instead of max() raising ValueError on an empty sequence.
    max_ref_count = max((rt_count[u] for rt_count in rt_counts), default=0)
    return min(c_count[u], max_ref_count)

def get_unigram_modified_precision(c, rt):
    """Modified unigram precision of candidate `c` against references `rt`.

    Parameters
    ----------
    c : sequence of tokens
        Tokenized candidate translation.
    rt : list of token sequences
        Tokenized reference translations.

    Returns
    -------
    Fraction
        Sum of clipped unigram counts divided by the number of candidate
        tokens; `Fraction(0, 1)` for an empty candidate.
    """
    c_count = Counter(ngrams(c, 1))
    rt_counts = [Counter(ngrams(r, 1)) for r in rt]
    clipped_counts = sum(get_unigram_count_clip(u, c_count, rt_counts) for u in c_count)
    # Clamp the denominator to 1 so that an empty candidate yields a precision
    # of 0 instead of Fraction() raising ZeroDivisionError (nltk's
    # modified_precision applies the same guard).
    total_counts = max(1, len(c))
    return Fraction(clipped_counts, total_counts)

In [39]:
# Modified unigram precision of each candidate, printed side by side.
print(*(get_unigram_modified_precision(cand, rt) for cand in (c1, c2)))

17/18 4/7


### modified ngram precision

In [51]:
def get_count_clip(u, c_count, rt_counts):
    """Clip the candidate count of n-gram `u` by its maximum reference count.

    Parameters
    ----------
    u : hashable
        N-gram key looked up in the counters (tuples of tokens when the
        counters are built from `ngrams`).
    c_count : Counter
        N-gram counts of the candidate translation.
    rt_counts : list of Counter
        N-gram counts, one counter per reference translation.

    Returns
    -------
    int
        The candidate count of `u`, capped at the largest count of `u` found
        in any single reference; `0` when there are no references.
    """
    # A generator with default=0 avoids both the temporary list and the
    # ValueError that max() raises on an empty reference list.
    max_ref_count = max((rt_count[u] for rt_count in rt_counts), default=0)
    return min(c_count[u], max_ref_count)

def get_mp(c, rt, n):
    """Modified n-gram precisions of candidate `c` for every order 1..n.

    Parameters
    ----------
    c : sequence of tokens
        Tokenized candidate translation.
    rt : list of token sequences
        Tokenized reference translations.
    n : int
        Highest n-gram order to compute.

    Returns
    -------
    list of Fraction
        Element `i - 1` is the clipped i-gram count divided by the total
        number of candidate i-grams. Orders for which the candidate has no
        i-grams (candidate shorter than `i` tokens) give `Fraction(0, 1)`.
    """
    mps = []
    for i in range(1, n+1):
        c_count = Counter(ngrams(c, i))
        rt_counts = [Counter(ngrams(r, i)) for r in rt]
        clipped_counts = sum(get_count_clip(u, c_count, rt_counts) for u in c_count)
        # A candidate with fewer than i tokens has no i-grams at all; clamp
        # the denominator to 1 so the precision is 0 rather than a
        # ZeroDivisionError (nltk's modified_precision does the same).
        total_counts = max(1, sum(c_count.values()))
        mps.append(Fraction(clipped_counts, total_counts))
    return mps

In [52]:
get_mp(c1, rt, 1), get_mp(c2, rt, 1)

([Fraction(17, 18)], [Fraction(4, 7)])

In [53]:
get_mp(c1, rt, 2), get_mp(c2, rt, 2)

([Fraction(17, 18), Fraction(10, 17)], [Fraction(4, 7), Fraction(1, 13)])

### brevity penalty

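The paper does not make it entirely clear how to compute the brevity penalty for a single sentence, so we follow the `nltk` algorithm (which is easy to read in its source [code](https://www.nltk.org/_modules/nltk/translate/bleu_score.html)). First we find the reference length that is closest to the candidate length; **ties are broken by taking the shortest reference length** (this last part is important). For `c1` (length `18`) the closest reference length is `18`; for `c2` (length `14`) it is `16`. The penalty is then $BP = 1$ if the candidate is longer than the closest reference length $r$, and $BP = e^{1 - r/c}$ otherwise, where $c$ is the candidate length. For `c1` we have $c = r$, so $BP = e^{0} = 1$, i.e. there is no brevity penalty.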

In [110]:
[len(r) for r in rt], len(c1), len(c2)

([16, 18, 16], 18, 14)

In [117]:
# Use len(c1) instead of a hard-coded 18 so the cell stays correct if the candidate changes.
c1_closest, c2_closest = closest_ref_length(rt, len(c1)), closest_ref_length(rt, len(c2))

In [118]:
c1_closest, c2_closest

(18, 16)

In [119]:
brevity_penalty(c1_closest, len(c1))

1.0

In [120]:
brevity_penalty(c2_closest, len(c2))

0.8668778997501817

In [123]:
# Manual check of the brevity penalty for c2: exp(1 - r/c), with r the closest
# reference length and c the candidate length.
math.exp(1 - c2_closest / len(c2))

0.8668778997501817

### BLEU score

Finally, let's compute the score by hand and compare it with `nltk`. We use unigram and bigram precisions only, with equal weights of `0.5` each.

#### for `c1`

In [141]:
# Weights (.5, .5, 0, 0): only the unigram and bigram precisions contribute.
sentence_bleu(rt, c1, weights=(.5, .5, 0, 0))

0.7453559924999299

In [129]:
p1, p2 = get_mp(c1, rt, 2)

In [130]:
p1, p2

(Fraction(17, 18), Fraction(10, 17))

In [131]:
float(p1)

0.9444444444444444

In [134]:
# Weighted geometric mean of p1 and p2 (weight 0.5 each); the brevity penalty
# for c1 is 1.0, so no BP factor is needed here.
math.exp(.5 * math.log(float(p1)) + .5 * math.log(float(p2)))

0.7453559924999299

#### for `c2`

In [143]:
# Same unigram + bigram weighting as for c1, now applied to the second candidate.
sentence_bleu(rt, c2, weights=(.5, .5, 0, 0))

0.18174699151949172

In [136]:
BP = brevity_penalty(c2_closest, len(c2))

In [138]:
p1, p2 = get_mp(c2, rt, 2)

In [139]:
p1, p2

(Fraction(4, 7), Fraction(1, 13))

In [140]:
# BLEU for c2: brevity penalty times the weighted geometric mean of p1 and p2.
BP * math.exp(.5 * math.log(float(p1)) + .5 * math.log(float(p2)))

0.18174699151949172

This concludes our analysis of `BLEU` score.