Follow: [github_vikasnar/Bleu](https://github.com/vikasnar/Bleu/blob/master/calculatebleu.py)

In [1]:
import sys
import codecs
import os
import math
import operator
import json
from functools import reduce

## Load data

In [59]:
# Path of the candidate text file passed to fetch_data.
cand_file = './candidate.txt'
# Directory passed to fetch_data as the source of reference files.
ref_dir = './testSet'

In [60]:
def fetch_data(cand, ref):
    """Load the candidate and reference sentences as lists of lines.

    Args:
        cand: Path of the UTF-8 candidate file.
        ref: Either the path of a single reference file (recognised by
            ``'.txt'`` appearing in the path) or a directory; every file
            found under the directory is read as one reference.

    Returns:
        A ``(candidate, references)`` tuple. ``candidate`` is the list of
        lines of the candidate file; ``references`` is a list holding one
        list of lines per reference file. Line endings are kept.
    """
    def _read_lines(path):
        # The context manager closes the handle even if decoding fails.
        with codecs.open(path, 'r', 'utf-8') as handle:
            return handle.readlines()

    references = []
    if '.txt' in ref:
        references.append(_read_lines(ref))
    else:
        for root, dirs, files in os.walk(ref):
            # Sorting in place makes the traversal, and therefore the order
            # of the returned references, independent of the file system.
            dirs.sort()
            for name in sorted(files):
                references.append(_read_lines(os.path.join(root, name)))
    candidate = _read_lines(cand)
    return candidate, references

In [61]:
# Read the candidate lines and one list of lines per reference file.
candidate, references = fetch_data(cand_file, ref_dir)

In [12]:
candidate

['It is a guide to action which ensures that the military always obeys the commands of the party.']

In [13]:
references

[['It is the guiding principle which guarantees the military forces always being under the command of the Party.'],
 ['It is the practical guide for the army always to heed the directions of the party.'],
 ['It is a guide to action that ensures that the military will forever heed Party commands.']]

## 直接使用封装好的模块

In [10]:
# Geometric mean
def geometric_mean(precisions):
    """Return the geometric mean of the values in ``precisions``.

    The values are multiplied together and the n-th root of the product is
    taken, where n is ``len(precisions)``.
    """
    product = reduce(operator.mul, precisions)
    root_exponent = 1.0 / len(precisions)
    return product ** root_exponent

- reduce() 函数会对参数序列中元素进行累积
- 几何平均： $\left( \prod_{i=1}^{n} x_{i} \right)^{\frac{1}{n}} = \sqrt[n]{x_{1} x_{2} \cdots x_{n}}$

In [5]:
# NOTE: this import rebinds geometric_mean, replacing any definition of the
# same name made in an earlier cell.
from calculatebleu import count_ngram, geometric_mean

# Collect one precision value per n-gram order, n = 1..4.
precisions = []
for i in range(4):
    # Judging by the names, count_ngram yields (precision, brevity penalty)
    # for n = i + 1 -- confirm against calculatebleu.
    pr, bp = count_ngram(candidate, references, i+1)
    print("pr: {}, bp: {}".format(pr, bp))
    precisions.append(pr)
# bp still holds the value from the last loop iteration (n = 4).
bleu = geometric_mean(precisions) * bp
print('BLEU: ', bleu)

pr: 0.8888888888888888, bp: 1.0
pr: 0.5882352941176471, bp: 1.0
pr: 0.4375, bp: 1.0
pr: 0.26666666666666666, bp: 1.0
BLEU:  0.49697705300310346


## NLTK中的评估方法
nltk中有封装好的包，输入要求转换为单词列表  
从下面的例子可以看出，结果和自己的代码计算出来的结果是一致的

In [49]:
from nltk.translate.bleu_score import sentence_bleu

In [63]:
# Turn every reference into a list of whitespace-separated words.
ref_list = []
for reference in references:     # there are three references in this run
    ref_sentence = reference[0]  # str: the sentence taken out of the list
    words = ref_sentence.strip().split()
    ref_list.append(words)

# Tokenise the candidate sentence the same way.
cand_sentence = candidate[0]
cand_list = cand_sentence.strip().split()

In [68]:
# Score the tokenised candidate against the tokenised references, leaving
# every optional argument of sentence_bleu at its default.
sentence_bleu(ref_list, cand_list)

0.4969770530031034

## 分离测试

In [118]:
# n-gram order used by the step-by-step cells below (2 = bigrams).
n_grams = 2
n = n_grams  # short alias used in the slicing expressions

In [119]:
# Running totals; the cells below update them with +=.
clipped_count = 0  # clipped n-gram matches (numerator of pr)
count = 0  # candidate n-grams counted (denominator of pr)
r = 0  # accumulated best-matching reference length
c = 0  # accumulated candidate length

# Calculate precision for each sentence
ref_counts = []  # one {ngram: occurrences} dict per reference
ref_lengths = []  # number of words in each reference
# Build dictionary of ngram counts
for reference in references:
    ref_sentence = reference[0]
    ngram_d = {}
    words = ref_sentence.strip().split()
    
    ref_lengths.append(len(words))
    # Number of n-grams in the sentence; not positive when it has fewer than
    # n words, in which case the loop below does not run.
    limits = len(words) - n + 1
    
    # Slide a window of n words over the sentence; n-grams are lower-cased.
    for i in range(limits):
        ngram = ' '.join(words[i:i+n]).lower()
        if ngram in ngram_d.keys():
            ngram_d[ngram] += 1
        else:
            ngram_d[ngram] = 1
    ref_counts.append(ngram_d)

ref_counts: 是一个字典列表，每个字典为一句话的词块统计结果,   
ref_lengths: 记录了每条参考译文的单词数（即 len(words)，与 n_gram 的取值无关）；每句话切割出的词块总数是 len(words) - n + 1，而 ref_counts 中每个字典的 key 的个数是去重后的词块数   

In [122]:
# candidate
# Count the n-grams of the candidate sentence (lower-cased).
cand_sentence = candidate[0]
cand_dict = {}  # {ngram: occurrences in the candidate}
words = cand_sentence.strip().split()
# Total number of n-grams in the candidate, duplicates included.
limits = len(words) - n + 1
for i in range(0, limits):
    ngram = ' '.join(words[i:i + n]).lower()
    if ngram in cand_dict:
        cand_dict[ngram] += 1
    else:
        cand_dict[ngram] = 1

cand_dict: 字典, candidate的词块统计结果  
limits： candidate 切割出的词块总数（含重复词块，等于 len(words) - n + 1），也是完全匹配的最高个数；只有在没有重复词块时才等于 cand_dict 中 key 的个数

In [129]:
def clip_count(cand_d, ref_ds):
    """Return the total clipped n-gram count of a candidate.

    Each candidate n-gram contributes its own count, capped at the largest
    count that n-gram reaches in any single reference dictionary; n-grams
    found in no reference contribute nothing.

    Args:
        cand_d: Mapping of candidate n-gram to its number of occurrences.
        ref_ds: Iterable of mappings, one per reference, of n-gram to count.

    Returns:
        The sum of the capped counts over all candidate n-grams.
    """
    total = 0
    for ngram, cand_freq in cand_d.items():
        # The leading 0 is the cap when no reference contains the n-gram.
        ref_cap = max([0] + [ref[ngram] for ref in ref_ds if ngram in ref])
        total += min(cand_freq, ref_cap)
    return total

# Add this sentence's figures to the running totals. Both use +=, so
# re-running the cell without resetting the totals adds them again.
clipped_count += clip_count(cand_dict, ref_counts)
count += limits
clipped_count

40

In [133]:
ref_lengths

[18, 16, 16]

In [134]:
def best_length_match(ref_l, cand_l):
    """Return the reference length closest to the candidate length.

    When several reference lengths are equally close, the shortest one is
    returned, so the result does not depend on the order of ``ref_l``.

    Args:
        ref_l: Non-empty sequence of reference lengths.
        cand_l: Length of the candidate.

    Returns:
        The element of ``ref_l`` with the smallest absolute difference to
        ``cand_l``; the smaller length wins a tie.

    Raises:
        IndexError: If ``ref_l`` is empty.
    """
    if not ref_l:
        raise IndexError("ref_l must contain at least one reference length")
    # Compare by (distance, length): distance first, shorter length on ties.
    return min(ref_l, key=lambda ref: (abs(cand_l - ref), ref))

# words still holds the candidate tokens from the candidate-counting cell.
r += best_length_match(ref_lengths, len(words))  # closest reference length
c += len(words)  # candidate length

In [136]:
# Precision = clipped matches / total candidate n-grams; it is set to 0
# without dividing when nothing matched.
if clipped_count == 0:
    pr = 0
else:
    pr = float(clipped_count) / count
    
def brevity_penalty(c, r):
    """Return the BLEU brevity penalty.

    Args:
        c: Total length of the candidate.
        r: Total best-matching reference length.

    Returns:
        1 when the candidate is longer than the reference, 0 when the
        candidate is empty, and ``exp(1 - r / c)`` otherwise.
    """
    if c > r:
        return 1
    if c == 0:
        # An empty candidate would otherwise divide by zero below.
        return 0
    return math.exp(1 - (float(r) / c))

# Penalty computed from the accumulated candidate (c) and reference (r) lengths.
bp = brevity_penalty(c, r)

In [137]:
# With a single precision the geometric mean is pr itself, so this is pr * bp.
geometric_mean([pr]) * bp

0.5882352941176471