/
metrics.py
67 lines (54 loc) · 2.16 KB
/
metrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# coding: utf-8
"""
This module holds various MT evaluation metrics.
"""
import sacrebleu
def chrf(hypotheses, references):
    """
    Corpus-level character F-score, delegated to sacrebleu.

    :param hypotheses: list of hypotheses (strings)
    :param references: list of references (strings)
    :return: whatever ``sacrebleu.corpus_chrf`` returns for this corpus
    """
    chrf_result = sacrebleu.corpus_chrf(
        hypotheses=hypotheses,
        references=references,
    )
    return chrf_result
def bleu(hypotheses, references):
    """
    Raw corpus BLEU from sacrebleu (without tokenization).

    :param hypotheses: list of hypotheses (strings)
    :param references: list of references (strings)
    :return: the ``score`` attribute of the ``sacrebleu.raw_corpus_bleu``
        result
    """
    # sacrebleu takes a list of reference streams; exactly one is passed here.
    reference_streams = [references]
    bleu_result = sacrebleu.raw_corpus_bleu(
        sys_stream=hypotheses,
        ref_streams=reference_streams,
    )
    return bleu_result.score
def token_accuracy(hypotheses, references, level="word"):
    """
    Compute the accuracy of hypothesis tokens: correct tokens / all tokens.
    A hypothesis token is correct if the reference has the same token at the
    same position. Positions beyond the end of the shorter sequence can never
    match, but every hypothesis token still counts towards the total.

    :param hypotheses: list of hypotheses (strings)
    :param references: list of references (strings)
    :param level: segmentation level, either "word", "bpe", or "char";
        "word" and "bpe" split on single spaces, any other value is treated
        as character level
    :return: percentage (0-100) of correct hypothesis tokens, or 0.0 if the
        hypotheses contain no tokens at all
    :raises AssertionError: if hypotheses and references differ in length
    """
    assert len(hypotheses) == len(references)

    space_separated = level in ("word", "bpe")

    def tokenize(sentence):
        if space_separated:
            # "".split(" ") yields [""]; an empty sentence has no tokens.
            return sentence.split(" ") if sentence else []
        # str.split("") raises ValueError, so characters are listed directly.
        return list(sentence)

    correct_tokens = 0
    all_tokens = 0
    for hyp, ref in zip(hypotheses, references):
        hyp_tokens = tokenize(hyp)
        ref_tokens = tokenize(ref)
        # Count tokens, not characters of the raw hypothesis string.
        all_tokens += len(hyp_tokens)
        # zip stops at the shorter sequence: min(len(h), len(r)) comparisons.
        correct_tokens += sum(
            1 for h_i, r_i in zip(hyp_tokens, ref_tokens) if h_i == r_i
        )
    return (correct_tokens / all_tokens) * 100 if all_tokens > 0 else 0.0
def sequence_accuracy(hypotheses, references):
    """
    Compute the accuracy of whole sequences: correct sequences / all sequences.
    A hypothesis is correct only if it is equal to the reference at the same
    index.

    :param hypotheses: list of hypotheses (strings)
    :param references: list of references (strings)
    :return: percentage (0-100) of exactly matching hypotheses, or 0.0 if
        there are no hypotheses
    """
    assert len(hypotheses) == len(references)
    if not hypotheses:
        return 0.0
    exact_matches = 0
    for candidate, target in zip(hypotheses, references):
        if candidate == target:
            exact_matches += 1
    return (exact_matches / len(hypotheses)) * 100