In [1]:
# https://github.com/pltrdy/rouge
from rouge import Rouge

# https://github.com/li-plus/rouge-metric
from rouge_metric import PyRouge

# https://huggingface.co/metrics/rouge
from datasets import load_metric


In [31]:
# Example Input
preds=["Model output 1","Model output 2","Model output 3"]

# Single Reference for each sample
references=["Reference 1","Reference 2","Reference 3"]

# Multiple reference for each sample
multiple_references=[
	["Sample 1 Reference 1","Sample 1 Reference 2","Sample 1 Reference 3"],
	["Sample 2 Reference 1","Sample 2 Reference 2","Sample 2 Reference 3"],
	["Sample 3 Reference 1","Sample 3 Reference 2","Sample 3 Reference 3"]
]


In [3]:
# ROUGE Paper Examples
s1="police killed the gunman"
s2="police kill the gunman"
s3="the gunman kill police"
s4="the gunman police killed"

In [4]:
!pip show rouge

Name: rouge
Version: 1.0.1
Summary: Full Python ROUGE Score Implementation (not a wrapper)
Home-page: http://github.com/pltrdy/rouge
Author: pltrdy
Author-email: pltrdy@gmail.com
License: LICENCE.txt
Location: /home/tslab/anaconda3/envs/comet2020/lib/python3.8/site-packages
Requires: six
Required-by: 


In [5]:
rouge = Rouge()
scores=rouge.get_scores(preds, references, avg=True)
print(scores)

{'rouge-1': {'r': 0.5, 'p': 0.3333333333333333, 'f': 0.3999999952000001}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.5, 'p': 0.3333333333333333, 'f': 0.3999999952000001}}


In [6]:
!pip show rouge_metric

Name: rouge-metric
Version: 1.0.1
Summary: A fast python implementation of full ROUGE metrics for automatic summarization.
Home-page: https://github.com/li-plus/rouge-metric
Author: Jiahao Li
Author-email: liplus17@163.com
License: MIT
Location: /home/tslab/anaconda3/envs/comet2020/lib/python3.8/site-packages
Requires: 
Required-by: 


In [7]:
# Evaluate document-wise ROUGE scores
# skip_gap: The maximum gap between two words in skip-bigram
# mode: 'average', 'individual'
rouge = PyRouge(rouge_n=(1, 2, 4), rouge_l=True, rouge_w=True,
                rouge_w_weight=1.2, rouge_s=True, rouge_su=True#, skip_gap=4
				,mode='average')
scores=scores = rouge.evaluate(preds, references)
print(scores)

{'rouge-1': {'r': 0.10000000000000002, 'p': 0.030303030303030304, 'f': 0.04651162790697676}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-4': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.10000000000000002, 'p': 0.030303030303030304, 'f': 0.04651162790697676}, 'rouge-w-1.2': {'r': 0.14677992676220694, 'p': 0.045190953800397234, 'f': 0.06910553173165694}, 'rouge-s*': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-su*': {'r': 0.0, 'p': 0.0, 'f': 0.0}}


In [8]:
metric = load_metric("rouge")
scores=metric.compute(predictions=preds, references=references, rouge_types=["rougeL"])
print(scores)

{'rougeL': AggregateScore(low=Score(precision=0.3333333333333333, recall=0.5, fmeasure=0.4000000000000001), mid=Score(precision=0.3333333333333333, recall=0.5, fmeasure=0.4000000000000001), high=Score(precision=0.3333333333333333, recall=0.5, fmeasure=0.4000000000000001))}


# Multiple References

In [9]:
rouge = PyRouge(rouge_n=(1, 2, 4), rouge_l=True, rouge_w=True,
                rouge_w_weight=1.2, rouge_s=True, rouge_su=True#, skip_gap=4
				,mode='average')
scores = rouge.evaluate(preds, multiple_references)
print(scores)

{'rouge-1': {'r': 0.25, 'p': 0.3333333333333333, 'f': 0.28571428571428575}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-4': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.25, 'p': 0.3333333333333333, 'f': 0.28571428571428575}, 'rouge-w-1.2': {'r': 0.18946457081379978, 'p': 0.3333333333333333, 'f': 0.24160332869337336}, 'rouge-s*': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-su*': {'r': 0.0, 'p': 0.0, 'f': 0.0}}


In [33]:
#Compare Multiple Pred-Ref Pairs Avg and Pred-Refs Pair Score
preds=[]
multiple_references=[
    []
]

print("Rouge")
rouge = Rouge()

#Test for ROUGE-L
avg_scores=[]
for pred,refs in zip(preds,multiple_references):
    pair_scores=[]
    for ref in refs:
        scores=rouge.get_scores([pred], [ref], avg=True)
        # print(scores)
        pair_scores.append(scores["rouge-l"]["f"])
    avg_scores.append(max(pair_scores))
print("Final ROUGE-L F1",sum(avg_scores)/len(avg_scores))

print("rouge_metric - PyRouge")

#mode: individual, average
rouge = PyRouge(rouge_n=(1, 2), rouge_l=True,mode='average')

scores = rouge.evaluate(preds, multiple_references)
print(scores)
print("Multi-refs ROUGE-1",scores["rouge-l"]["f"])
print("Multi-refs ROUGE-L",scores["rouge-1"]["f"])

# scores = rouge.get_scores(preds, multiple_references, avg=True)
# print("Multi-refs",scores)


#Datasets metric
# metric = load_metric("rouge")

# avg_scores=[]
# for pred,refs in zip(preds,multiple_references):
#     pair_scores=[]
#     for ref in refs:
#         scores=metric.compute(predictions=[pred], references=[ref], rouge_types=["rougeL"])
#         pair_scores.append(scores["rougeL"]["f"])
#     avg_scores.append(max(pair_scores))
# print("Final ROUGE-L F1",sum(avg_scores)/len(avg_scores))

# scores=metric.compute(predictions=preds_repeated, references=multiple_references_unsqueeze, rouge_types=["rougeL"])
# print("Multi Pref-ref pair",scores)

# scores=metric.compute(predictions=preds, references=multiple_references, rouge_types=["rougeL"])
# print("Multi-refs",scores)

Rouge
Final ROUGE-L F1 0.3333333283333334
rouge_metric - PyRouge
{'rouge-1': {'r': 0.25, 'p': 0.3333333333333333, 'f': 0.28571428571428575}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.25, 'p': 0.3333333333333333, 'f': 0.28571428571428575}}
Multi-refs ROUGE-1 0.28571428571428575
Multi-refs ROUGE-L 0.28571428571428575
