## Self-defined ROUGE

In [13]:
#!/usr/bin/env python
# 
# File Name : rouge.py
#
# Description : Computes ROUGE-L metric as described by Lin and Hovey (2004)
#
# Creation Date : 2015-01-07 06:03
# Author : Ramakrishna Vedantam <vrama91@vt.edu>

import numpy as np
import pdb

def my_lcs(string, sub):
    """
    Calculates longest common subsequence for a pair of tokenized strings
    :param string : list of str : tokens from a string split using whitespace
    :param sub : list of str : shorter string, also split using whitespace
    :returns: length (list of int): length of the longest common subsequence between the two strings

    Note: my_lcs only gives length of the longest common subsequence, not the actual LCS
    """
    if(len(string)< len(sub)):
        sub, string = string, sub

    lengths = [[0 for i in range(0,len(sub)+1)] for j in range(0,len(string)+1)]

    for j in range(1,len(sub)+1):
        for i in range(1,len(string)+1):
            if(string[i-1] == sub[j-1]):
                lengths[i][j] = lengths[i-1][j-1] + 1
            else:
                lengths[i][j] = max(lengths[i-1][j] , lengths[i][j-1])

    return lengths[len(string)][len(sub)]

class Rouge():
    '''
    Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set

    '''
    def __init__(self):
        # vrama91: updated the value below based on discussion with Hovey
        self.beta = 1.2

    def calc_score(self, candidate, refs):
        """
        Compute ROUGE-L score given one candidate and references for an image
        :param candidate: str : candidate sentence to be evaluated
        :param refs: list of str : COCO reference sentences for the particular image to be evaluated
        :returns score: int (ROUGE-L score for the candidate evaluated against references)
        """
        assert(len(candidate)==1)	
        assert(len(refs)>0)         
        prec = []
        rec = []

        # split into tokens
        token_c = candidate[0].split(" ")
    	
        for reference in refs:
            # split into tokens
            token_r = reference.split(" ")
            # compute the longest common subsequence
            lcs = my_lcs(token_r, token_c)
            prec.append(lcs/float(len(token_c)))
            rec.append(lcs/float(len(token_r)))

        prec_max = max(prec)
        rec_max = max(rec)

        if(prec_max!=0 and rec_max !=0):
            score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max)
        else:
            score = 0.0
        return score

    def compute_score(self, gts, res):
        """
        Computes Rouge-L score given a set of reference and candidate sentences for the dataset
        Invoked by evaluate_captions.py 
        :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values 
        :param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values
        :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images)
        """
        # assert(gts.keys() == res.keys())
        # imgIds = gts.keys()
        #
        # score = []
        # for id in imgIds:
        #     hypo = res[id]
        #     ref  = gts[id]

        assert (len(gts) == len(res))

        score = []
        for idx in range(len(gts)):
            hypo = [res[idx]]
            ref  = [gts[idx]]

            score.append(self.calc_score(hypo, ref))

            # Sanity check.
            assert(type(hypo) is list)
            assert(len(hypo) == 1)
            assert(type(ref) is list)
            assert(len(ref) > 0)

        average_score = np.mean(np.array(score))
        return average_score, np.array(score)

    def method(self):
        return "Rouge"


In [14]:
rouge=Rouge()

In [19]:
hyp_path="/home/yued/phd/LRD/lrd_summ/result/textrank.txt"
ref_path="/home/yued/phd/LRD/lrd_summ/result/ref.txt"
with open(hyp_path,'r') as f:
    hyps = f.readlines()
with open(ref_path,'r') as f:
    refs = f.readlines()

In [20]:
print(rouge.compute_score(refs,hyps))

(0.18408339989737207, array([0.14124107, 0.18305173, 0.19205397, ..., 0.20318489, 0.17531193,
       0.15565479]))


In [21]:
hyp_path="/home/yued/phd/LRD/lrd_summ/result/textrank.tokenized"
ref_path="/home/yued/phd/LRD/lrd_summ/result/ref.tokenized"
with open(hyp_path,'r') as f:
    hyps = f.readlines()
with open(ref_path,'r') as f:
    refs = f.readlines()
print(rouge.compute_score(refs,hyps))

(0.2050288622385424, array([0.1568215 , 0.19615274, 0.21642576, ..., 0.23908572, 0.19167629,
       0.20697546]))


## ROUGE package

In [30]:
# creare the data as pyrouge needs
with open("/home/yued/phd/LRD/lrd_summ/result/textrank.txt",'r') as f:
    hyps=f.readlines()
for i,h in enumerate(hyps):
    with open("/home/yued/phd/LRD/lrd_summ/result/hyps/%d_decoded.txt"%i,'w') as f:
        if len(h)==0: #deal with empty lines
            h='n'
        f.write(h)

with open("/home/yued/phd/LRD/lrd_summ/result/ref.txt",'r') as f:
    refs=f.readlines()
for i,h in enumerate(refs):
    with open("/home/yued/phd/LRD/lrd_summ/result/refs/%d_reference.txt"%i,'w') as f:
        f.write(h)


In [26]:
from pyrouge import Rouge155
r = Rouge155()
# set directories
r.system_dir = "/home/yued/phd/LRD/lrd_summ/result/hyps/"
r.model_dir = "/home/yued/phd/LRD/lrd_summ/result/refs/"
 
# define the patterns
r.system_filename_pattern = '(\d+)_decoded.txt'
r.model_filename_pattern = '#ID#_reference.txt'
 
# use default parameters to run the evaluation
output = r.convert_and_evaluate()
print(output)
output_dict = r.output_to_dict(output)

Exception: Cannot set system directory because the path decoded/ does not exist.

## file2rouge

In [None]:
import files2rouge
files2rouge.run(hyp_path, ref_path)