In [1]:
# Install the metric libraries used by this notebook.
# %pip (rather than !pip) installs into the environment of the running
# kernel, and -q keeps the download log out of the saved notebook.
# NOTE(review): versions are not pinned; pin them (pkg==x.y.z) once a known
# working set is confirmed so that reruns stay reproducible.
%pip install -q evaluate rouge-score bert_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting datasets>=2.0.0
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill
  Downloading dill-0.3.6-py3-none-any.whl (110 k

In [2]:
import os
import numpy as np
from rouge_score import rouge_scorer
from evaluate import load

In [None]:
# Optional cell: run it only if you want BertScore as one of the metrics.
# Requirement: a download of 1.5GB or more.
# score() reads this module-level BERT object when metric == "BertScore",
# so this cell has to run before that metric is requested.
BERT = load("bertscore")

In [15]:
# Root folder of the results, and the sub-folder holding the reference summaries.
FOLDER = "./results/"
REFERENCE = "Given_Summaries/"

# The unseen test data index: files before this position in the sorted
# listing are skipped when loading.
EVAL_INDEX = 480

# Each model name is the suffix of a "Generated_<model>" folder under FOLDER.
MODELS = [
    "TextRank",
    "GPT2",
    "DistilGPT",
    "DistilGPT_filtered",
    "T5_8epoch",
    "T5_16epoch",
    "ChatGPTPrompt",
]

# Metric names understood by score().
METRICS = [
    "rouge1",
    "rougeL",
    "BertScore",
]

def load_articles(PATH="", begin_from=0):
    """Read the ``.txt`` files found directly inside ``PATH``.

    Args:
        PATH: Directory to list.
        begin_from: Number of files to skip from the start of the
            name-sorted listing.

    Returns:
        A list with the full text of each remaining file, in sorted
        file-name order.
    """
    txt_names = [name for name in os.listdir(PATH) if name.endswith('.txt')]
    txt_names.sort()

    contents = []
    for name in txt_names[begin_from:]:
        with open(os.path.join(PATH, name), 'r') as handle:
            contents.append(handle.read())

    return contents

In [6]:
def score(model="T5_16epoch", metric="rouge1"):
    """Score one model's generated summaries against the reference summaries.

    Reference texts are loaded from ``FOLDER``/``REFERENCE`` and generated
    texts from ``FOLDER``/``Generated_<model>``. Both listings are read from
    position ``EVAL_INDEX`` onwards and paired up by position.

    Args:
        model: Suffix of the ``Generated_<model>`` folder to evaluate.
        metric: ``"rouge1"``, ``"rougeL"`` or ``"BertScore"``.

    Returns:
        For the ROUGE metrics, a dict with the keys ``'precision'``,
        ``'recall'`` and ``'f1'``, each holding one value per
        reference/summary pair. For ``"BertScore"``, the object returned by
        ``BERT.compute`` (evaluate() reads the same three keys from it).

    Raises:
        ValueError: If ``metric`` is not one of the supported names, or if
            the number of generated summaries differs from the number of
            references.
    """
    rouge_metrics = ("rouge1", "rougeL")
    if metric not in rouge_metrics and metric != "BertScore":
        # Fail before touching the disk: an unknown metric used to fall
        # through both branches and return empty lists.
        raise ValueError(
            "Unsupported metric {!r}; expected one of {}".format(
                metric, list(rouge_metrics) + ["BertScore"]))

    reference = load_articles(os.path.join(FOLDER, REFERENCE), EVAL_INDEX)
    summary = load_articles(os.path.join(FOLDER, "Generated_" + model), EVAL_INDEX)

    # Texts are paired by position, so differing counts mean the pairs can
    # no longer be trusted to line up.
    if len(summary) != len(reference):
        raise ValueError(
            "Model {!r}: {} generated summaries but {} references".format(
                model, len(summary), len(reference)))

    # BertScore evaluation (needs the module-level BERT object to be loaded)
    if metric == "BertScore":
        return BERT.compute(predictions=summary, references=reference, lang="en")

    # ROUGE evaluation
    results = {'precision': [], 'recall': [], 'f1': []}
    scorer = rouge_scorer.RougeScorer([metric], use_stemmer=True)
    for target, prediction in zip(reference, summary):
        precision, recall, fmeasure = scorer.score(
            target=target, prediction=prediction)[metric]
        results['precision'].append(precision)
        results['recall'].append(recall)
        results['f1'].append(fmeasure)

    return results

In [7]:
def evaluate():
    """Print the average precision, recall and F1 of every metric/model pair.

    Loops over ``METRICS`` (outer) and ``MODELS`` (inner), calls ``score()``
    for each pair and prints the three averages rounded to three decimals.
    The ``"BertScore"`` metric is skipped with a notice when no module-level
    ``BERT`` object exists, so the notebook still runs when the optional
    BertScore cell was not executed.
    """
    for metric in METRICS:
        print("Metric = {}".format(metric))
        if metric == "BertScore" and "BERT" not in globals():
            # score() would otherwise fail with a NameError on BERT.
            print("\tSkipped: BERT is not loaded (run the optional BertScore cell first)")
            continue
        for model in MODELS:
            print("\tModel = {}".format(model))
            results = score(model=model, metric=metric)
            for label, key in (("Precision", "precision"), ("Recall", "recall"), ("F1", "f1")):
                print("\t\t{}: {}".format(label, round(np.average(results[key]), 3)))

In [12]:
# Print the averaged precision, recall and F1 of every model for every metric.
evaluate()

Metric = rouge1
	Model = TextRank


FileNotFoundError: ignored