# Setup

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cp -R ./drive/MyDrive/Grounding_LM/ ./

In [1]:
import evaluate
import nltk
from nltk.translate.bleu_score import corpus_bleu
import numpy as np
import pandas as pd
from tqdm import tqdm

nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Helper functions

In [2]:
def read_dataset(dataname, modelname):
    df_actual = pd.read_csv(f"data/{dataname}/test.csv")
    df_generated = pd.read_csv(f"results/{modelname}_large_{dataname}.csv")
    df_merge = pd.merge(df_actual, df_generated, on='id')
    return df_merge

In [5]:
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("sacrebleu")
bertscore_metric = evaluate.load("bertscore")

def rougeScore(preds, refs):
    pred_tokens = ["\n".join(nltk.sent_tokenize(str(item).strip())) for item in preds]
    refs_tokens = ["\n".join(nltk.sent_tokenize(item.strip())) for item in refs]
    rogue_scores = rouge_metric.compute(predictions=pred_tokens, references=refs_tokens, use_stemmer=True)
    rouge_dict = {key: value * 100 for key, value in rogue_scores.items()}
    return pd.DataFrame(rouge_dict, index=[0])

def bleuScore(preds, refs):
    bleu_dic = {}
    bleu_dic['bleu-1-grams'] = corpus_bleu(refs, preds, weights=(1.0, 0, 0, 0))
    bleu_dic['bleu-1-2-grams'] = corpus_bleu(refs, preds, weights=(0.5, 0.5, 0, 0))
    bleu_dic['bleu-1-3-grams'] = corpus_bleu(refs, preds, weights=(0.3, 0.3, 0.3, 0))
    bleu_dic['bleu-1-4-grams'] = corpus_bleu(refs, preds, weights=(0.25, 0.25, 0.25, 0.25))
    sacrebleu = bleu_metric.compute(predictions=preds, references=refs)
    bleu_dic['sacrebleu'] = sacrebleu['score']
    return pd.DataFrame(bleu_dic, index=[0])

def bertScore(preds, refs):
    bert_scores = bertscore_metric.compute(predictions=preds, references=refs, lang="en",model_type="distilbert-base-uncased")
    bert_dict = { 
        "bert_precision": np.mean(bert_scores['precision']),
        "bert_recall":np.mean(bert_scores['recall']),
        "bert_F1": np.mean(bert_scores['f1'])
        }
    return pd.DataFrame(bert_dict, index=[0])

def calculateMetrics(df_data):
    generated_summaries = df_data['generated'].values
    actual_summaries = df_data['summary'].values

    df_rouge = rougeScore(generated_summaries, actual_summaries)
    df_blue = bleuScore(generated_summaries, actual_summaries)
    df_bert = bertScore(generated_summaries, actual_summaries)
    
    df_results = pd.concat([df_rouge, df_blue, df_bert], axis=1)
    return df_results

# Calculate metric scores

In [6]:
datasets = ["xsum", "cnn_dailymail"]
models = ["t5", "bart"]

results = []
for dataset in tqdm(datasets, desc="dataset loop", position=0):
    for model in tqdm(models, desc="model loop", position=1):
        df_data = read_dataset(dataname=dataset, modelname=model)
        df_metrics = calculateMetrics(df_data)
        results.append(df_metrics)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe

In [7]:
df_merge = pd.concat(results, axis=0)
df_merge.index = ['T5-large-XSum', 'BART-large-XSum', 'T5-large-CNN/Dailymail', 'BART-large-CNN/Dailymail']
df_merge.to_csv("summary_metrics.csv")
df_merge

# pd.read_csv("summary_metrics.csv", index_col=0)

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum,bleu-1-grams,bleu-1-2-grams,bleu-1-3-grams,bleu-1-4-grams,sacrebleu,bert_precision,bert_recall,bert_F1
T5-large-XSum,40.009396,16.606227,32.047688,32.035912,0.231279,7.1736579999999995e-155,1.650622e-185,1.263405e-231,10.075937,0.846289,0.822568,0.833896
BART-large-XSum,44.94845,21.598009,36.50578,36.507496,0.205585,6.763446e-155,1.5933239999999998e-185,1.22675e-231,15.327531,0.850004,0.845715,0.847557
T5-large-CNN/Dailymail,23.960417,6.842925,16.562357,20.852092,0.229411,7.144618e-155,1.64661e-185,1.260845e-231,1.257227,0.80856,0.737391,0.770937
BART-large-CNN/Dailymail,26.193286,7.862815,17.641984,22.477502,0.199242,6.658284e-155,1.578413e-185,1.217176e-231,2.055372,0.801691,0.747194,0.773092
