# Google Colab setup

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cp -R ./drive/MyDrive/Grounding_LM/ ./

# Packages and imports

In [106]:
from datasets import load_from_disk
import evaluate
import nltk
from nltk.translate.bleu_score import corpus_bleu
import numpy as np
import pandas as pd
from tqdm import tqdm

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [60]:
# Test dataset
df_xsum = pd.read_csv("data/xsum/test.csv")

# Generated summaries
df_t5 = pd.read_csv("results/T5_large_xsum.csv")
df_bart = pd.read_csv("results/bart_large_xsum.csv")

# Merge with test dataset on id column
df_t5 = pd.merge(df_xsum, df_t5, on='id')
df_bart = pd.merge(df_xsum, df_bart, on='id')

In [56]:
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("sacrebleu")
bertscore_metric = evaluate.load("bertscore")

In [107]:
def rougeScore(preds, refs):
    pred_tokens = ["\n".join(nltk.sent_tokenize(str(item).strip())) for item in preds]
    refs_tokens = ["\n".join(nltk.sent_tokenize(item.strip())) for item in refs]
    rogue_scores = rouge_metric.compute(predictions=pred_tokens, references=refs_tokens, use_stemmer=True)
    rouge_dict = {key: value * 100 for key, value in rogue_scores.items()}
    return rouge_dict

def bertScore(preds, refs):
    bert_scores = bertscore_metric.compute(predictions=preds, references=refs, lang="en",model_type="distilbert-base-uncased")
    bert_dict = {"generated": preds,"summary": refs,"P":bert_scores['precision'],"R":bert_scores['recall'],"F1":bert_scores['f1']}
    return bert_dict

def bleuScore(preds, refs):
    bleu_dic = {}
    bleu_dic['bleu-1-grams'] = corpus_bleu(refs, preds, weights=(1.0, 0, 0, 0))
    bleu_dic['bleu-1-2-grams'] = corpus_bleu(refs, preds, weights=(0.5, 0.5, 0, 0))
    bleu_dic['bleu-1-3-grams'] = corpus_bleu(refs, preds, weights=(0.3, 0.3, 0.3, 0))
    bleu_dic['bleu-1-4-grams'] = corpus_bleu(refs, preds, weights=(0.25, 0.25, 0.25, 0.25))
    sacrebleu = bleu_metric.compute(predictions=preds, references=refs)
    bleu_dic['sacrebleu'] = sacrebleu['score']
    return bleu_dic

# Calculate T5 metric scores

In [91]:
generated_t5 = df_t5['generated'].values
actual_summaries = df_t5['summary'].values

In [71]:
# Calculate Rouge scores
rouge_scores = rougeScore(generated_t5, actual_summaries)
rouge_df = pd.DataFrame(rouge_scores, index=[0])
rouge_df

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
0,40.003396,16.598095,32.046278,32.033889


In [72]:
# Calculate Bert scores
bert_df = pd.DataFrame(bertScore(generated_t5, actual_summaries))
bert_df.head()

Unnamed: 0,generated,summary,P,R,F1
0,Prison leavers in Wales are struggling to find...,"There is a ""chronic"" need for more housing for...",0.931943,0.880803,0.905652
1,A man has appeared in court charged with armed...,"A man has appeared in court after firearms, am...",0.896071,0.874545,0.885178
2,Four teenagers accused of kidnapping a black t...,Four people accused of kidnapping and torturin...,0.856715,0.836354,0.846412
3,West Brom have sacked former England goalkeepi...,West Brom have appointed Nicky Hammond as tech...,0.780209,0.80926,0.794469
4,Scientists say they have shown a diet that mim...,The pancreas can be triggered to regenerate it...,0.824701,0.800168,0.812249


In [108]:
# Calculate Bleu scores
blue_df = pd.DataFrame(bleuScore(generated_t5, actual_summaries), index=[0])
blue_df

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Unnamed: 0,bleu-1-grams,bleu-1-2-grams,bleu-1-3-grams,bleu-1-4-grams,sacrebleu
0,0.231279,7.1736579999999995e-155,1.650622e-185,1.263405e-231,10.075937


In [109]:
t5_df = pd.concat([rouge_df, blue_df], axis=1)
t5_df['bert_precision'] = np.mean(bert_df['P'].values)
t5_df['bert_recall'] = np.mean(bert_df['R'].values)
t5_df['bert_F1'] = np.mean(bert_df['F1'].values)
t5_df

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum,bleu-1-grams,bleu-1-2-grams,bleu-1-3-grams,bleu-1-4-grams,sacrebleu,bert_precision,bert_recall,bert_F1
0,44.943954,21.587879,36.508601,36.517399,0.231279,7.1736579999999995e-155,1.650622e-185,1.263405e-231,10.075937,0.846289,0.822568,0.833896


# Calculate BART metric scores

In [110]:
generated_bart = df_bart['generated'].values
actual_summaries = df_bart['summary'].values

In [115]:
rouge_scores = rougeScore(generated_t5, actual_summaries)
rouge_df = pd.DataFrame(rouge_scores, index=[0])
bert_df = pd.DataFrame(bertScore(generated_t5, actual_summaries))
blue_df = pd.DataFrame(bleuScore(generated_t5, actual_summaries), index=[0])
bart_df = pd.concat([rouge_df, blue_df], axis=1)
bart_df['bert_precision'] = np.mean(bert_df['P'].values)
bart_df['bert_recall'] = np.mean(bert_df['R'].values)
bart_df['bert_F1'] = np.mean(bert_df['F1'].values)
bart_df

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum,bleu-1-grams,bleu-1-2-grams,bleu-1-3-grams,bleu-1-4-grams,sacrebleu,bert_precision,bert_recall,bert_F1
0,40.003396,16.598095,32.046278,32.033889,0.231279,7.1736579999999995e-155,1.650622e-185,1.263405e-231,10.075937,0.846289,0.822568,0.833896


# Combine Metrics

In [116]:
df_merge = pd.concat([t5_df, bart_df], axis=0)
df_merge.index = ['T5-large-XSum', 'BART-large-XSum']
df_merge.to_csv("summary_metrics.csv")
df_merge

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum,bleu-1-grams,bleu-1-2-grams,bleu-1-3-grams,bleu-1-4-grams,sacrebleu,bert_precision,bert_recall,bert_F1
T5-large-XSum,44.943954,21.587879,36.508601,36.517399,0.231279,7.1736579999999995e-155,1.650622e-185,1.263405e-231,10.075937,0.846289,0.822568,0.833896
BART-large-XSum,40.003396,16.598095,32.046278,32.033889,0.231279,7.1736579999999995e-155,1.650622e-185,1.263405e-231,10.075937,0.846289,0.822568,0.833896


In [119]:
# pd.read_csv("summary_metrics.csv", index_col=0)

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum,bleu-1-grams,bleu-1-2-grams,bleu-1-3-grams,bleu-1-4-grams,sacrebleu,bert_precision,bert_recall,bert_F1
T5-large-XSum,44.943954,21.587879,36.508601,36.517399,0.231279,7.1736579999999995e-155,1.650622e-185,1.263405e-231,10.075937,0.846289,0.822568,0.833896
BART-large-XSum,40.003396,16.598095,32.046278,32.033889,0.231279,7.1736579999999995e-155,1.650622e-185,1.263405e-231,10.075937,0.846289,0.822568,0.833896
