### This notebook generates summaries for the articles in the test set using the specified model, and calculates ROUGE scores against the reference summaries.

In [None]:
# Colab setup: install the packages this notebook needs and mount Google Drive,
# because every data/model/output path configured below lives under /content/drive.
!pip install py-rouge
!pip install sentencepiece
!pip install transformers
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# model to use: 'bart', 't5' or 'pegasus' (compared case-insensitively)
model_type = 'bart'

# root of the project folder on the mounted Google Drive; every path below is
# built from it, so moving the project only requires changing this one line
project_dir = '/content/drive/MyDrive/NLP Summarization Project'

# if model_path = None, load pretrained weights from Huggingface
model_path = f'{project_dir}/Finetuned Models/HP optimized/'

# number of beams used when generating summaries
beam_size = 2
# number of articles sent to the model per generation call
batch_size = 10

# path to the tokenized test articles
test_tensor_path = f'{project_dir}/Data/Test_Articles_BART.pt'

# where the test dataframe with generated summaries and ROUGE scores is written
output_df_path = f'{project_dir}/Generated Summaries/1f BART_finetuned on optimal hp_beam{beam_size}.csv'
# csv holding the test articles and their reference summaries
test_df_path = f'{project_dir}/Data/Cleaned Test Data.csv'

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from time import time
import rouge
from time import time
import nltk
# Fetch NLTK's 'punkt' data at runtime (presumably needed by the ROUGE scorer's
# tokenization -- TODO confirm; nothing else in this notebook calls nltk directly).
nltk.download('punkt')

# Prefer the GPU, but fall back to the CPU instead of crashing when the runtime
# has no CUDA device (torch.cuda.get_device_name raises in that case).
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('GPU: not available, falling back to CPU (generation will be slow)')

In [None]:
# load finetuned or pretrained weights

# model type -> (tokenizer class, model class, Hugging Face checkpoint).
# The checkpoint always supplies the tokenizer, and it also supplies the weights
# whenever model_path is empty/None.
model_registry = {
    'bart': (BartTokenizer, BartForConditionalGeneration, 'facebook/bart-large-cnn'),
    't5': (T5Tokenizer, T5ForConditionalGeneration, 't5-base'),
    'pegasus': (PegasusTokenizer, PegasusForConditionalGeneration, 'google/pegasus-newsroom'),
}

model_key = model_type.lower()
if model_key not in model_registry:
    # Fail here with a clear message rather than later with a NameError on `model`.
    raise ValueError(
        f"Unsupported model_type {model_type!r}; expected one of {sorted(model_registry)}"
    )
tokenizer_class, model_class, pretrained_name = model_registry[model_key]

# Every model type now honours a configured model_path; previously the pegasus
# branch silently replaced it with the pretrained checkpoint.
if not model_path:
    model_path = pretrained_name

tokenizer = tokenizer_class.from_pretrained(pretrained_name)
model = model_class.from_pretrained(model_path)

model = model.to(device)

# NOTE: torch.load unpickles the file, so test_tensor_path must point to a trusted file.
articles_tokenized = torch.load(test_tensor_path)

In [None]:
# generate summaries

# Wall-clock start of the run. generate_summary reads this global for its progress
# messages, and the final print in this cell uses it for the total elapsed time.
start_time = time()

def generate_summary(articles, progress_every=5000):
    """Generate one decoded summary per row of ``articles``.

    Relies on notebook-level globals: ``model``, ``tokenizer``, ``device``,
    ``batch_size``, ``beam_size`` and ``start_time`` (used only for the
    progress messages).

    Args:
        articles: tensor of token ids indexed as ``[row, token]``; rows are
            processed in order, ``batch_size`` at a time.
        progress_every: print a progress line each time roughly this many
            articles have been processed (default 5000).

    Returns:
        A list with one decoded summary per input row, in input order.
    """
    summary_list = []
    num_of_articles = articles.size()[0]

    for start_row in range(0, num_of_articles, batch_size):
        # Slicing clamps at the end of the tensor, so the last (shorter) batch
        # needs no special case.
        input_batch = articles[start_row:start_row + batch_size, :].to(device)

        # specify pad_token_id instead of attention_mask because in the truncated conditions, the tokens are manually
        # replaced using numpy, so the attention mask created during tokenization would not apply
        summary_batch = model.generate(
            input_ids=input_batch,
            pad_token_id=tokenizer.pad_token_id,
            num_beams=beam_size,
        )
        summary_list.extend(
            tokenizer.decode(summary, skip_special_tokens=True, clean_up_tokenization_spaces=False)
            for summary in summary_batch
        )

        # Log on the first batch that starts at or just past each multiple of
        # progress_every. An exact `% progress_every == 0` check almost never
        # fires when batch_size does not divide progress_every.
        if start_row % progress_every < batch_size:
            print(f'article {start_row}: time {round((time() - start_time)/60)}')

    return summary_list

# Generate a summary for every tokenized test article, in the order stored in the tensor.
summary_list = generate_summary(articles_tokenized)

# Total elapsed time since start_time, rounded to the nearest whole hour.
print(round((time() - start_time)/3600))

In [None]:
# Load the test set and report how many rows it has.
test_df = pd.read_csv(test_df_path)
print(test_df.shape[0])

# The tokenized articles keep the row order of the test dataset csv, so the
# generated summaries can simply be attached as a new column.
test_df = test_df.assign(generated=summary_list)

In [None]:
# evaluate the generated summary against the reference summary
rouge_settings = dict(
    metrics=['rouge-n', 'rouge-l'],
    max_n=2,
    limit_length=True,
    length_limit=150,
    length_limit_type='words',
    apply_avg=False,
    apply_best=False,
    alpha=0.5,  # Default F1_score
    weight_factor=1.2,
    stemming=True,
)
evaluator = rouge.Rouge(**rouge_settings)

# Time the scoring pass and print the elapsed seconds.
t = time()
scores = evaluator.get_scores(test_df['generated'], test_df['summary'])
print(round(time() - t))

# Store every score as its own column in the test dataframe. Columns are created
# in the order f, r, p and, within each, rouge-1, rouge-2, rouge-l.
# Index 0 takes the first element of each per-row score list (presumably one
# element per reference summary -- confirm against py-rouge).
metric_prefixes = (('rouge-1', 'rouge1'), ('rouge-2', 'rouge2'), ('rouge-l', 'rougeL'))
for stat in ('f', 'r', 'p'):
    for metric, prefix in metric_prefixes:
        test_df[f'{prefix}{stat}'] = [entry[stat][0] for entry in scores[metric]]

test_df.to_csv(output_df_path)