In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset
import torch
import pickle
from rouge_score import rouge_scorer

In [None]:
# Load the DistilBART tokenizer and seq2seq model from a single checkpoint id
bart_checkpoint = "sshleifer/distilbart-cnn-12-6"
tokenizer_bart = AutoTokenizer.from_pretrained(bart_checkpoint)
model_bart = AutoModelForSeq2SeqLM.from_pretrained(bart_checkpoint)

In [None]:
# Load the T5 tokenizer and conditional-generation model from a single checkpoint id
t5_checkpoint = 't5-small'
tokenizer_t5 = T5Tokenizer.from_pretrained(t5_checkpoint)
model_t5 = T5ForConditionalGeneration.from_pretrained(t5_checkpoint)

In [None]:
# Load configuration "3.0.0" of the CNN/DailyMail dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Collect the first 100 records of the test split into a plain list
# (change the range bound to take more or fewer articles)
subset_of_dataset = [dataset["test"][idx] for idx in range(100)]

In [None]:
# Take the article body of the first test-split record as a sample input
first_test_record = dataset["test"][0]
article_text = first_test_record["article"]

In [None]:
# Function to generate summaries using DistilBART
def generate_summary_bart(article_text):
    """Summarize a single article with the module-level DistilBART model.

    Args:
        article_text: The article to summarize, as one string.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # Tokenize as a batch of one, truncating to at most 512 tokens, and place
    # the tensors on the model's device so this also works after the model has
    # been moved (e.g. to a GPU).
    inputs = tokenizer_bart(
        [article_text],
        return_tensors="pt",
        max_length=512,
        truncation=True,
    ).to(model_bart.device)

    # Beam search with 4 beams; the summary is constrained to 40-150 tokens.
    # The attention mask is passed explicitly instead of being left for
    # generate() to infer from the input ids.
    summary_ids = model_bart.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )

    # Only one sequence was submitted, so decode the first (only) result
    return tokenizer_bart.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
# Function to generate summaries using T5
def generate_summary_t5(article_text):
    """Summarize a single article with the module-level T5 model.

    Args:
        article_text: The article to summarize, as one string.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # The "summarize: " task prefix is prepended before tokenization; the
    # prefixed text is truncated to at most 512 tokens. Tensors are placed on
    # the model's device so this also works after the model has been moved
    # (e.g. to a GPU).
    inputs = tokenizer_t5(
        "summarize: " + article_text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    ).to(model_t5.device)

    # Beam search with 4 beams; the summary is constrained to 40-150 tokens.
    # The attention mask is passed explicitly instead of being left for
    # generate() to infer from the input ids.
    summary_ids = model_t5.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )

    # Only one sequence was submitted, so decode the first (only) result
    return tokenizer_t5.decode(summary_ids[0], skip_special_tokens=True)


In [None]:
# Generate summaries with both models for the leading test-split articles
num_articles = 100  # Number of articles to summarize

# Parallel lists: position k in each list refers to the same article
distilbart_summaries, t5_summaries, reference_summaries = [], [], []

test_split = dataset["test"]
for idx in range(num_articles):
    example = test_split[idx]
    source_text = example["article"]

    # DistilBART first, then T5, on the same article text
    distilbart_summaries.append(generate_summary_bart(source_text))
    t5_summaries.append(generate_summary_t5(source_text))

    # The dataset's "highlights" field serves as the reference summary
    reference_summaries.append(example["highlights"])

In [None]:
# Write each model's generated summaries to its own JSON file
import json

for out_path, summaries in (
    ('distilbart_summaries.json', distilbart_summaries),
    ('t5_summaries.json', t5_summaries),
):
    with open(out_path, 'w') as f:
        json.dump(summaries, f)

In [None]:
# Build one shared scorer for ROUGE-1, ROUGE-2 and ROUGE-L with stemming enabled
rouge_metrics = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_metrics, use_stemmer=True)

# Function to calculate ROUGE scores for a list of generated summaries against their reference summaries
def calculate_rouge_scores(generated_summaries, reference_summaries, rouge=None):
    """Score each generated summary against the reference at the same index.

    Args:
        generated_summaries: Sequence of generated summaries.
        reference_summaries: Sequence of reference summaries, aligned by
            position with ``generated_summaries``.
        rouge: Optional object exposing ``score(reference, generated)``.
            When omitted, the module-level ``scorer`` is used.

    Returns:
        A list with one entry per summary pair, in input order; each entry is
        whatever the scorer's ``score`` call returned for that pair.

    Raises:
        ValueError: If the two sequences differ in length. Misaligned inputs
            would otherwise pair summaries with the wrong references or
            silently drop some of them.
    """
    if len(generated_summaries) != len(reference_summaries):
        raise ValueError(
            "generated_summaries and reference_summaries must have the same length "
            f"(got {len(generated_summaries)} and {len(reference_summaries)})"
        )

    # Resolve the global scorer only when no override is given
    active_scorer = scorer if rouge is None else rouge

    # The reference goes first and the generated text second in score()
    return [
        active_scorer.score(reference_summary, generated_summary)
        for generated_summary, reference_summary in zip(generated_summaries, reference_summaries)
    ]

# Calculate ROUGE scores for DistilBART and T5 summaries (once per model)
rouge_scores_distilbart = calculate_rouge_scores(distilbart_summaries, reference_summaries)
rouge_scores_t5 = calculate_rouge_scores(t5_summaries, reference_summaries)

# Keep the "_all_" names that the reporting loops read. The inputs are the same
# as above, so copy the lists instead of running the scorer a second time.
rouge_scores_all_distilbart = list(rouge_scores_distilbart)
rouge_scores_all_t5 = list(rouge_scores_t5)


In [None]:
def print_rouge_scores(model_name, rouge_scores):
    """Print precision, recall and F1 for ROUGE-1/2/L of every scored summary.

    Args:
        model_name: Label inserted into each summary's header line.
        rouge_scores: Iterable of per-summary mappings with the keys
            'rouge1', 'rouge2' and 'rougeL'; each value must expose
            ``precision``, ``recall`` and ``fmeasure`` attributes.

    Returns:
        None. The report is written to standard output.
    """
    # (score key, printed label) pairs, in the order they are reported
    metric_labels = (
        ("rouge1", "ROUGE-1"),
        ("rouge2", "ROUGE-2"),
        ("rougeL", "ROUGE-L"),
    )
    # Summaries are numbered from 1 in the printed report
    for position, scores in enumerate(rouge_scores, start=1):
        print(f"ROUGE scores for {model_name} summary {position}:")
        for key, label in metric_labels:
            metric = scores[key]
            print(f"{label} Precision: {metric.precision}")
            print(f"{label} Recall: {metric.recall}")
            print(f"{label} F1 Score: {metric.fmeasure}")
        print("\n")


# Access ROUGE scores for DistilBART summaries
print_rouge_scores("DistilBART", rouge_scores_all_distilbart)

# Access ROUGE scores for T5 summaries
print_rouge_scores("T5", rouge_scores_all_t5)