In [13]:
!pip install nltk transformers datasets




In [4]:
from datasets import load_dataset

# Fetch the CNN/DailyMail corpus (configuration "3.0.0"); the result is indexed
# by split name ('train', 'validation', 'test') in the cells below.
dataset = load_dataset(path="cnn_dailymail", name="3.0.0")


In [5]:
# Keep a reproducible 5% random sample of every split so experiments stay fast.
# For CNN/DailyMail 3.0.0 (~287k training articles) this is roughly 14k training
# records, plus a few hundred each for validation and test.
SAMPLE_FRACTION = 0.05
SAMPLE_SEED = 42


def sample_split(split, fraction=SAMPLE_FRACTION, seed=SAMPLE_SEED):
    """Return a seeded random subset of `split` containing `fraction` of its rows.

    `train_test_split` partitions the rows into 'train' and 'test' parts using
    the given seed; the 'test' part is the `fraction`-sized slice we keep.
    """
    return split.train_test_split(test_size=fraction, seed=seed)['test']


train_sampled = sample_split(dataset['train'])
validation_sampled = sample_split(dataset['validation'])
test_sampled = sample_split(dataset['test'])


In [6]:
from transformers import BartForConditionalGeneration, BartTokenizer

# The tokenizer and the model weights must come from the same pre-trained
# checkpoint, so the name is spelled out once and shared.
bart_checkpoint = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(bart_checkpoint)
model = BartForConditionalGeneration.from_pretrained(bart_checkpoint)




In [20]:
def summarize_article(
    article_text,
    max_length=150,
    min_length=60,
    length_penalty=2.0,
    num_beams=6,
    no_repeat_ngram_size=3,
):
    """Generate a summary of one article with the module-level BART model.

    Args:
        article_text: Article to summarize. Input beyond 1024 tokens is
            truncated by the tokenizer before generation.
        max_length: Upper bound on the generated summary length, in tokens.
        min_length: Lower bound on the generated summary length, in tokens.
        length_penalty: Exponent applied to sequence length when ranking beams.
            Values > 0 favor longer summaries, values < 0 favor shorter ones.
        num_beams: Beam-search width; more beams search more candidates but
            are slower.
        no_repeat_ngram_size: No n-gram of this size may appear twice in the
            generated summary.

    Returns:
        The decoded summary string with special tokens removed.
    """
    inputs = tokenizer(article_text, max_length=1024, return_tensors="pt", truncation=True)
    # Keep the inputs on the same device as the model so the function keeps
    # working if the model is moved to a GPU.
    inputs = inputs.to(model.device)
    summary_ids = model.generate(
        inputs["input_ids"],
        # Pass the mask explicitly rather than relying on generate() inferring it.
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        min_length=min_length,
        length_penalty=length_penalty,
        num_beams=num_beams,
        no_repeat_ngram_size=no_repeat_ngram_size,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [21]:
!pip install rouge_score
!pip install evaluate



In [22]:
import evaluate

# Number of test articles to summarize and score; raise it for a more
# representative (but slower) evaluation.
NUM_ARTICLES = 5

# Parallel lists: generated_summaries[k] is scored against reference_summaries[k].
generated_summaries = []
reference_summaries = []

# NOTE(review): this reads the first rows of the full test split, not
# test_sampled — confirm that is intended.
for i in range(NUM_ARTICLES):
    example = dataset['test'][i]  # fetch the row once instead of once per field
    article = example['article']
    reference_summary = example['highlights']

    # Generate summary
    generated_summary = summarize_article(article)

    # Store the summaries for ROUGE scoring
    generated_summaries.append(generated_summary)
    reference_summaries.append(reference_summary)

    # Print the original article, generated summary, and reference summary
    print(f"Original Article {i+1}:\n{article}\n")
    print(f"Generated Summary {i+1}:\n{generated_summary}\n")
    print(f"Reference Summary {i+1}:\n{reference_summary}\n")
    print("=" * 50)  # Separator for readability

# Load the ROUGE metric
rouge = evaluate.load("rouge")

# Compute ROUGE scores of the generated summaries against the references
results = rouge.compute(predictions=generated_summaries, references=reference_summaries)

# Display the results
print("ROUGE-1:", results["rouge1"])
print("ROUGE-2:", results["rouge2"])
print("ROUGE-L:", results["rougeL"])




Original Article 1:
(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wedn