In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"

# Step 1: Preprocess Textual Data

## Step 1.1: Load the CNN/DailyMail Dataset

In this step, we are using the HuggingFace `datasets` library to load the CNN/Daily Mail dataset.
This dataset contains long news articles along with their human-written summaries ("highlights").
We'll use the "article" as input and "highlights" as target for our summarization task.

In [3]:
# Install HuggingFace Datasets Library (only required once in Kaggle)
!pip install -q datasets

# Import required library
from datasets import load_dataset

# Load the CNN/DailyMail dataset
# '3.0.0' version has cleaned text without unnecessary symbols
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Check the keys and number of samples in the train set
print("Dataset Keys:", dataset.keys())
print("Number of training samples:", len(dataset["train"]))

# Preview one example
example = dataset["train"][0]
print("\n📰 Article Sample:\n", example["article"][:500])  # print first 500 chars
print("\n📝 Summary:\n", example["highlights"])

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
bigframes 1.42.0 requires rich<14,>=12.4.4, but you have rich 14.0.0 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.9.0.13 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cudnn-cu12==9.1.0.70; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cudnn-cu12 9.3.0.75 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cufft-cu12==11.2.1.3; platform_system == "Linux" and platform_machine == "x86_64", but you have nv

README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Dataset Keys: dict_keys(['train', 'validation', 'test'])
Number of training samples: 287113

📰 Article Sample:
 LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as s

📝 Summary:
 Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Young actor says he has no plans to fritter his cash away .
Radcliffe's earnings from first five Potter films have been held in trust fund .


## Step 1.2: Clean and Preprocess the Text (Keep Punctuation)

We clean the raw article text while **keeping punctuation** so that sentence boundaries remain intact.
We perform:
- Lowercasing
- Removing HTML tags
- Removing digits and unwanted symbols (but keep punctuation)
- Removing extra whitespaces

In [4]:
import re

# Cleaning function that keeps punctuation and digits
def clean_text_keep_punctuation_and_digits(text):
    """
    Cleans the input text by:
    - Lowercasing
    - Removing HTML tags
    - Keeping digits
    - Keeping punctuation
    - Removing extra whitespace
    """
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)              # Remove HTML tags
    # ✅ Don't remove digits
    text = re.sub(r'\s+', ' ', text).strip()       # Remove extra whitespace
    return text

# 🧪 Test on a sample
raw_text = dataset["train"][0]["article"]
cleaned_text = clean_text_keep_punctuation_and_digits(raw_text)

print("Before Cleaning:\n", raw_text[:500])
print("\nAfter Cleaning:\n", cleaned_text[:500])

Before Cleaning:
 LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as s

After Cleaning:
 london, england (reuters) -- harry potter star daniel radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on monday, but he insists the money won't cast a spell on him. daniel radcliffe as harry potter in "harry potter and the order of the phoenix" to the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "i don't pla

## Step 1.3: Tokenization using spaCy

In this step, we will tokenize the cleaned article text into:
- Sentences: useful for extractive summarization
- Words: may be useful for analysis or model inputs

We use the `spaCy` library for natural language processing tasks like tokenization.

In [5]:
# ⚙️ Install spaCy and English model
!pip install -q spacy
!python -m spacy download en_core_web_sm

# 📚 Import and load spaCy English model
import spacy
nlp = spacy.load("en_core_web_sm")

# 🧼 Cleaned article from previous step
cleaned_text = clean_text_keep_punctuation_and_digits(dataset["train"][0]["article"])

# 🧠 Apply spaCy NLP pipeline
doc = nlp(cleaned_text)

# 🧾 Sentence Tokenization
sentences = [sent.text for sent in doc.sents]
print(f"Total Sentences: {len(sentences)}")
print("\n🧾 First 3 Sentences:\n", sentences[:3])

# 🔤 Word Tokenization (for one sentence)
words_in_first_sentence = [token.text for token in doc.sents.__iter__().__next__()]
print("\n🔤 Words in First Sentence:\n", words_in_first_sentence)

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m76.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Total Sentences: 24

🧾 First 3 Sentences:
 ["london, england (reuters) -- harry potter star daniel radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on monday, but he insists the money won't cast a spell on him.", 'daniel radcliffe as harry potter in "harry potter and the order o

## Step 1.4: Save Cleaned & Tokenized Text

To make our workflow easier and reusable, we save the following in a structured format:
- Original article
- Cleaned article (lowercased, punctuation kept)
- Sentence tokens (for extractive methods)
- Summary (highlights)

We'll store a few samples into a Pandas DataFrame and optionally save it as a CSV for reuse.

In [6]:
import pandas as pd

# 📥 How many examples to store? (you can increase this later)
num_samples = 100

# 📦 Prepare list of samples
data_list = []

for i in range(num_samples):
    original = dataset["train"][i]["article"]
    summary = dataset["train"][i]["highlights"]
    
    # Clean text
    cleaned = clean_text_keep_punctuation_and_digits(original)
    
    # Tokenize into sentences
    doc = nlp(cleaned)
    sentence_tokens = [sent.text for sent in doc.sents]
    
    # Append as dictionary
    data_list.append({
        "original_article": original,
        "cleaned_article": cleaned,
        "sentences": sentence_tokens,
        "summary": summary
    })

# 🐼 Convert to DataFrame
df = pd.DataFrame(data_list)

# 👀 Show one row
print(df.iloc[0])

# 💾 Save to CSV (optional)
df.to_csv("cleaned_cnn_dailymail_subset.csv", index=False)
print("\n✅ Saved 100 cleaned examples to CSV.")

original_article    LONDON, England (Reuters) -- Harry Potter star...
cleaned_article     london, england (reuters) -- harry potter star...
sentences           [london, england (reuters) -- harry potter sta...
summary             Harry Potter star Daniel Radcliffe gets £20M f...
Name: 0, dtype: object

✅ Saved 100 cleaned examples to CSV.


# Step 2: Extractive Summarization using spaCy

## Step 2.1: Sentence Scoring using Word Frequency

In extractive summarization, we rank each sentence based on how many **important words** it contains.
We use this approach:
- Count frequency of each word in the article
- Score each sentence by summing the frequency of its words
- Later, select top `n` scored sentences as the summary

This approach is fast, simple, and works well for extractive summaries.

In [7]:
from collections import defaultdict
import heapq

# 🧪 Use the first cleaned article and sentence list
cleaned_article = df.iloc[0]["cleaned_article"]
sentences = df.iloc[0]["sentences"]

# 📊 Step 1: Calculate word frequency (excluding stopwords and punctuation)
from spacy.lang.en.stop_words import STOP_WORDS

word_freq = defaultdict(int)

# Tokenize the full article into words using spaCy
doc = nlp(cleaned_article)

for token in doc:
    if token.text.lower() not in STOP_WORDS and token.text.isalpha():
        word_freq[token.text.lower()] += 1

# 🔍 Normalize frequencies (optional but better)
max_freq = max(word_freq.values())
for word in word_freq:
    word_freq[word] = word_freq[word] / max_freq

# ✅ Step 2: Score each sentence
sentence_scores = {}

for sent in sentences:
    sent_doc = nlp(sent)
    score = 0
    for token in sent_doc:
        word = token.text.lower()
        if word in word_freq:
            score += word_freq[word]
    sentence_scores[sent] = score

# 👑 Step 3: Select top N sentences (e.g., top 3)
top_n = 3
summary_sentences = heapq.nlargest(top_n, sentence_scores, key=sentence_scores.get)

# 🔡 Join the selected sentences into a summary
extractive_summary = ' '.join(summary_sentences)

# 📢 Print the result
print("📰 Original Summary (Human):\n", df.iloc[0]["summary"])
print("\n🤖 Extractive Summary (Top 3 Sentences):\n", extractive_summary)

📰 Original Summary (Human):
 Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Young actor says he has no plans to fritter his cash away .
Radcliffe's earnings from first five Potter films have been held in trust fund .

🤖 Extractive Summary (Top 3 Sentences):
 daniel radcliffe as harry potter in "harry potter and the order of the phoenix" to the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. london, england (reuters) -- harry potter star daniel radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on monday, but he insists the money won't cast a spell on him. his latest outing as the boy wizard in "harry potter and the order of the phoenix" is breaking records on both sides of the atlantic and he will reprise the role in the last two films.


## Step 2.2: Wrap Extractive Summarizer into a Function

To reuse our summarization logic, we define a function that:
1. Takes a cleaned article and sentence list as input
2. Calculates word frequency
3. Scores each sentence
4. Returns top `n` sentences as the summary

This makes it easy to run summarization on multiple samples or datasets.

In [8]:
from collections import defaultdict
import heapq
from spacy.lang.en.stop_words import STOP_WORDS

# 🔁 Extractive summarizer function
def extractive_summary_spacy(cleaned_text, sentences, top_n=3):
    """
    Performs extractive summarization by:
    - Calculating word frequency
    - Scoring each sentence
    - Selecting top N scored sentences
    """
    # Load spaCy model
    doc = nlp(cleaned_text)
    
    # ✅ Step 1: Word frequency
    word_freq = defaultdict(int)
    for token in doc:
        if token.text.lower() not in STOP_WORDS and token.text.isalpha():
            word_freq[token.text.lower()] += 1

    # Normalize frequency
    max_freq = max(word_freq.values())
    for word in word_freq:
        word_freq[word] = word_freq[word] / max_freq

    # ✅ Step 2: Score each sentence
    sentence_scores = {}
    for sent in sentences:
        sent_doc = nlp(sent)
        score = 0
        for token in sent_doc:
            word = token.text.lower()
            if word in word_freq:
                score += word_freq[word]
        sentence_scores[sent] = score

    # ✅ Step 3: Top N sentences
    summary_sentences = heapq.nlargest(top_n, sentence_scores, key=sentence_scores.get)
    
    # ✅ Join summary
    summary = ' '.join(summary_sentences)
    return summary

In [9]:
# Test the function:

sample_cleaned = df.iloc[0]["cleaned_article"]
sample_sentences = df.iloc[0]["sentences"]

generated_summary = extractive_summary_spacy(sample_cleaned, sample_sentences, top_n=3)

print("🧠 Generated Extractive Summary:\n", generated_summary)

🧠 Generated Extractive Summary:
 daniel radcliffe as harry potter in "harry potter and the order of the phoenix" to the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. london, england (reuters) -- harry potter star daniel radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on monday, but he insists the money won't cast a spell on him. his latest outing as the boy wizard in "harry potter and the order of the phoenix" is breaking records on both sides of the atlantic and he will reprise the role in the last two films.


## Step 2.3: Apply Extractive Summarizer to Multiple Articles

Now that we have a working extractive summarization function, we'll apply it to multiple articles.
We'll:
- Loop over a subset of cleaned data
- Generate extractive summaries
- Store original, human-written, and extractive summaries side-by-side

This helps us analyze how well the extractive summarizer is performing.

In [10]:
# 🔁 Apply summarizer to multiple samples
summary_data = []

# 🔢 You can change how many samples to process
num_samples = 5

for i in range(num_samples):
    cleaned = df.iloc[i]["cleaned_article"]
    sentences = df.iloc[i]["sentences"]
    human_summary = df.iloc[i]["summary"]
    
    # Extractive summary
    generated_summary = extractive_summary_spacy(cleaned, sentences, top_n=3)
    
    summary_data.append({
        "original_article": df.iloc[i]["original_article"][:500] + "...",  # shorten for readability
        "human_summary": human_summary,
        "extractive_summary": generated_summary
    })

# Convert to DataFrame for display
summary_df = pd.DataFrame(summary_data)

# 🔍 Show results
summary_df[["human_summary", "extractive_summary"]]

Unnamed: 0,human_summary,extractive_summary
0,Harry Potter star Daniel Radcliffe gets £20M f...,"daniel radcliffe as harry potter in ""harry pot..."
1,Mentally ill inmates in Miami are housed on th...,leifman says in 1955 there were more than half...
2,"NEW: ""I thought I was going to die,"" driver sa...","""the whole bridge from one side of the mississ..."
3,"Five small polyps found during procedure; ""non...",washington (cnn) -- doctors removed five small...
4,"NEW: NFL chief, Atlanta Falcons owner critical...","in an additional summary of facts, signed by v..."


# Step 3: Abstractive Summarization using Transformers

Now we'll generate summaries using a pre-trained transformer model (like BART or T5) from Hugging Face 🤗.
Unlike extractive methods, these models rewrite and rephrase content to form new, human-like summaries.

We'll:
- Load a pretrained summarization model
- Tokenize and summarize a few articles
- Compare with the original summaries

## Step 3.1: Load BART Pretrained Model & Tokenizer

We'll use Hugging Face Transformers to load the BART model, specifically:
- Model: `facebook/bart-large-cnn`
- Purpose: News/article summarization
- Framework: PyTorch backend

This model takes in a long article and generates a short, human-like summary.

In [11]:
!pip install transformers --quiet

# ✅ Import required Hugging Face libraries
from transformers import BartForConditionalGeneration, BartTokenizer
import torch

# ✅ Load the pretrained BART model and tokenizer
model_name = 'facebook/bart-base'
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# ✅ Set device to GPU if available for faster inference
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

print("Model and tokenizer loaded successfully on", device)

2025-05-24 06:33:53.097242: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748068433.360707      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748068433.437440      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Model and tokenizer loaded successfully on cpu


## Step 3.2: Generate Abstractive Summary Using BART (Single Article)

We will now:
1. Tokenize a cleaned article
2. Generate a summary using the BART model
3. Decode and print the result

This demonstrates how the transformer understands the full article and creates a new, human-like summary.

In [12]:
# 🧪 Pick one sample article from our cleaned dataframe
sample_article = df.iloc[0]["cleaned_article"]

# 🔁 Tokenize the input text
inputs = tokenizer.encode(sample_article, return_tensors="pt", max_length=1024, truncation=True)

# ✅ Move input to same device as model
inputs = inputs.to(device)

# 🧠 Generate summary (beam search gives better results)
summary_ids = model.generate(
    inputs,
    num_beams=4,             # Try 4 beams for better quality
    length_penalty=2.0,      # Shorter summary preference
    max_length=150,          # Maximum tokens in summary
    min_length=40,           # Minimum tokens in summary
    early_stopping=True
)

# 📝 Decode the summary
summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("📄 Abstractive Summary:\n", summary_text)

📄 Abstractive Summary:
 london, england (reuters) -- harry potter star daniel radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on monday, but he insists the money won't cast a spell on him. london -- england "harry potter and the order of the phoenix" to the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "i don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an australian interviewer


## Step 3.3: Abstractive Summarization for Multiple Articles

We'll now apply our BART summarizer to multiple articles.

For each article, we will:
- Run the abstractive summarizer
- Store original article, human summary, extractive summary, and BART-generated summary

This comparison helps us assess the model's ability to produce accurate, readable summaries.

In [13]:
abstractive_results = []

# 🔢 Number of articles to summarize
num_samples = 5

for i in range(num_samples):
    cleaned_article = df.iloc[i]["cleaned_article"]
    original_article = df.iloc[i]["original_article"]
    human_summary = df.iloc[i]["summary"]
    extractive_summary = extractive_summary_spacy(cleaned_article, df.iloc[i]["sentences"], top_n=3)
    
    # Tokenize & generate using BART
    inputs = tokenizer.encode(cleaned_article, return_tensors="pt", max_length=1024, truncation=True).to(device)
    
    summary_ids = model.generate(
        inputs,
        num_beams=4,
        length_penalty=2.0,
        max_length=150,
        min_length=40,
        early_stopping=True
    )
    
    bart_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    
    # Store all summaries
    abstractive_results.append({
        "original_article": original_article[:500] + "...",  # limit for readability
        "human_summary": human_summary,
        "extractive_summary": extractive_summary,
        "abstractive_summary": bart_summary
    })

# 📊 Create DataFrame for display
abstractive_df = pd.DataFrame(abstractive_results)

# 📌 View comparison
abstractive_df[["human_summary", "extractive_summary", "abstractive_summary"]]

Unnamed: 0,human_summary,extractive_summary,abstractive_summary
0,Harry Potter star Daniel Radcliffe gets £20M f...,"daniel radcliffe as harry potter in ""harry pot...","london, england (reuters) -- harry potter star..."
1,Mentally ill inmates in Miami are housed on th...,leifman says in 1955 there were more than half...,editor's note: in our behind the scenes series...
2,"NEW: ""I thought I was going to die,"" driver sa...","""the whole bridge from one side of the mississ...","minneapolis, minnesota (cnn) -- drivers who we..."
3,"Five small polyps found during procedure; ""non...",washington (cnn) -- doctors removed five small...,washington (cnn) -- doctors removed five small...
4,"NEW: NFL chief, Atlanta Falcons owner critical...","in an additional summary of facts, signed by v...",(cnn) -- the national football league has inde...


# Step 4: Fine-Tune the Abstractive Model on CNN/DailyMail

Although the pretrained BART model gives decent results, we can improve its performance further by fine-tuning it on our dataset.

We'll:
- Prepare (article, summary) pairs for training
- Tokenize the data in model-friendly format
- Use Hugging Face's `Trainer` API for easy fine-tuning
- Train the model for 1–2 epochs

## Step 4.1: Prepare Dataset for Fine-Tuning

To fine-tune the BART model, we need to:
- Use (article, summary) pairs from the CNN/DailyMail dataset
- Format them for Hugging Face Trainer API
- Tokenize articles and summaries for model input

We'll use a small subset for demonstration (5–100 samples), since Kaggle has limited compute.

In [14]:
# ✅ Select a small subset of dataset for demo fine-tuning (e.g., 100 samples)
train_size = 100  # You can reduce this if Kaggle runs out of memory
train_articles = df["cleaned_article"][:train_size].tolist()
train_summaries = df["summary"][:train_size].tolist()

# ✅ Combine into a list of dictionaries
train_data = [{"article": art, "summary": summ} for art, summ in zip(train_articles, train_summaries)]

# 🧪 Show a sample
print("Sample pair:\n")
print("Article:", train_data[0]["article"][:300], "...\n")
print("Summary:", train_data[0]["summary"])

Sample pair:

Article: london, england (reuters) -- harry potter star daniel radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on monday, but he insists the money won't cast a spell on him. daniel radcliffe as harry potter in "harry potter and the order of the phoenix" to the disappoi ...

Summary: Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Young actor says he has no plans to fritter his cash away .
Radcliffe's earnings from first five Potter films have been held in trust fund .


## Step 4.2: Tokenize Articles and Summaries

We now tokenize the input (article) and output (summary) text using BART's tokenizer.

Each sample becomes:
- `input_ids`: Tokenized article
- `labels`: Tokenized summary
- `attention_mask`: To tell the model which tokens are real vs padding

This is essential before training.

In [15]:
from torch.utils.data import Dataset

# ✅ Custom Dataset Class for Hugging Face Trainer
class SummarizationDataset(Dataset):
    def __init__(self, data, tokenizer, max_input_len=512, max_output_len=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_output_len = max_output_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        article = item["article"]
        summary = item["summary"]

        # 🔠 Tokenize article (input)
        inputs = tokenizer(
            article,
            max_length=self.max_input_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # 🏷️ Tokenize summary (label)
        labels = tokenizer(
            summary,
            max_length=self.max_output_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Return everything as a dictionary
        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": labels["input_ids"].squeeze()
        }

# ✅ Create dataset
train_dataset = SummarizationDataset(train_data, tokenizer)

# 🧪 Test
sample = train_dataset[0]
print("Input IDs:\n", sample["input_ids"][:20])
print("Labels:\n", sample["labels"][:20])

Input IDs:
 tensor([    0,   462, 24639,     6, 20407,  1245,    36,   241, 13188,    43,
          480, 12280,  1506,  4728,  1334,   999,   385, 35947, 13206, 20152])
Labels:
 tensor([    0, 29345, 10997,   999,  3028,  7312, 20152,  1516,   984,   844,
          448, 13016,    25,    37,  4072,   504,   302,   479, 50118, 22138])


## Step 4.3: Fine-Tune the BART Model

Now we define training parameters and start fine-tuning the model using Hugging Face's Trainer API.

We'll set:
- Number of epochs (e.g., 2)
- Batch size (e.g., 4 or 8 depending on GPU memory)
- Learning rate and other optimizer settings

In [16]:
from transformers import Trainer, TrainingArguments

# ✅ Define training arguments
training_args = TrainingArguments(
    output_dir='./bart-finetuned',
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=1000,
    save_total_limit=2,
    remove_unused_columns=False,
    fp16=torch.cuda.is_available()
)

# ✅ Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer
)

# ✅ Start fine-tuning
trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss
10,11.2969
20,11.0999
30,10.3168
40,9.6787
50,9.0759




TrainOutput(global_step=50, training_loss=10.293632965087891, metrics={'train_runtime': 547.2787, 'train_samples_per_second': 0.365, 'train_steps_per_second': 0.091, 'total_flos': 60973645824000.0, 'train_loss': 10.293632965087891, 'epoch': 2.0})

## Step 4.4 – Inference with Fine-Tuned BART Model

Now that our model is trained, we will use it to generate summaries on unseen articles from the test set. We'll compare these generated summaries with the reference (gold) summaries provided in the dataset.

We'll pick a few samples and summarize them using the `generate()` method.

In [19]:
from transformers import pipeline

# ✅ Load pipeline for summarization using the fine-tuned model
summarizer = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
    device=-1
)

# ✅ Pick some test samples to summarize
for i in range(20):
    article = dataset["test"][i]["article"][:1024]  # 🧠 truncate long articles
    reference = dataset["test"][i]["highlights"]

    summary = summarizer(
        article,
        max_new_tokens=60,
        min_length=30,
        do_sample=False,
        truncation=True
    )[0]["summary_text"]


    print(f"\n🔹 Article #{i+1}:\n", article[:500], "...\n")
    print(f"🟢 Generated Summary:\n{summary}\n")
    print("="*100)

Device set to use cpu



🔹 Article #1:
 (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, includin ...

🟢 Generated Summary:
(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The


🔹 Article #2:
 (CNN)Never mind cats having nine lives. A stray pooch in Washington State has used up at least three of her own after being

# Test the Model on Real-World Articles and Evaluate Summary Coherence

## Step 5.1 – Select a few real-world news articles (outside the dataset)
🎯 Goal:
We’ll manually pick 2–3 real-world news articles (not from the CNN/DailyMail dataset) and feed them to our fine-tuned BART model to check how well it summarizes.

In [20]:
real_world_articles = [
    """A federal judge has temporarily halted the Trump administration’s ban on Harvard University’s ability to enroll international students.

US District Court Judge Allison Burroughs ruled hours after the nation’s oldest and wealthiest college filed suit Friday. Harvard argued revocation of its certification in the Student and Exchange Visitor Program was “clear retaliation” for its refusal of the government’s ideologically rooted policy demands.

Burroughs is the same judge considering a separate lawsuit from Harvard challenging the administration’s freeze of $2.65 billion in federal funding.

Harvard’s latest complaint argues the decision Thursday to drop the school from the Department of Homeland Security’s SEVP system violates the law.

“It is the latest act by the government in clear retaliation for Harvard exercising its First Amendment rights to reject the government’s demands to control Harvard’s governance, curriculum, and the ‘ideology’ of its faculty and students,” the complaint states.

Burroughs, an Obama appointee, said in her order Harvard had shown “it will sustain immediate and irreparable injury” if government were allowed to revoke the school’s certification before the court could consider the matter.

A remote conference in the case is set for Tuesday. Two days later, the judge is due to hear arguments at the federal courthouse in Boston over whether to issue a preliminary injunction – an order that would block the administration’s action until a final decision is made in the lawsuit.

The Trump administration’s revocation of Harvard’s ability to enroll international students came as sharp punishment to the elite institution for refusing to bow to White House policy demands. Rooted in political ideology, the requirements – such as handing over student disciplinary records and killing equity initiatives – also have been placed on other US colleges.

“Harvard can no longer enroll foreign students and existing foreign students must transfer or lose their legal status,” the US Department of Homeland Security said in a statement.
    """,

    """Russia and Ukraine have completed the first phase of what is expected to be the biggest prisoner exchange since the start of the war, with almost 800 people released on Friday.

The swap started on Friday and will continue on Saturday and Sunday, with Kyiv and Moscow expected to swap 2,000 people – 1,000 from each side.

The agreement to release 1,000 prisoners on each side was the only significant outcome of the meeting between Kyiv and Moscow in Istanbul last week, which marked the first time the two sides have met directly since soon after Russia’s full-scale unprovoked invasion of Ukraine in February 2022.

“We are bringing our people home,” Ukraine’s President Volodymyr Zelensky said on X, adding that 390 people arrived back to Ukraine on Friday. He said the group included 270 military and 120 civilians.

The Ukrainian Coordination Center for Treatment of Prisoners of War said three women and 387 men were among those released on Friday.

The Russian Ministry of Defense said in a statement that “270 Russian servicemen and 120 civilians” were returned to Russia. It said the civilians were captured by Ukrainian troops in Kursk, the Russian region to which Ukraine launched a surprise incursion last summer. Russia has since reclaimed most of the territory.

However, Zelensky said later that the Russian civilians returned by Ukraine were “Russian saboteurs and collaborators” who were arrested by Ukrainian law enforcement officers.

As in previous exchanges, the released prisoners were brought to a meeting place in several buses after being released by Russia at the Ukrainian border. Many were given Ukrainian flags and bracelets in Ukrainian colours at the border.
    """
]

In [22]:
from transformers import pipeline
import torch

# Ensure the model is on the right device
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Create summarization pipeline with fine-tuned model
summarizer = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
    device=-1
)

# Inference
for i, article in enumerate(real_world_articles):
    summary = summarizer(article, max_new_tokens=256, truncation=True)[0]['summary_text']
    print(f"🔹 Article #{i+1}:\n{article[:400]}...\n")  # printing a snippet of the article
    print(f"🟢 Generated Summary:\n{summary}\n")
    print("="*100)

Device set to use cpu


🔹 Article #1:
A federal judge has temporarily halted the Trump administration’s ban on Harvard University’s ability to enroll international students.

US District Court Judge Allison Burroughs ruled hours after the nation’s oldest and wealthiest college filed suit Friday. Harvard argued revocation of its certification in the Student and Exchange Visitor Program was “clear retaliation” for its refusal of the gov...

🟢 Generated Summary:
A federal judge has temporarily halted the Trump administration’s ban on enrolling international students.Harvard is seeking a preliminary injunction to stop the administration from blocking the decision Thursday.US District Court Judge Judge Allison Burroughs ruled in favor of the Obama administration.The U.S. Department of Homeland Security sued the Obama Administration on Thursday. Harvard argued revocation of its certification in the Student and Exchange Visitor Program was “clear retaliation” for its refusal of the government to comply with its ideo

## Substep 5.2 – Evaluate the Summaries: Coherence, Factuality, and Fluency
Here, we’ll critically evaluate the generated summaries from your model on three key aspects for each article:

✅ Evaluation Criteria:
Coherence: Is the summary logically organized and easy to follow?

Factuality: Does it preserve factual accuracy compared to the original article?

Fluency: Is the language natural, grammatical, and readable?

# 🔹 Article 1 – Harvard vs Trump Administration
🟢 Generated Summary:
A federal judge has temporarily halted the Trump administration’s ban on Harvard University‘s ability to enroll international students. It is the first such decision in a federal court since President Donald Trump took office in January. [...] “The decision to revoke the certification is a clear retaliation for Harvard exercising its First Amendment rights to reject the government's demands to control Harvard’'s governance, curriculum, and the ‘ideology’ of its faculty and students, and will lead to irreparable injury to [...]

## ✅ Evaluation:
Criterion	Evaluation
Coherence	❌ Low. The summary includes repeated ideas, has abrupt topic jumps, and ends mid-sentence.
Factuality	⚠️ Partially Inaccurate. It falsely says "first decision since Trump took office" (Trump took office in 2017). Also mixes claims that don’t appear in the article, e.g., “to the Obama administration.”
Fluency	❌ Poor. There are duplicated phrases, grammar errors (“government’ ideologically”), and unfinished sentences.

🧠 Overall: This summary is not reliable. It includes hallucinated facts, broken grammar, and lacks structure. The model over-generated tokens and started repeating and mixing ideas — a common issue when max_new_tokens is set too high without cleanup.

# 🔹 Article 2 – Russia-Ukraine Prisoner Swap
🟢 Generated Summary:
Russia and Ukraine have completed the first phase of what is expected to be the biggest prisoner exchange since the start of the war, with almost 800 people released on Friday.“We are bringing our people home,” Ukrainians said. [...] Ukraine and Russia have agreed to send 1,500 prisoners to the United States. [...] Ukrainian President Petro Poroshenko said on Friday...

## ✅ Evaluation:
Criterion	Evaluation
Coherence	❌ Low. Sentences jump between thoughts randomly; duplication is evident (“1,1,000 prisoners”).
Factuality	❌ Wrong. There is no mention of prisoners being sent to the U.S. in the article. It also wrongly names Petro Poroshenko (Ukraine’s president is Volodymyr Zelensky).
Fluency	❌ Poor. Repetition, grammar errors, and abrupt phrasing again. Example: "They are bringing their people home. We are bringing them home” — sounds unnatural and redundant.

🧠 Overall: This summary is factually flawed, with major hallucinations (sending to U.S., wrong president), and it’s repetitive and messy.

## 🧪 What’s the Conclusion?
Your fine-tuned BART model is struggling with:

* Hallucination (generating false info not in the article),

* Token repetition, and

* Incomplete or incoherent sentence endings.

These are common fine-tuning pitfalls, especially when:

* Training data isn’t cleaned properly,

* Model is overfitting to short training examples,

* Token limits or generation constraints aren’t set well.



## Substep 5.3 – Fix & Improve Generation Settings
Your current model is hallucinating, repeating phrases, and generating incomplete or messy summaries.

So in this substep, we’ll fine-tune the generation configuration to get cleaner, more accurate summaries.

In [23]:
for i, article in enumerate(real_world_articles):
    inputs = tokenizer(
        article, 
        return_tensors="pt", 
        truncation=True, 
        padding="longest", 
        max_length=1024
    ).to(device)

    # ✅ Updated generation settings here:
    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=100,
        min_length=30,
        num_beams=4,
        repetition_penalty=1.2,
        length_penalty=1.0,
        early_stopping=True
    )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    print(f"\n🔹 Article #{i+1}:\n{article[:300]}...\n")
    print(f"🟢 Improved Summary:\n{summary}")
    print("="*100)


🔹 Article #1:
A federal judge has temporarily halted the Trump administration’s ban on Harvard University’s ability to enroll international students.

US District Court Judge Allison Burroughs ruled hours after the nation’s oldest and wealthiest college filed suit Friday. Harvard argued revocation of its certific...

🟢 Improved Summary:
A federal judge has temporarily halted the Trump administration’s ban on Harvard University‘s ability to enroll international students.“It is clear retaliation” to Harvard for its refusal to accept foreign students.US District Court Judge Allison Burroughs ruled a preliminary injunction against the decision Thursday.The ruling came after Harvard filed suit Friday. Harvard argued revocation of its certification in the Student and Exchange Visitor Program was “clear retribution” for rejecting White House demands. “

🔹 Article #2:
Russia and Ukraine have completed the first phase of what is expected to be the biggest prisoner exchange since the start of t

# Step 6: Evaluation & Metrics
We'll calculate ROUGE scores to check how close your model’s summaries are to the human-written reference summaries.

## Substep 6.1 – Install the evaluate library
We will use HuggingFace's evaluate library to calculate ROUGE, which is the most widely used metric for summarization.

In [24]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


## Substep 6.2 – Import libraries & load ROUGE metric
We’ll now import what we need and load the ROUGE metric. 

In [25]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=4f69491d3083048b66bdae33293d807116842cd5940e1d37c127665fb55046e5
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [26]:
import evaluate

# ✅ Load ROUGE metric
rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

## Substep 6.3 – Prepare Data & Compute ROUGE Scores for Evaluation
We'll compare your model’s generated summaries against the reference summaries using the ROUGE metric (which is standard for summarization tasks).

🧠 What is ROUGE?
ROUGE (Recall-Oriented Understudy for Gisting Evaluation) measures overlap between:

Words/phrases in generated summary ✅

Words/phrases in actual summary (reference) 📘

We’ll use:

rouge1 (unigram overlap)

rouge2 (bigram overlap)

rougeL (longest common subsequence)

In [28]:
# ✅ Prepare lists of generated and reference summaries
generated_summaries = []
reference_summaries = []

# 🧪 We'll evaluate on first 20 test articles
for i in range(20):
    article = dataset["test"][i]["article"][:1024]  # 🧠 truncate long articles
    reference = dataset["test"][i]["highlights"]

    summary = summarizer(
        article,
        max_new_tokens=60,
        min_length=30,
        do_sample=False,
        truncation=True
    )[0]["summary_text"]

    generated_summaries.append(summary)
    reference_summaries.append(reference)

# ✅ Compute ROUGE
results = rouge.compute(
    predictions=generated_summaries,
    references=reference_summaries
)

# 🎉 Show results
print("📊 ROUGE Evaluation Results:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")

📊 ROUGE Evaluation Results:
rouge1: 0.2832
rouge2: 0.1016
rougeL: 0.1919
rougeLsum: 0.2348
