In [1]:

# STEP 1: Install necessary packages for Colab
# (fsspec/datasets are force-upgraded; the spaCy English model is downloaded
# here but is not used anywhere in this cell.)
!pip install -U fsspec datasets --no-cache-dir --quiet
!pip install transformers --quiet
!python -m spacy download en_core_web_sm

# STEP 2: Load CNN/DailyMail dataset using streaming to avoid fsspec bug
from datasets import load_dataset

# streaming=True is requested, so records are pulled through an iterator
# instead of being indexed; train_iter is stateful and every next() call
# advances it by one record.
dataset = load_dataset("cnn_dailymail", "3.0.0", streaming=True)
train_iter = iter(dataset['train'])

# STEP 3: Get one sample
# 'article' holds the full news text, 'highlights' the reference summary.
sample = next(train_iter)
article = sample['article']
summary = sample['highlights']

# Preview only the first 500 characters of the article; print the whole reference summary.
print("ARTICLE SAMPLE:\n", article[:500])
print("\nREFERENCE SUMMARY:\n", summary)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━[0m [32m419.8/491.5 kB[0m [31m12.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/193.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m232.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2025.3.0 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine ==

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

ARTICLE SAMPLE:
 LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as s

REFERENCE SUMMARY:
 Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Young actor says he has no plans to fritter his cash away .
Radcliffe's earnings from first five Potter films have been held in trust fund .


In [2]:

# STEP 4: Abstractive summary of the sample article with a pretrained BART checkpoint
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Only the first 1024 characters of `article` are fed to the model. Sampling is
# disabled (do_sample=False) and min_length=30 / max_length=130 bound the
# generated summary.
article_head = article[:1024]
summarizer_outputs = summarizer(
    article_head,
    max_length=130,
    min_length=30,
    do_sample=False,
)
generated_summary = summarizer_outputs[0]['summary_text']

print("\nGENERATED SUMMARY:\n", generated_summary)


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu



GENERATED SUMMARY:
 Harry Potter star Daniel Radcliffe turns 18 on Monday. He gains access to a reported £20 million ($41.1 million) fortune. Radcliffe says he has no plans to fritter his cash away on fast cars, drink.


In [3]:
# Force upgrade datasets and fsspec to avoid the '**' error
!pip install -U fsspec datasets --no-cache-dir --quiet
!pip install transformers datasets spacy rouge-score --quiet
!python -m spacy download en_core_web_sm

# Check if GPU is available
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load summarization pipeline with proper device setting
from transformers import pipeline
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    tokenizer="facebook/bart-large-cnn",
    device=0 if torch.cuda.is_available() else -1
)

# Function to handle long texts for summarization
def _split_into_chunks(text, max_chunk):
    """Greedily pack the sentences of ``text`` into chunks of at most ``max_chunk`` characters.

    Sentences are delimited by the literal separator ``'. '``. A single sentence
    longer than ``max_chunk`` is kept whole as its own chunk. Empty pieces are
    dropped, so no empty chunk is ever produced.
    """
    pieces = text.strip().split('. ')
    last_index = len(pieces) - 1

    chunks = []
    current_chunk = ''
    for index, piece in enumerate(pieces):
        piece = piece.strip()
        if not piece:
            continue
        # split('. ') strips the period from every piece except the last one,
        # which keeps whatever punctuation the text ended with. Restoring the
        # period only where it was removed avoids a doubled final period.
        sentence = piece if index == last_index else piece + '.'

        if not current_chunk:
            current_chunk = sentence
        elif len(current_chunk) + 1 + len(sentence) <= max_chunk:
            current_chunk += ' ' + sentence
        else:
            chunks.append(current_chunk)
            current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk)
    return chunks


def summarize_large_text(text, max_chunk=1000):
    """Summarize a long text by summarizing sentence-aligned chunks and joining the results.

    Args:
        text: The text to summarize. ``None``, empty, or whitespace-only input
            yields an empty string without calling the model.
        max_chunk: Target maximum chunk size in characters (not tokens).

    Returns:
        The per-chunk summaries, in order, joined by single spaces.

    Notes:
        Relies on the module-level ``summarizer`` pipeline and calls it once per
        chunk with ``max_length=130, min_length=30, do_sample=False``.
    """
    if not text:
        return ''

    summary_pieces = [
        summarizer(chunk, max_length=130, min_length=30, do_sample=False)[0]['summary_text']
        for chunk in _split_into_chunks(text, max_chunk)
    ]
    return ' '.join(summary_pieces).strip()


  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Using device: cpu


Device set to use cpu


# Step 1: Download and Load CNN/Daily Mail Dataset


In [4]:
# Install HuggingFace Datasets if not already
!pip install datasets

# Load CNN/Daily Mail dataset
from datasets import load_dataset

# Use streaming=True to bypass download file system issues
dataset = load_dataset("cnn_dailymail", "3.0.0", cache_dir="/content/dataset_cache")
train_data = dataset["train"]

# View a sample
sample = next(train_iter)
print("Article:", sample['article'][:500])
print("Summary:", sample['highlights'])




train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Article: Editor's note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events. Here, Soledad O'Brien takes users inside a jail where many of the inmates are mentally ill. An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial. MIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor." Here, inmates with the most s
Summary: Mentally ill inmates in Miami are housed on the "forgotten floor"
Judge Steven Leifman says most are there as a result of "avoidable felonies"
While CNN tours facility, patient shouts: "I am the son of the president"
Leifman says the system is unjust and he's fighting for change .


# Step 2: Preprocess Text Data


In [5]:
# Install spaCy and fetch the small English model that spacy.load() reads below.
!pip install spacy
!python -m spacy download en_core_web_sm

import re
import spacy

# Module-level spaCy pipeline; tokenize_sentences() calls it to split text into sentences.
nlp = spacy.load("en_core_web_sm")

def clean_text(text):
    """Strip bracketed spans and normalize whitespace in ``text``.

    Args:
        text: Raw text to clean.

    Returns:
        ``text`` with every ``[...]`` span removed, every run of whitespace
        (including newlines and tabs) collapsed to a single space, and leading
        and trailing whitespace stripped.
    """
    # Drop bracketed spans first: removing "[x]" from "a [x] b" leaves two
    # adjacent spaces, which the whitespace pass below then collapses.
    text = re.sub(r'\[[^\]]*\]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def tokenize_sentences(text):
    """Split ``text`` into sentences using the module-level spaCy pipeline ``nlp``.

    Args:
        text: The text to segment.

    Returns:
        A list with the whitespace-stripped text of each sentence span that
        ``nlp(text).sents`` yields, in document order.
    """
    parsed = nlp(text)
    sentence_texts = []
    for span in parsed.sents:
        sentence_texts.append(span.text.strip())
    return sentence_texts

# Smoke test: run the first training article through cleaning and sentence splitting.
first_record = train_data[0]
article = first_record['article']
cleaned = clean_text(article)
sentences = tokenize_sentences(cleaned)

# Print short previews only: 300 characters of cleaned text, five sentences.
cleaned_preview = cleaned[:300]
sentence_preview = sentences[:5]
print("Cleaned Text (first 300 characters):\n", cleaned_preview)
print("\nTokenized Sentences:\n", sentence_preview)


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Cleaned Text (first 300 characters):
 LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disap

# Step 3: Extractive Summarization using spaCy + TF-IDF


In [6]:
!pip install scikit-learn

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def extractive_summary(text, n=3):
    """Build an extractive summary from the ``n`` highest-scoring sentences of ``text``.

    The text is cleaned with ``clean_text`` and split with ``tokenize_sentences``.
    Each sentence is scored by the sum of its TF-IDF weights, where every
    sentence is treated as one document. The top ``n`` sentences are returned in
    their original order.

    Args:
        text: The article text to summarize.
        n: Number of sentences to keep. Values ``<= 0`` yield an empty summary.

    Returns:
        The selected sentences joined by single spaces. If the text has ``n`` or
        fewer sentences, all of them are returned unchanged.
    """
    # Guard first: with n == 0 the slice [-0:] below would select *every*
    # sentence rather than none.
    if n <= 0:
        return ""

    cleaned_text = clean_text(text)
    sentences = tokenize_sentences(cleaned_text)

    # Nothing to rank when every sentence would be selected anyway.
    if len(sentences) <= n:
        return " ".join(sentences)

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(sentences)
    # Row sums of the sentence-by-term matrix give one score per sentence.
    scores = np.asarray(tfidf_matrix.sum(axis=1)).ravel()

    # argsort is ascending, so the last n indices are the best-scoring ones;
    # sorting those indices restores document order.
    top_n_idx = np.argsort(scores)[-n:]
    top_n_sentences = [sentences[i] for i in sorted(top_n_idx)]

    return " ".join(top_n_sentences)

# Try the extractive summarizer on the first training article.
first_article = train_data[0]['article']
summary = extractive_summary(first_article)
print("Extractive Summary:\n", summary)


Extractive Summary:
 Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart.


# Step 4: Abstractive Summarization using Transformers


In [7]:
# STEP 5: Abstractive Summarization using Transformers
!pip install transformers torch --quiet

from transformers import pipeline

# Load the summarization pipeline using PyTorch
tokenizer_model = "facebook/bart-large-cnn"
summarizer = pipeline(
    "summarization",
    model=tokenizer_model,
    tokenizer=tokenizer_model,
    framework="pt"
)

# Use article from previous steps (truncate to fit BART's max token length)
article = train_data[0]['article']
input_text = article[:1024]  # truncate safely

# Generate summary
abstractive_sum = summarizer(
    input_text,
    max_length=130,
    min_length=30,
    do_sample=False,
    truncation=True
)

print("Abstractive Summary:\n", abstractive_sum[0]['summary_text'])


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Abstractive Summary:
 Harry Potter star Daniel Radcliffe turns 18 on Monday. He gains access to a reported £20 million ($41.1 million) fortune. Radcliffe says he has no plans to fritter his cash away on fast cars, drink.


# Step 5: Evaluation using ROUGE Score


In [8]:
!pip install rouge-score --quiet

from rouge_score import rouge_scorer

reference = train_data[0]['highlights']
predicted = abstractive_sum[0]['summary_text']

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(reference, predicted)

print("\nROUGE Evaluation Scores:")
for key, value in scores.items():
    print(f"{key}: Precision={value.precision:.4f}, Recall={value.recall:.4f}, F1={value.fmeasure:.4f}")



ROUGE Evaluation Scores:
rouge1: Precision=0.5833, Recall=0.5385, F1=0.5600
rouge2: Precision=0.4000, Recall=0.3684, F1=0.3836
rougeL: Precision=0.5000, Recall=0.4615, F1=0.4800
