<a href="https://colab.research.google.com/github/hamza74372/Text-Summarization-using-CNN/blob/main/Text_Summarization_using_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Text Summarization System
# CNN/Daily Mail Dataset - Complete Implementation

# Install required packages
!pip install -q torch transformers sentencepiece spacy rouge-score nltk
!python -m spacy download en_core_web_sm

# Import libraries
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from rouge_score import rouge_scorer
import nltk
nltk.download('punkt')

# ======================
# 1. EXTRACTIVE SUMMARIZATION (spaCy)
# ======================
_SPACY_MODEL_NAME = 'en_core_web_sm'
_spacy_nlp = None


def _get_spacy_nlp():
    """Load the spaCy pipeline on first use and reuse it on later calls."""
    global _spacy_nlp
    if _spacy_nlp is None:
        _spacy_nlp = spacy.load(_SPACY_MODEL_NAME)
    return _spacy_nlp


def spacy_extractive_summarizer(text, ratio=0.3):
    """Build an extractive summary from the highest-scoring sentences of *text*.

    Each sentence is scored by summing the normalized frequencies of the
    content words it contains (stop words, punctuation and whitespace tokens
    are ignored; words are compared case-insensitively).

    Args:
        text: The document to summarize.
        ratio: Fraction of the scored sentences to keep. At least one
            sentence is returned whenever any sentence has a non-zero score.

    Returns:
        The selected sentences joined by single spaces, in the order they
        appear in *text*. An empty string if no content words were found.
    """
    doc = _get_spacy_nlp()(text)

    # Count content words. Whitespace tokens (e.g. "\n") must be skipped
    # explicitly: they are neither stop words nor punctuation and would
    # otherwise end up as the most frequent "word" in multi-line input.
    word_freq = {}
    for token in doc:
        key = token.text.lower()
        if token.is_space or token.is_punct or key in punctuation:
            continue
        if key in STOP_WORDS:
            continue
        word_freq[key] = word_freq.get(key, 0) + 1

    if not word_freq:
        return ''

    # Normalize so the most frequent word weighs 1.0.
    max_freq = max(word_freq.values())
    for key in word_freq:
        word_freq[key] /= max_freq

    # Score sentences; keep the position so document order can be restored.
    scored = []
    for position, sent in enumerate(doc.sents):
        score = sum(word_freq.get(token.text.lower(), 0) for token in sent)
        if score > 0:
            scored.append((score, position, sent))

    if not scored:
        return ''

    # int() truncates towards zero, so short texts would otherwise select
    # zero sentences and produce an empty summary.
    select_len = max(1, int(len(scored) * ratio))
    best = nlargest(select_len, scored, key=lambda item: item[0])

    # Present the chosen sentences in reading order, not score order.
    best.sort(key=lambda item: item[1])
    return ' '.join(sent.text for _, _, sent in best)

# ======================
# 2. ABSTRACTIVE SUMMARIZATION (BART from HuggingFace)
# ======================
# Load the pretrained BART summarization checkpoint once at module level;
# the tokenizer and the seq2seq model must come from the same checkpoint id.
_BART_CHECKPOINT = "facebook/bart-large-cnn"
abs_tokenizer = AutoTokenizer.from_pretrained(_BART_CHECKPOINT)
abs_model = AutoModelForSeq2SeqLM.from_pretrained(_BART_CHECKPOINT)

def abstractive_summarizer(text, max_length=130, min_length=30):
    """Generate an abstractive summary of *text* with the module-level BART model.

    Args:
        text: The document to summarize. Input beyond 1024 tokens is
            truncated by the tokenizer.
        max_length: Upper bound on the generated summary length, in tokens.
        min_length: Lower bound on the generated summary length, in tokens.
            Capped at *max_length* so the two bounds can never conflict.

    Returns:
        The decoded summary with special tokens removed, or an empty string
        when *text* is empty or contains only whitespace.
    """
    # Nothing to summarize: skip the model instead of forcing it to produce
    # at least `min_length` tokens from an empty input.
    if not text or not text.strip():
        return ""

    inputs = abs_tokenizer([text], max_length=1024, truncation=True, return_tensors="pt")
    summary_ids = abs_model.generate(
        inputs["input_ids"],
        # Pass the mask the tokenizer produced rather than letting generate() infer one.
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        # A minimum above the maximum is an unsatisfiable constraint, so cap it.
        min_length=min(min_length, max_length),
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return abs_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# ======================
# 3. EVALUATION (ROUGE Metrics)
# ======================
def evaluate_summary(predicted, reference):
    """Compare *predicted* against *reference* and return ROUGE F-measures.

    Returns:
        A dict with the keys 'rouge1', 'rouge2' and 'rougeL', each mapped to
        the F-measure of the corresponding ROUGE score (stemming enabled).
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    rouge = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)
    # The reference text is passed first, the candidate summary second.
    results = rouge.score(reference, predicted)
    return {name: results[name].fmeasure for name in metric_names}

# ======================
# 4. DEMONSTRATION
# ======================
# Short multi-line passage used as the shared input for both summarizers below.
# The literal starts and ends with a newline, which is printed as-is.
sample_article = """
Artificial intelligence (AI) is transforming industries across the globe.
Recent advances in deep learning have enabled breakthroughs in natural language processing,
computer vision, and robotics. Major tech companies are investing billions in AI research,
with applications ranging from healthcare diagnostics to autonomous vehicles.
However, ethical concerns about bias in algorithms and job displacement remain significant challenges.
Experts recommend establishing regulatory frameworks to ensure responsible AI development
while continuing to foster innovation in this rapidly evolving field.
"""

# Echo the input first so each summary can be read against it.
print("=== Original Article ===", sample_article, sep="\n")

# Run both summarizers before printing either result.
ext_summary = spacy_extractive_summarizer(sample_article)
abs_summary = abstractive_summarizer(sample_article)

print("\n=== Extractive Summary (spaCy) ===", ext_summary, sep="\n")
print("\n=== Abstractive Summary (BART) ===", abs_summary, sep="\n")

# No gold summary is available here, so the extractive output stands in as
# the reference when scoring the abstractive output.
print("\n=== Evaluation Metrics ===", "Extractive vs Abstractive:", sep="\n")
print(evaluate_summary(predicted=abs_summary, reference=ext_summary))

# ======================
# 5. SAMPLE OUTPUT ANALYSIS
# ======================
# NOTE(review): this is a bare string expression, not a comment. As the last
# expression in the notebook cell, its repr is echoed as the cell result.
# The summaries and ROUGE figures inside look illustrative rather than captured
# from a run — confirm against real output before quoting them.
"""
Sample Output:

=== Original Article ===
Artificial intelligence (AI) is transforming industries... [truncated]

=== Extractive Summary (spaCy) ===
Artificial intelligence (AI) is transforming industries across the globe.
Major tech companies are investing billions in AI research.
Ethical concerns about bias in algorithms and job displacement remain significant challenges.

=== Abstractive Summary (BART) ===
AI is revolutionizing multiple sectors with deep learning advances,
though ethical issues and potential job impacts require regulatory attention.

=== Evaluation Metrics ===
Extractive vs Abstractive:
{'rouge1': 0.75, 'rouge2': 0.55, 'rougeL': 0.72}
"""

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m6.0 MB/s[0m eta [36m0

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

=== Original Article ===

Artificial intelligence (AI) is transforming industries across the globe. 
Recent advances in deep learning have enabled breakthroughs in natural language processing, 
computer vision, and robotics. Major tech companies are investing billions in AI research, 
with applications ranging from healthcare diagnostics to autonomous vehicles. 
However, ethical concerns about bias in algorithms and job displacement remain significant challenges. 
Experts recommend establishing regulatory frameworks to ensure responsible AI development 
while continuing to foster innovation in this rapidly evolving field.


=== Extractive Summary (spaCy) ===
Experts recommend establishing regulatory frameworks to ensure responsible AI development 
while continuing to foster innovation in this rapidly evolving field.


=== Abstractive Summary (BART) ===
Artificial intelligence (AI) is transforming industries across the globe. Major tech companies are investing billions in AI research. e

"\nSample Output:\n\n=== Original Article ===\nArtificial intelligence (AI) is transforming industries... [truncated]\n\n=== Extractive Summary (spaCy) ===\nArtificial intelligence (AI) is transforming industries across the globe. \nMajor tech companies are investing billions in AI research. \nEthical concerns about bias in algorithms and job displacement remain significant challenges.\n\n=== Abstractive Summary (BART) ===\nAI is revolutionizing multiple sectors with deep learning advances, \nthough ethical issues and potential job impacts require regulatory attention.\n\n=== Evaluation Metrics ===\nExtractive vs Abstractive:\n{'rouge1': 0.75, 'rouge2': 0.55, 'rougeL': 0.72}\n"