<a href="https://colab.research.google.com/github/leman-cap13/my_projects/blob/main/BBC_News_Summary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Loading

In [None]:
# Colab-only step: prompt for a file upload so the Kaggle API token
# (kaggle.json, copied into ~/.kaggle by the next cell) is available in the runtime.
from google.colab import files
files.upload()

In [None]:
# Put the uploaded kaggle.json into ~/.kaggle for the Kaggle CLI call in the next cell.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the BBC News Summary archive (unzipped from /content/bbc-news-summary.zip in a later cell).
!kaggle datasets download pariza/bbc-news-summary

In [None]:
import zipfile

In [None]:
# Unpack the downloaded archive into the current working directory.
archive_path = '/content/bbc-news-summary.zip'
with zipfile.ZipFile(archive_path) as archive:
    archive.extractall()

In [None]:
import pandas as pd

In [None]:
import os

# Inspect the top-level layout of the extracted articles directory.
folder_path = '/content/BBC News Summary/News Articles'
# os.listdir returns entries in arbitrary order; sort them so the printed
# listing is stable between runs.
file_list = sorted(os.listdir(folder_path))

print(file_list)

In [None]:
import os

# Inspect the top-level layout of the extracted summaries directory.
folder_path = '/content/BBC News Summary/Summaries'
# os.listdir returns entries in arbitrary order; sort them so the printed
# listing is stable between runs.
file_list = sorted(os.listdir(folder_path))

print(file_list)


In [None]:
import os
import pandas as pd

base_path = '/content/BBC News Summary/News Articles'
categories = ['sport', 'business', 'tech', 'politics', 'entertainment']


def _read_article_text(path):
    """Return the text of `path`, decoded as UTF-8 with undecodable bytes skipped."""
    with open(path, 'r', encoding='utf-8', errors='ignore') as handle:
        return handle.read()


# One record per article file, tagged with the category folder it lives in.
data = [
    {
        'category': category,
        'content': _read_article_text(os.path.join(base_path, category, filename)),
    }
    for category in categories
    for filename in os.listdir(os.path.join(base_path, category))
]

df = pd.DataFrame(data)


In [None]:
# Display the article frame built above (columns: category, content).
df

In [None]:
# First rows of the article frame, followed by the number of articles per category.
category_counts = df['category'].value_counts()
print(df.head())
print(category_counts)


In [None]:
base_path = '/content/BBC News Summary/Summaries'
categories = ['sport', 'business', 'tech', 'politics', 'entertainment']

# Walk every category folder and collect one record per summary file.
data = []
for category in categories:
    category_dir = os.path.join(base_path, category)
    for name in os.listdir(category_dir):
        # Undecodable bytes are skipped rather than raising (errors='ignore').
        with open(os.path.join(category_dir, name), 'r', encoding='utf-8', errors='ignore') as handle:
            data.append({'category': category, 'content': handle.read()})

df_sum = pd.DataFrame(data)

In [None]:
# Display the summary frame built above (columns: category, content).
df_sum

In [None]:
# Number of summaries per category.
df_sum['category'].value_counts()

In [None]:
def load_text_files(base_dir, categories, text_column):
    """Read every file under ``base_dir/<category>/`` into a DataFrame.

    Parameters
    ----------
    base_dir : str
        Directory containing one sub-folder per category.
    categories : list of str
        Names of the category sub-folders to read.
    text_column : str
        Name of the column that receives the file contents.

    Returns
    -------
    pandas.DataFrame
        One row per file with columns ``filename``, ``category`` and
        ``text_column``.
    """
    records = []
    for category in categories:
        folder_path = os.path.join(base_dir, category)
        # os.listdir returns entries in arbitrary order. Sorting fixes the row
        # order, so the seeded train/validation split is reproducible.
        for filename in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, filename)
            # errors='ignore' silently drops bytes that are not valid UTF-8.
            # NOTE(review): if the corpus is not UTF-8 encoded, characters are lost here; confirm the encoding.
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                records.append({
                    'filename': filename,
                    'category': category,
                    text_column: file.read()
                })
    # Explicit columns keep the frame mergeable even when no file was found.
    return pd.DataFrame(records, columns=['filename', 'category', text_column])


# Loading Articles
df_articles = load_text_files('/content/BBC News Summary/News Articles', categories, 'content')

# Loading Summaries
df_summaries = load_text_files('/content/BBC News Summary/Summaries', categories, 'summary')

# Merge on filename and category. validate='one_to_one' raises if a key is
# duplicated on either side instead of silently multiplying rows.
df_full = pd.merge(
    df_articles,
    df_summaries,
    on=['filename', 'category'],
    validate='one_to_one'
)

# The inner merge drops files without a counterpart; make that visible.
unmatched = len(df_articles) + len(df_summaries) - 2 * len(df_full)
if unmatched:
    print(f"Warning: {unmatched} article/summary file(s) had no counterpart and were dropped by the merge.")


In [None]:
# The filename was only needed as a merge key. Rebinding with errors='ignore'
# (instead of an in-place drop) keeps this cell safe to re-run: a second run
# is a no-op rather than a KeyError.
df_full = df_full.drop(columns='filename', errors='ignore')

In [None]:
# Display the merged article/summary frame.
df_full

# Data Cleaning

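The list below is the classic NLP cleaning checklist. For summarization only a light version is applied by `preprocess_text` further down: lowercasing, removing bracketed text such as `[...]`, and collapsing whitespace. The remaining steps are listed for reference but are not applied, because removing stopwords or punctuation hurts a summarization model: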

Lowercasing

Removing punctuation

Removing numbers

Removing stopwords

Tokenization

Lemmatization

In [None]:
import nltk

In [None]:
nltk.download('stopwords') # English stop-word list. NOTE: for text summarization tasks, removing stopwords is NOT a good idea.
nltk.download('punkt')     # Tokenizer models. NOTE (presumably about punctuation): removing it is not a good idea for summarization either.
nltk.download('wordnet')   # WordNet lexical database.

`punkt` is a pre-trained sentence tokenizer. It helps NLTK split text into sentences and words.

`nltk.download('stopwords')` downloads a list of common English stopwords (like "the", "is", "and", etc.).

WordNet is a lexical database of English. It contains meanings (synsets), relationships, and morphological information about words.

In [None]:
import re

def preprocess_text(text):
    """Normalize raw text for the summarization model.

    The text is lowercased, every single-line ``[...]`` span is removed, and
    each run of whitespace is collapsed to one space, with leading and
    trailing whitespace stripped.
    """
    lowered = text.lower()
    # Non-greedy match: each bracket pair is removed on its own.
    without_brackets = re.sub(r'\[.*?\]', '', lowered)
    return re.sub(r'\s+', ' ', without_brackets).strip()

In [None]:
# Add cleaned copies of both text columns, one element at a time.
for raw_col, clean_col in (('content', 'clean_content'), ('summary', 'clean_summary')):
    df_full[clean_col] = df_full[raw_col].map(preprocess_text)


In [None]:
# Keep only the cleaned text columns. Rebinding with errors='ignore' (instead
# of an in-place drop) keeps this cell safe to re-run: a second run is a no-op
# rather than a KeyError.
df_full = df_full.drop(columns=['content', 'summary'], errors='ignore')

In [None]:
# Cleaned article text of the row labelled 0.
df_full.loc[0, 'clean_content']

In [None]:
# Cleaned reference summary of the row labelled 0.
df_full.loc[0, 'clean_summary']

In [None]:
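# Plan
# 1. Tokenize the texts with AutoTokenizer
# 2. Build DataLoaders from TensorDatasets
# 3. Set up the optimizer and Accelerator (the loss comes from the model output)
# 4. Custom training loop
# 5. Generate summaries with model.generate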

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the article/summary pairs for validation; the fixed seed
# keeps the split repeatable for a given row order.
train_df, val_df = train_test_split(
    df_full,
    random_state=42,
    test_size=0.1,
)

# 1. Tokenization

In [None]:
from transformers import AutoTokenizer

# Hub checkpoint name; the tokenizer is loaded from it here.
model_checkpoint = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Plain Python lists of strings for the tokenizer.
train_inputs = train_df['clean_content'].tolist()   # articles, passed without any task prefix
train_targets = train_df['clean_summary'].tolist()  # reference summaries



The `"summarize: "` prefix is specific to T5: T5 is trained as a general-purpose text-to-text transformer and selects the task through prefixes such as `"translate English to German:"` or `"summarize:"`.

For Pegasus, you should remove the `"summarize: "` prefix, as done in the cell above. For reference, the T5 variant would look like this:

In [None]:
# For T5, you need to prefix your input with "summarize: ":
# train_inputs = ["summarize: " + text for text in train_df['clean_content']]
# train_targets = list(train_df['clean_summary'])


In [None]:
# First training article (cleaned text).
train_inputs[0]

In [None]:
# First training reference summary (cleaned text).
train_targets[0]

In [None]:
# Settings shared by both calls: pad every sequence to max_length, cut longer
# ones, and return PyTorch tensors.
pad_truncate = dict(padding='max_length', truncation=True, return_tensors='pt')

# Articles are limited to 256 tokens, reference summaries to 128.
inputs_tokenized = tokenizer(train_inputs, max_length=256, **pad_truncate)
targets_tokenized = tokenizer(train_targets, max_length=128, **pad_truncate)

In [None]:
# Token ids of the tokenized training articles.
inputs_tokenized['input_ids']

In [None]:
# Attention mask returned by the tokenizer for the training articles.
inputs_tokenized['attention_mask']

In [None]:
# Token ids of the tokenized training summaries.
targets_tokenized['input_ids']

In [None]:
# Label positions set to -100 are ignored by the loss of Hugging Face seq2seq
# models (Pegasus here, not only T5), so padding must not count as a target.
# masked_fill returns a new tensor: targets_tokenized['input_ids'] keeps its
# real pad ids and stays decodable, and re-running this cell is harmless.
target_ids = targets_tokenized['input_ids']
labels = target_ids.masked_fill(target_ids == tokenizer.pad_token_id, -100)

In [None]:
# Validation texts as plain Python lists.
val_inputs = list(val_df['clean_content'])
val_targets = list(val_df['clean_summary'])

# Same limits as for the training set: 256 tokens per article, 128 per summary.
val_inputs_tokenized = tokenizer(
    val_inputs,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_tensors='pt')

val_targets_tokenized = tokenizer(
    val_targets,
    max_length=128,
    padding='max_length',
    truncation=True,
    return_tensors='pt')

# Replace padding with -100 so those positions are ignored by the loss.
# masked_fill returns a new tensor, so val_targets_tokenized['input_ids']
# is not modified through an alias and stays decodable.
val_target_ids = val_targets_tokenized['input_ids']
val_labels = val_target_ids.masked_fill(val_target_ids == tokenizer.pad_token_id, -100)


# 2. DataLoader with TensorDataset

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Each sample is a triple (input_ids, attention_mask, labels); the three
# tensors are indexed together along their first dimension.
train_dataset = TensorDataset(
    inputs_tokenized['input_ids'],
    inputs_tokenized['attention_mask'],
    labels
)

val_dataset = TensorDataset(
    val_inputs_tokenized['input_ids'],
    val_inputs_tokenized['attention_mask'],
    val_labels
)


In [None]:
# Show the repr of the training TensorDataset.
train_dataset

In [None]:
batch_size = 2

# Training batches are reshuffled on every pass over the data; validation
# batches keep the dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Show the repr of the training DataLoader.
train_loader

In [None]:
# Sanity-check the shapes of one training batch.
# Dedicated names are used on purpose: unpacking into `labels` would overwrite
# the full label tensor the datasets were built from, and re-running the
# dataset cell afterwards would fail.
sample_input_ids, sample_attention_mask, sample_labels = next(iter(train_loader))
print(sample_input_ids.shape, sample_attention_mask.shape, sample_labels.shape)

# Model, Optimizer, Accelerator

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Load the model from the same checkpoint variable as the tokenizer
# ("google/pegasus-cnn_dailymail") so the two can never drift apart.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
from torch.optim import AdamW

# AdamW over all model parameters with a fixed learning rate.
learning_rate = 5e-5
optimizer = AdamW(model.parameters(), lr=learning_rate)

In [None]:
from accelerate import Accelerator

accelerator = Accelerator()

# prepare() returns the objects in the order they were passed in; from here on
# the names refer to the prepared versions.
prepared = accelerator.prepare(model, optimizer, train_loader, val_loader)
model, optimizer, train_loader, val_loader = prepared

In [None]:
from tqdm import tqdm
from torch.nn.utils import clip_grad_norm_  # kept for compatibility; the loop clips through the Accelerator

num_epochs = 3
num_training_steps = num_epochs * len(train_loader)

progress_bar = tqdm(range(num_training_steps))
train_losses = []  # mean training loss per epoch
val_losses = []    # mean validation loss per epoch

for epoch in range(num_epochs):
    # ---- Training ----
    model.train()
    total_train_loss = 0

    # Batch tensors get their own names so the notebook-level `labels` tensor
    # (used to build the datasets) is not overwritten.
    for batch_input_ids, batch_attention_mask, batch_labels in train_loader:
        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_mask,
            labels=batch_labels
        )
        loss = outputs.loss
        accelerator.backward(loss)

        # Clip through the Accelerator rather than torch directly, as
        # Accelerate recommends for prepared models.
        accelerator.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        optimizer.zero_grad()

        total_train_loss += loss.item()
        progress_bar.update(1)

    avg_train_loss = total_train_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # ---- Evaluation ----
    model.eval()
    total_val_loss = 0

    with torch.no_grad():
        for batch_input_ids, batch_attention_mask, batch_labels in val_loader:
            outputs = model(
                input_ids=batch_input_ids,
                attention_mask=batch_attention_mask,
                labels=batch_labels
            )
            total_val_loss += outputs.loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    val_losses.append(avg_val_loss)

    # tqdm.write prints without breaking the progress bar.
    tqdm.write(
        f"Epoch {epoch + 1}/{num_epochs} - "
        f"train loss: {avg_train_loss:.4f} - val loss: {avg_val_loss:.4f}"
    )

progress_bar.close()



In [None]:
!pip install seaborn

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# Epoch numbers (1-based) for the x-axis.
epochs = range(1, len(train_losses) + 1)

# Mean training and validation loss per epoch on a single axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(epochs, train_losses, label='Training Loss', marker='o')
ax.plot(epochs, val_losses, label='Validation Loss', marker='s')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training vs Validation Loss')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


# Evaluate the model with .generate()

Generate summaries from your fine-tuned model

Compare them to reference summaries

Optionally: Evaluate with metrics like ROUGE

In [None]:
import torch
import textwrap

# The decoding below samples (do_sample=True); a fixed seed makes the printed
# summaries repeatable from run to run.
torch.manual_seed(42)

model.eval()
n_samples = min(5, len(val_df))  # number of samples to generate, capped at the validation size

for i in range(n_samples):
    input_text = val_df['clean_content'].iloc[i]
    reference_summary = val_df['clean_summary'].iloc[i]

    # A single article needs no padding. Keep the whole encoding so the
    # attention mask is passed to generate() explicitly instead of being inferred.
    # NOTE(review): training truncated articles to 256 tokens while 512 is used here; confirm this is intended.
    encoded = tokenizer(
        input_text,
        return_tensors='pt',
        max_length=512,
        truncation=True
    ).to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
            max_length=128,
            do_sample=True,        # enable sampling
            num_beams=4,           # beam search with 4 beams
            top_k=50,              # top-k sampling
            top_p=0.95,            # nucleus sampling
            temperature=0.9,       # temperature for randomness
            early_stopping=True
        )
        generated_summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    print("=" * 100)
    print(f"\n📰 ORIGINAL ARTICLE:\n{textwrap.fill(input_text[:300], width=100)}...")
    print(f"\n✅ REFERENCE SUMMARY:\n{textwrap.fill(reference_summary, width=100)}")
    print(f"\n🔁 GENERATED SUMMARY (sampled beam search):\n{textwrap.fill(generated_summary, width=100)}")



# ROUGE evaluation

In [None]:
!pip install rouge-score

In [None]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

generated_summaries = []
reference_summaries = []

# The decoding below samples (do_sample=True); a fixed seed makes the reported
# scores repeatable from run to run.
torch.manual_seed(42)

model.eval()
n_samples = min(5, len(val_df))  # or change as needed; capped at the validation size

for i in range(n_samples):
    input_text = val_df['clean_content'].iloc[i]
    reference_summary = val_df['clean_summary'].iloc[i]

    # A single article needs no padding. Keep the whole encoding so the
    # attention mask is passed to generate() explicitly instead of being inferred.
    encoded = tokenizer(
        input_text,
        return_tensors='pt',
        max_length=512,
        truncation=True
    ).to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
            max_length=128,
            do_sample=True,
            num_beams=4,
            top_k=50,
            top_p=0.95,
            temperature=0.9,
            early_stopping=True
        )
        generated_summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    generated_summaries.append(generated_summary)
    reference_summaries.append(reference_summary)

# Calculate ROUGE scores (F-measure per sample, averaged below).
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

for ref, gen in zip(reference_summaries, generated_summaries):
    # RougeScorer.score takes the reference first and the prediction second.
    scores = scorer.score(ref, gen)
    rouge1_scores.append(scores['rouge1'].fmeasure)
    rouge2_scores.append(scores['rouge2'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)

if rouge1_scores:
    print(f"Average ROUGE-1 F1 Score: {sum(rouge1_scores)/len(rouge1_scores):.4f}")
    print(f"Average ROUGE-2 F1 Score: {sum(rouge2_scores)/len(rouge2_scores):.4f}")
    print(f"Average ROUGE-L F1 Score: {sum(rougeL_scores)/len(rougeL_scores):.4f}")
else:
    # Empty validation set: avoid dividing by zero.
    print("No validation samples available; ROUGE scores were not computed.")


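Scores from the run above:

- ✅ ROUGE-1 ≈ 0.67 is very good
- ✅ ROUGE-2 ≈ 0.59 is excellent
- ✅ ROUGE-L ≈ 0.46 is solid

These averages are computed on only 5 validation samples, so treat them as a rough indication rather than a benchmark.

There is no need to chase 1.0: it is neither realistic nor necessary. To go further instead: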

Compare against human summaries.

Try BLEU, METEOR, or even BERTScore for deeper semantic comparison.

Perform qualitative evaluation: are your summaries coherent, concise, and factual?

In [None]:
input_text = "In a surprising turn of events, the government announced a new initiative aimed at tackling climate change by investing over $10 billion into renewable energy projects across the country. The plan includes the construction of wind farms, solar power stations, and funding for research into energy storage technologies. Experts believe this move could significantly reduce the nation’s carbon footprint over the next decade, although some critics argue that the plan lacks clear short-term implementation goals. Nevertheless, the announcement has been met with cautious optimism from environmental groups and industry leaders alike."

# A single text needs no padding. Keep the whole encoding so the attention
# mask is passed to generate() explicitly instead of being inferred.
encoded = tokenizer(
    input_text,
    return_tensors='pt',
    truncation=True,
    max_length=512).to(model.device)
input_ids = encoded['input_ids']

# Deterministic beam search (no sampling) with 4 beams.
with torch.no_grad():
    summary_ids = model.generate(
        input_ids,
        attention_mask=encoded['attention_mask'],
        max_length=100,
        num_beams=4,
        early_stopping=True)

summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary)


In [None]:
import textwrap

# Re-flow the generated summary to at most 100 characters per line before printing.
wrapped_summary = textwrap.fill(summary, width=100)
print(wrapped_summary)


In [None]:
import matplotlib.pyplot as plt

# Word counts (whitespace-separated) of every generated and reference summary.
gen_lengths = [len(text.split()) for text in generated_summaries]
ref_lengths = [len(text.split()) for text in reference_summaries]

# Overlay both distributions on one axes.
fig, ax = plt.subplots(figsize=(10, 5))
for lengths, series_label in ((gen_lengths, 'Generated'), (ref_lengths, 'Reference')):
    ax.hist(lengths, bins=10, alpha=0.7, label=series_label)
ax.set_xlabel('Number of Words')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Summary Lengths')
ax.legend()
plt.show()


In [None]:
# Only rows that have both a reference and a generated summary can be compared.
min_len = min(len(val_df['clean_content']), len(reference_summaries), len(generated_summaries))
print("Minimum length:", min_len)

df_eval = pd.DataFrame({
    "Original Text": val_df['clean_content'].iloc[:min_len].to_numpy(),
    "Reference Summary": reference_summaries[:min_len],
    "Generated Summary": generated_summaries[:min_len]
})


def _n_words(text):
    """Number of whitespace-separated words in `text`."""
    return len(text.split())


# Add one word-count column per text column.
for text_col, length_col in (
    ('Original Text', 'input_len'),
    ('Generated Summary', 'gen_len'),
    ('Reference Summary', 'ref_len'),
):
    df_eval[length_col] = df_eval[text_col].apply(_n_words)

import matplotlib.pyplot as plt
import seaborn as sns

# Summary length against article length, generated and reference on one axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='input_len', y='gen_len', data=df_eval, label='Generated', ax=ax)
sns.scatterplot(x='input_len', y='ref_len', data=df_eval, label='Reference', ax=ax)
ax.set_xlabel("Input Length (words)")
ax.set_ylabel("Summary Length (words)")
ax.set_title("Input vs Summary Length")
ax.legend()
ax.grid(True)
plt.show()
