# Text Summarization Project


In [6]:
!pip install transformers datasets torch rouge-score accelerate pandas numpy
!pip install transformers[torch] -U
!pip install accelerate -U
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=1de150359787844b17db36c3d53c0494090f5636118f3341e80b112dd118fb5e
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Collecting transformers[torch]
  Downloading transformers-4.53.1-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers[torch])
  Downloading huggingface_hub-0.33.2-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers

In [8]:
# Import libraries
import torch
import pandas as pd
import numpy as np
from collections import Counter
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from rouge_score import rouge_scorer

## View Dataset Details

In [9]:

dataset = load_dataset("cnn_dailymail", "3.0.0", split="train[:2000]")  

print("Dataset Info:")
print(dataset)

articles = dataset['article']
highlights = dataset['highlights']
print("\nDataset Statistics:")
print(f"Number of samples: {len(dataset)}")
print(f"Average article length (words): {np.mean([len(article.split()) for article in articles]):.2f}")
print(f"Average summary length (words): {np.mean([len(highlight.split()) for highlight in highlights]):.2f}")

Dataset Info:
Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 2000
})

Dataset Statistics:
Number of samples: 2000
Average article length (words): 601.81
Average summary length (words): 43.15


## Preprocessing with Deduplication and Normalization

In [11]:

def deduplicate_dataset(dataset):
    unique_articles = Counter(dataset['article'])
    unique_indices = [i for i, article in enumerate(dataset['article']) if unique_articles[article] == 1 or list(unique_articles.keys()).index(article) == i]
    return dataset.select(unique_indices)

deduped_dataset = deduplicate_dataset(dataset)
print(f"Dataset size after deduplication: {len(deduped_dataset)}")


model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    # Normalize: lowercase and strip whitespace
    inputs = ["summarize: " + doc.lower().strip() for doc in examples['article']]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['highlights'], max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing
encoded_dataset = deduped_dataset.map(preprocess_function, batched=True)

# Split into train and test sets
train_dataset = encoded_dataset.shuffle(seed=42).select(range(int(0.8 * len(encoded_dataset))))
test_dataset = encoded_dataset.shuffle(seed=42).select(range(int(0.8 * len(encoded_dataset)), len(encoded_dataset)))

print(f"Training set size: {len(train_dataset)}")
print(f"Test set size: {len(test_dataset)}")

Dataset size after deduplication: 1784
Training set size: 1427
Test set size: 357


## Model Setup

In [12]:
model = T5ForConditionalGeneration.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

## Training

In [14]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=3,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"
)

# Initialize Trainer
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=None,  # Default collator handles 'labels'
    tokenizer=tokenizer,
    # Removed: label_names=["labels"]
)

# Train the model
trainer.train()

  # else its likely a filename if supported
  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,1.2242,1.07089
2,1.1246,1.06002
3,1.043,1.06135




TrainOutput(global_step=537, training_loss=1.2083437065394453, metrics={'train_runtime': 155.0088, 'train_samples_per_second': 27.618, 'train_steps_per_second': 3.464, 'total_flos': 579398252101632.0, 'train_loss': 1.2083437065394453, 'epoch': 3.0})

## Evaluation After Fine-Tuning

In [16]:
# Generate summaries for evaluation
device = torch.device("cpu")  # Use CPU for MacBook
model.to(device)

def generate_summary_batch(batch):
    with torch.no_grad():
        input_ids = tokenizer(batch["article"], padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
        output = model.generate(
            input_ids["input_ids"],
            max_length=150,
            num_beams=5,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
            do_sample=True,
            early_stopping=True
        )
        summaries = tokenizer.batch_decode(output, skip_special_tokens=True)
    return {"summary": summaries}

summaries = test_dataset.map(generate_summary_batch, batched=True, batch_size=8)

# Calculate ROUGE scores
def calculate_rouge(reference_list, generated_list):
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    rouge_scores = {"rouge1": 0, "rouge2": 0, "rougeL": 0}
    for ref, gen in zip(reference_list, generated_list):
        scores = scorer.score(ref, gen)
        for key in rouge_scores:
            rouge_scores[key] += scores[key].fmeasure
    for key in rouge_scores:
        rouge_scores[key] /= len(reference_list)
    return rouge_scores

reference_summaries = [example["highlights"] for example in test_dataset]
generated_summaries = summaries["summary"]  # Corrected line

rouge_scores = calculate_rouge(reference_summaries, generated_summaries)
print("Post-Fine-Tuning ROUGE Scores:")
print(f"Average ROUGE-1: {rouge_scores['rouge1']:.4f}")
print(f"Average ROUGE-2: {rouge_scores['rouge2']:.4f}")
print(f"Average ROUGE-L: {rouge_scores['rougeL']:.4f}")

Post-Fine-Tuning ROUGE Scores:
Average ROUGE-1: 0.3191
Average ROUGE-2: 0.1172
Average ROUGE-L: 0.2229


## Hyperparameter Tuning

In [23]:
# Define generate_summary_batch to ensure CPU usage
def generate_summary_batch(batch):
    with torch.no_grad():
        input_ids = tokenizer(batch["article"], padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
        output = model.generate(
            input_ids["input_ids"],
            max_length=150,
            num_beams=5,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
            do_sample=True,
            early_stopping=True
        )
        summaries = tokenizer.batch_decode(output, skip_special_tokens=True)
    return {"summary": summaries}

# Hyperparameter tuning (grid search)
learning_rates = [1e-4, 3e-4]
batch_sizes = [2, 4]
best_metrics = None
best_lr = None
best_bs = None
best_model_path = None

device = torch.device("cuda")  # Ensure CPU usage
torch.cuda.empty_cache()  # Clear any CUDA cache

for lr in learning_rates:
    for bs in batch_sizes:
        print(f"\nTuning with learning_rate={lr}, batch_size={bs}")
        training_args = TrainingArguments(
            output_dir=f"./results_lr_{lr}_bs_{bs}",
            eval_strategy="epoch",
            learning_rate=lr,
            per_device_train_batch_size=bs,
            per_device_eval_batch_size=bs,
            num_train_epochs=1,
            weight_decay=0.01,
            save_total_limit=1,
            logging_dir=f"./logs_lr_{lr}_bs_{bs}",
            logging_steps=10,
            report_to="none"
        )
        model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
        model.to(device)  # Double-check model is on CPU
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            data_collator=None,
            processing_class=tokenizer
        )
        trainer.train()
        summaries = test_dataset.map(generate_summary_batch, batched=True, batch_size=8)
        generated_summaries = summaries["summary"]
        rouge_scores = calculate_rouge(reference_summaries, generated_summaries)
        metrics = {"eval_rouge1": rouge_scores['rouge1'], "eval_rouge2": rouge_scores['rouge2'], "eval_rougeL": rouge_scores['rougeL']}
        print(f"Metrics for lr={lr}, bs={bs}: {metrics}")
        if best_metrics is None or metrics['eval_rouge1'] > best_metrics['eval_rouge1']:
            best_metrics = metrics
            best_lr = lr
            best_bs = bs
            best_model_path = f"./results_lr_{lr}_bs_{bs}/best_model"
            trainer.save_model(best_model_path)

print(f"\nBest Hyperparameters: learning_rate={best_lr}, batch_size={best_bs}")
print(f"Best Metrics: {best_metrics}")

# Load best model
model = T5ForConditionalGeneration.from_pretrained(best_model_path).to(device)
tokenizer = T5Tokenizer.from_pretrained(best_model_path)


Tuning with learning_rate=0.0001, batch_size=2




Epoch,Training Loss,Validation Loss
1,1.272,1.090717




Map:   0%|          | 0/357 [00:00<?, ? examples/s]

Metrics for lr=0.0001, bs=2: {'eval_rouge1': 0.30371035675317, 'eval_rouge2': 0.10734313969488142, 'eval_rougeL': 0.21192910030368214}

Tuning with learning_rate=0.0001, batch_size=4




Epoch,Training Loss,Validation Loss
1,1.3019,1.127873




Map:   0%|          | 0/357 [00:00<?, ? examples/s]

Metrics for lr=0.0001, bs=4: {'eval_rouge1': 0.2971990753138704, 'eval_rouge2': 0.10535934405959128, 'eval_rougeL': 0.20685579900832413}

Tuning with learning_rate=0.0003, batch_size=2




Epoch,Training Loss,Validation Loss
1,1.2372,1.071153




Map:   0%|          | 0/357 [00:00<?, ? examples/s]

Metrics for lr=0.0003, bs=2: {'eval_rouge1': 0.3194840316372372, 'eval_rouge2': 0.11713891447382245, 'eval_rougeL': 0.2237683616910083}

Tuning with learning_rate=0.0003, batch_size=4




Epoch,Training Loss,Validation Loss
1,1.2307,1.078225




Map:   0%|          | 0/357 [00:00<?, ? examples/s]

Metrics for lr=0.0003, bs=4: {'eval_rouge1': 0.3158887426640696, 'eval_rouge2': 0.11568756738285062, 'eval_rougeL': 0.2194648552131153}

Best Hyperparameters: learning_rate=0.0003, batch_size=2
Best Metrics: {'eval_rouge1': 0.3194840316372372, 'eval_rouge2': 0.11713891447382245, 'eval_rougeL': 0.2237683616910083}


## Inference

In [24]:
# Inference function
def summarize_text(text, model, tokenizer, max_length=150):
    inputs = tokenizer("summarize: " + text.lower().strip(), return_tensors="pt", max_length=512, truncation=True).to(device)
    with torch.no_grad():
        summary_ids = model.generate(
            inputs["input_ids"],
            max_length=max_length,
            num_beams=5,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
            do_sample=True,
            early_stopping=True
        )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Display examples
for i in range(3):
    print(f"\nExample {i+1}:")
    print("Article:", test_dataset[i]["article"][:500] + "...")
    print("Original Summary:", test_dataset[i]["highlights"])
    print("Generated Summary:", summarize_text(test_dataset[i]["article"], model, tokenizer))
    print("\n")


Example 1:
Article: (CNN)  -- Canadian Prime Minister Stephen Harper said Thursday that Canada's governor general has allowed him to suspend Parliament, postponing a no-confidence vote from his opponents that he was likely to lose. Canadian Prime Minister Stephen Harper says Parliament will resume on January 26. Harper called on his opponents to work with his government on measures to aid the nation's economy when Parliament returns on January 26. "The first order of business will be the presentation of a federal b...
Original Summary: NEW: Opposition accuses PM Harper of putting his job ahead of Canada's interests .
Move postpones opposition parties' plan for no-confidence vote next week .
Liberal and New Democratic parties join with Bloc Quebecois to try to unseat Tories .
Vote likely would have brought down Canada's Conservative government .
Generated Summary: Canadian prime minister stephen Harper says he will suspend parliament. Harper calls on opponents to work on measures to ai