In [1]:
!pip install --upgrade fsspec datasets

Collecting fsspec
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)


In [1]:
from datasets import load_dataset,Dataset,DatasetDict
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datasets import load_dataset

# Load the dataset (replace 'cnn_dailymail', '3.0.0' with your chosen dataset if different)
dataset = load_dataset('cnn_dailymail', '3.0.0')

print("1. Dataset Structure:")
print(dataset) # Shows splits, number of rows, and features

print("\n2. Column Names (Features):")
print(dataset['train'].column_names) # Shows the names of columns (e.g., 'article', 'highlights', 'id')

print("\n3. First Example from Training Split:")
# Access the first example. Use slicing for long texts to avoid overwhelming output.
first_example = dataset['train'][0]
print("Article (first 500 chars):")
print(first_example['article'][:500] + "...")
print("\nHighlights:")
print(first_example['highlights'])
print("\nID:", first_example['id'])

print("\n4. Another Example (e.g., 5th example from validation split):")
second_example = dataset['validation'][4] # Using index 4 for the 5th example
print("Article (first 500 chars):")
print(second_example['article'][:500] + "...")
print("\nHighlights:")
print(second_example['highlights'])

1. Dataset Structure:
DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

2. Column Names (Features):
['article', 'highlights', 'id']

3. First Example from Training Split:
Article (first 500 chars):
LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as s...

Highlights:
Har

In [3]:


# Assuming 'dataset' is already loaded from Step 1

print("\n--- Step 2: Removing Duplicates ---")

# Function to remove duplicates from a single split
def remove_duplicates_from_split(split_name, data_split):
    initial_size = len(data_split)
    df = pd.DataFrame(data_split)
    # Define columns that constitute a unique example. For summarization, it's typically article and highlights.
    df.drop_duplicates(subset=['article', 'highlights'], inplace=True)
    cleaned_split = Dataset.from_pandas(df)
    print(f"Original {split_name} size: {initial_size}")
    print(f"{split_name} size after deduplication: {len(cleaned_split)}")
    return cleaned_split

# Apply to each split
cleaned_dataset = DatasetDict({
    'train': remove_duplicates_from_split('train', dataset['train']),
    'validation': remove_duplicates_from_split('validation', dataset['validation']),
    'test': remove_duplicates_from_split('test', dataset['test'])
})

# Verify the structure remains the same
print("\nCleaned Dataset Structure (after deduplication):")
print(cleaned_dataset)


--- Step 2: Removing Duplicates ---
Original train size: 287113
train size after deduplication: 284015
Original validation size: 13368
validation size after deduplication: 13368
Original test size: 11490
test size after deduplication: 11488

Cleaned Dataset Structure (after deduplication):
DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id', '__index_level_0__'],
        num_rows: 284015
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id', '__index_level_0__'],
        num_rows: 11488
    })
})


In [4]:
# Assuming 'cleaned_dataset' is the result from Step 2

print("\n--- Step 3: Handling Missing/Empty Values ---")

def remove_empty_examples(example):
    # Check if both 'article' and 'highlights' exist and are non-empty strings
    return bool(example['article'] and example['highlights'] and
                isinstance(example['article'], str) and isinstance(example['highlights'], str))

# Apply this filter to all splits
cleaned_dataset = cleaned_dataset.filter(remove_empty_examples)

print(f"Train size after removing empty examples: {len(cleaned_dataset['train'])}")
print(f"Validation size after removing empty examples: {len(cleaned_dataset['validation'])}")
print(f"Test size after removing empty examples: {len(cleaned_dataset['test'])}")

print("\nCleaned Dataset Structure (after empty examples removal):")
print(cleaned_dataset)


--- Step 3: Handling Missing/Empty Values ---


Filter: 100%|████████████████| 284015/284015 [00:01<00:00, 201662.89 examples/s]
Filter: 100%|██████████████████| 13368/13368 [00:00<00:00, 227052.40 examples/s]
Filter: 100%|██████████████████| 11488/11488 [00:00<00:00, 203854.90 examples/s]

Train size after removing empty examples: 284015
Validation size after removing empty examples: 13368
Test size after removing empty examples: 11488

Cleaned Dataset Structure (after empty examples removal):
DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id', '__index_level_0__'],
        num_rows: 284015
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id', '__index_level_0__'],
        num_rows: 11488
    })
})





In [5]:
import re

# Assuming 'cleaned_dataset' is the result from Step 3

print("\n--- Step 4: Basic Text Normalization ---")

def normalize_text_fields(examples):
    # Process 'article'
    normalized_articles = []
    for text in examples['article']:
        text = re.sub(r'\s+', ' ', text) # Replace multiple spaces with a single space
        text = text.strip()              # Remove leading/trailing whitespace
        normalized_articles.append(text)

    # Process 'highlights'
    normalized_highlights = []
    for text in examples['highlights']:
        text = re.sub(r'\s+', ' ', text) # Replace multiple spaces with a single space
        text = text.strip()              # Remove leading/trailing whitespace
        normalized_highlights.append(text)

    return {'article': normalized_articles, 'highlights': normalized_highlights}

# Apply this normalization to all splits
cleaned_dataset = cleaned_dataset.map(normalize_text_fields, batched=True)

print(f"Example from train split after normalization (first 500 chars):")
print("Article:", cleaned_dataset['train'][0]['article'][:500] + "...")
print("Highlights:", cleaned_dataset['train'][0]['highlights'])


--- Step 4: Basic Text Normalization ---


Map: 100%|█████████████████████| 284015/284015 [00:38<00:00, 7438.00 examples/s]
Map: 100%|███████████████████████| 13368/13368 [00:01<00:00, 7698.40 examples/s]
Map: 100%|███████████████████████| 11488/11488 [00:01<00:00, 7445.19 examples/s]

Example from train split after normalization (first 500 chars):
Article: LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as s...
Highlights: Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday . Young actor says he has no plans to fritter his cash away . Radcliffe's earnings from first five Potter films have been held in trust fund .





In [9]:
from transformers import AutoTokenizer
from datasets import DatasetDict # Ensure DatasetDict is imported if you're using it

# Assuming 'cleaned_dataset' is the result from Step 4

print("\n--- Step 5: Tokenization ---")

# Choose your pre-trained model checkpoint.
# This choice dictates the tokenizer and the model architecture you'll use for fine-tuning.
# For summarization, common choices are "facebook/bart-large-cnn", "t5-base", or "google/pegasus-cnn_dailymail"
model_checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Define maximum input and target lengths.
# These are typically derived from the model's architecture limits and dataset statistics.
# For BART/T5, max_input_length is often 1024. For CNN/DM highlights, 128-150 is usually sufficient.
max_input_length = 1024
max_target_length = 128

def tokenize_function(examples):
    # Tokenize the input articles
    # truncation=True: cuts off text longer than max_input_length
    # padding="max_length": pads shorter texts to max_input_length with zeros
    model_inputs = tokenizer(
        examples["article"],
        max_length=max_input_length,
        truncation=True,
        padding="max_length"
    )

    # Tokenize the target summaries (highlights)
    # The 'labels' key is specifically expected by the Hugging Face Trainer for sequence-to-sequence models.
    labels = tokenizer(
        examples["highlights"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length"
    )

    # Assign the tokenized labels (input_ids) to the 'labels' key in model_inputs
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

# Apply the tokenization function to all splits of your dataset.
# 'remove_columns' removes the original text columns to save memory, as they are no longer needed.
tokenized_dataset = cleaned_dataset.map(
    tokenize_function,
    batched=True,
    # num_proc=4, # Keep this commented out or set to 1 for now to avoid the multiprocessing error
    # num_proc=1, # Explicitly use a single process for debugging/stability
    # --- CRITICAL CHANGE HERE ---
    # Explicitly list all columns to remove, including the pandas index column
    remove_columns=['article', 'highlights', 'id', '__index_level_0__']
)

print("\nTokenized Dataset Structure:")
print(tokenized_dataset)
print("\nFirst Tokenized Example (training split - showing input_ids and labels):")
print("Article Input IDs (first 20):", tokenized_dataset['train'][0]['input_ids'][:20])
print("Article Attention Mask (first 20):", tokenized_dataset['train'][0]['attention_mask'][:20])
print("Highlights Labels (first 20):", tokenized_dataset['train'][0]['labels'][:20])



--- Step 5: Tokenization ---


Map: 100%|██████████████████████| 284015/284015 [07:06<00:00, 665.49 examples/s]


ValueError: Column to remove ['__index_level_0__'] not in the dataset. Current columns in the dataset: ['article', 'highlights', 'id']

In [10]:
from datasets import load_dataset, Dataset, DatasetDict # Ensure Dataset, DatasetDict are imported
import pandas as pd
from transformers import AutoTokenizer # Added AutoTokenizer import here for clarity

# --- Step 4: Load and Deduplicate Dataset ---
# Load the CNN/DailyMail dataset (using small samples for debugging)
print("Loading CNN/DailyMail dataset (small samples for debugging)...")

# Load a very small subset for faster debugging
train_dataset_small = load_dataset("cnn_dailymail", "3.0.0", split="train[:1000]")
validation_dataset_small = load_dataset("cnn_dailymail", "3.0.0", split="validation[:100]")
test_dataset_small = load_dataset("cnn_dailymail", "3.0.0", split="test[:100]")

# Combine into a DatasetDict
cleaned_dataset = DatasetDict({
    'train': train_dataset_small,
    'validation': validation_dataset_small,
    'test': test_dataset_small
})

print("Initial small dataset samples loaded successfully!")
print(f"Train split size: {len(cleaned_dataset['train'])}")
print(f"Validation split size: {len(cleaned_dataset['validation'])}")
print(f"Test split size: {len(cleaned_dataset['test'])}")


# Function to remove duplicates from a dataset split
def remove_duplicates_from_split(split_name, split_dataset):
    initial_size = len(split_dataset)
    # Convert to pandas DataFrame for easy deduplication
    df = split_dataset.to_pandas()
    # Define columns that constitute a unique example. For summarization, it's typically article and highlights.
    df.drop_duplicates(subset=['article', 'highlights'], inplace=True)
    # CRITICAL CHANGE: Reset index and drop the old index column to prevent __index_level_0__
    df = df.reset_index(drop=True)
    # Convert back to Hugging Face Dataset
    cleaned_split = Dataset.from_pandas(df)
    print(f"Original {split_name} size: {initial_size}")
    print(f"{split_name} size after deduplication: {len(cleaned_split)}")
    return cleaned_split

print("\nStarting deduplication process on small samples...")
# Apply deduplication to each split
cleaned_dataset['train'] = remove_duplicates_from_split('train', cleaned_dataset['train'])
cleaned_dataset['validation'] = remove_duplicates_from_split('validation', cleaned_dataset['validation'])
cleaned_dataset['test'] = remove_duplicates_from_split('test', cleaned_dataset['test'])

# Verify the structure remains the same
print("\nCleaned Dataset Structure (after deduplication):")
print(cleaned_dataset)


# --- Step 5: Tokenization ---
print("\n--- Step 5: Tokenization ---")

# Choose your pre-trained model checkpoint.
# For summarization, common choices are "facebook/bart-large-cnn", "t5-base", or "google/pegasus-cnn_dailymail"
model_checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Define maximum input and target lengths.
max_input_length = 1024
max_target_length = 128

def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["article"],
        max_length=max_input_length,
        truncation=True,
        padding="max_length"
    )

    labels = tokenizer(
        examples["highlights"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length"
    )

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

# Apply the tokenization function to all splits of your dataset.
# 'remove_columns' removes the original text columns to save memory, as they are no longer needed.
# Now, we only remove the original 'article', 'highlights', 'id' columns.
tokenized_dataset = cleaned_dataset.map(
    tokenize_function,
    batched=True,
    # num_proc=4, # Keep this commented out or set to 1 for now to avoid multiprocessing errors
    # num_proc=1, # Explicitly use a single process for debugging/stability
    remove_columns=['article', 'highlights', 'id'] # Removed '__index_level_0__' as it's no longer created
)

print("\nTokenized Dataset Structure:")
print(tokenized_dataset)
print("\nFirst Tokenized Example (training split - showing input_ids and labels):")
print("Article Input IDs (first 20):", tokenized_dataset['train'][0]['input_ids'][:20])
print("Article Attention Mask (first 20):", tokenized_dataset['train'][0]['attention_mask'][:20])
print("Highlights Labels (first 20):", tokenized_dataset['train'][0]['labels'][:20])


Loading CNN/DailyMail dataset (small samples for debugging)...
Initial small dataset samples loaded successfully!
Train split size: 1000
Validation split size: 100
Test split size: 100

Starting deduplication process on small samples...
Original train size: 1000
train size after deduplication: 959
Original validation size: 100
validation size after deduplication: 100
Original test size: 100
test size after deduplication: 100

Cleaned Dataset Structure (after deduplication):
DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 959
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 100
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 100
    })
})

--- Step 5: Tokenization ---


Map: 100%|████████████████████████████| 959/959 [00:01<00:00, 759.15 examples/s]
Map: 100%|████████████████████████████| 100/100 [00:00<00:00, 702.69 examples/s]
Map: 100%|████████████████████████████| 100/100 [00:00<00:00, 698.00 examples/s]


Tokenized Dataset Structure:
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 959
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})

First Tokenized Example (training split - showing input_ids and labels):
Article Input IDs (first 20): [0, 574, 4524, 6, 1156, 36, 1251, 43, 480, 3268, 10997, 999, 3028, 7312, 20152, 3077, 899, 7, 10, 431]
Article Attention Mask (first 20): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Highlights Labels (first 20): [0, 29345, 10997, 999, 3028, 7312, 20152, 1516, 984, 844, 448, 13016, 25, 37, 4072, 504, 302, 479, 50118, 22138]





In [13]:
import sys
import os
import torch
import numpy as np
import evaluate # For ROUGE metric
from datasets import load_dataset, Dataset, DatasetDict # Ensure Dataset, DatasetDict are imported
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM, # For sequence-to-sequence models like BART
    DataCollatorForSeq2Seq, # For dynamic padding of batches
    Seq2SeqTrainingArguments, # Use Seq2SeqTrainingArguments for seq2seq models
    Seq2SeqTrainer, # Use Seq2SeqTrainer for seq2seq models
    pipeline
)
import collections
import matplotlib.pyplot as plt
import seaborn as sns


ModuleNotFoundError: No module named 'evaluate'

In [12]:

#Cell 4: Load Model, Define Data Collator, and Metrics
print("\n--- Step 6: Load Model, Data Collator, and Metrics ---")

# Load the pre-trained BART model for sequence-to-sequence (Seq2Seq) tasks
# 'from_pretrained' will download the weights if not cached.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model loaded and moved to: {device}")

# Data Collator: Dynamically pads input sequences and labels to the longest sequence in the batch.
# This is more efficient than static padding to max_length for all examples.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
print("Data Collator initialized.")

# Load the ROUGE metric from the 'evaluate' library
# ROUGE (Recall-Oriented Understudy for Gisting Evaluation) is standard for summarization.
metric = evaluate.load("rouge")
print("ROUGE metric loaded.")

# Define the compute_metrics function for evaluation during training
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Decode predictions to text, skipping special tokens
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in labels as we can't decode them.
    # -100 is used by the Trainer to ignore padding tokens in loss calculation.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode labels to text, skipping special tokens
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Add a bit of post-processing to the results (for easier readability)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    return result

print("Compute metrics function defined.")



--- Step 6: Load Model, Data Collator, and Metrics ---


NameError: name 'torch' is not defined

In [None]:

# Cell 5: Define Training Arguments and Trainer
print("\n--- Step 7: Define Training Arguments and Trainer ---")

# Define Training Arguments for Seq2Seq models
training_args = Seq2SeqTrainingArguments(
    output_dir="./summarization_results", # Directory to save model checkpoints and logs
    num_train_epochs=3, # Number of training epochs (can increase for full dataset)
    per_device_train_batch_size=4, # Reduced for 6GB GPU (effective batch size 4 * 2 = 8 with grad accumulation)
    gradient_accumulation_steps=2, # Accumulate gradients over 2 batches
    per_device_eval_batch_size=4, # Batch size for evaluation
    learning_rate=2e-5, # Learning rate
    weight_decay=0.01, # L2 regularization
    fp16=torch.cuda.is_available(), # Enable mixed precision if GPU is available
    evaluation_strategy="epoch", # Evaluate at the end of each epoch
    save_strategy="epoch", # Save checkpoint at the end of each epoch
    load_best_model_at_end=True, # Load the best model based on evaluation metric
    metric_for_best_model="eval_loss", # Use validation loss to determine best model initially
    greater_is_better=False, # Lower loss is better
    predict_with_generate=True, # Crucial for summarization: generate text during evaluation
    logging_dir="./summarization_logs", # Directory for logs
    logging_steps=50, # Log training loss every 50 steps
    report_to="none", # Prevent external reporting
    # Generation parameters for evaluation (can be tuned)
    generation_max_length=max_target_length,
    generation_num_beams=4, # Number of beams for beam search decoding
)

# Initialize the Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics, # Use the ROUGE metric
)
print("Trainer initialized.")


# Cell 6: Execute Training
print("\n--- Step 8: Execute Training ---")
print("Starting fine-tuning of BART for Summarization...")
trainer.train()
print("Fine-tuning complete.")