In [1]:
!pip install --upgrade fsspec datasets

Collecting fsspec
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)


In [4]:
from datasets import load_dataset
import pandas as pd

In [5]:
from datasets import load_dataset

# Load the dataset (replace 'cnn_dailymail', '3.0.0' with your chosen dataset if different)
dataset = load_dataset('cnn_dailymail', '3.0.0')

print("1. Dataset Structure:")
print(dataset) # Shows splits, number of rows, and features

print("\n2. Column Names (Features):")
print(dataset['train'].column_names) # Shows the names of columns (e.g., 'article', 'highlights', 'id')

print("\n3. First Example from Training Split:")
# Access the first example. Use slicing for long texts to avoid overwhelming output.
first_example = dataset['train'][0]
print("Article (first 500 chars):")
print(first_example['article'][:500] + "...")
print("\nHighlights:")
print(first_example['highlights'])
print("\nID:", first_example['id'])

print("\n4. Another Example (e.g., 5th example from validation split):")
second_example = dataset['validation'][4] # Using index 4 for the 5th example
print("Article (first 500 chars):")
print(second_example['article'][:500] + "...")
print("\nHighlights:")
print(second_example['highlights'])

Generating train split: 100%|█| 287113/287113 [00:03<00:00, 92089.54 examples/s]
Generating validation split: 100%|█| 13368/13368 [00:00<00:00, 112869.63 example
Generating test split: 100%|███| 11490/11490 [00:00<00:00, 126943.77 examples/s]

1. Dataset Structure:
DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

2. Column Names (Features):
['article', 'highlights', 'id']

3. First Example from Training Split:
Article (first 500 chars):
LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as s...

Highlights:
Har




In [6]:


# Assuming 'dataset' is already loaded from Step 1

print("\n--- Step 2: Removing Duplicates ---")

# Function to remove duplicates from a single split
def remove_duplicates_from_split(split_name, data_split):
    initial_size = len(data_split)
    df = pd.DataFrame(data_split)
    # Define columns that constitute a unique example. For summarization, it's typically article and highlights.
    df.drop_duplicates(subset=['article', 'highlights'], inplace=True)
    cleaned_split = Dataset.from_pandas(df)
    print(f"Original {split_name} size: {initial_size}")
    print(f"{split_name} size after deduplication: {len(cleaned_split)}")
    return cleaned_split

# Apply to each split
cleaned_dataset = DatasetDict({
    'train': remove_duplicates_from_split('train', dataset['train']),
    'validation': remove_duplicates_from_split('validation', dataset['validation']),
    'test': remove_duplicates_from_split('test', dataset['test'])
})

# Verify the structure remains the same
print("\nCleaned Dataset Structure (after deduplication):")
print(cleaned_dataset)


--- Step 2: Removing Duplicates ---


NameError: name 'DatasetDict' is not defined

In [None]:
# Assuming 'cleaned_dataset' is the result from Step 2

print("\n--- Step 3: Handling Missing/Empty Values ---")

def remove_empty_examples(example):
    # Check if both 'article' and 'highlights' exist and are non-empty strings
    return bool(example['article'] and example['highlights'] and
                isinstance(example['article'], str) and isinstance(example['highlights'], str))

# Apply this filter to all splits
cleaned_dataset = cleaned_dataset.filter(remove_empty_examples)

print(f"Train size after removing empty examples: {len(cleaned_dataset['train'])}")
print(f"Validation size after removing empty examples: {len(cleaned_dataset['validation'])}")
print(f"Test size after removing empty examples: {len(cleaned_dataset['test'])}")

print("\nCleaned Dataset Structure (after empty examples removal):")
print(cleaned_dataset)

In [None]:
import re

# Assuming 'cleaned_dataset' is the result from Step 3

print("\n--- Step 4: Basic Text Normalization ---")

def normalize_text_fields(examples):
    # Process 'article'
    normalized_articles = []
    for text in examples['article']:
        text = re.sub(r'\s+', ' ', text) # Replace multiple spaces with a single space
        text = text.strip()              # Remove leading/trailing whitespace
        normalized_articles.append(text)

    # Process 'highlights'
    normalized_highlights = []
    for text in examples['highlights']:
        text = re.sub(r'\s+', ' ', text) # Replace multiple spaces with a single space
        text = text.strip()              # Remove leading/trailing whitespace
        normalized_highlights.append(text)

    return {'article': normalized_articles, 'highlights': normalized_highlights}

# Apply this normalization to all splits
cleaned_dataset = cleaned_dataset.map(normalize_text_fields, batched=True)

print(f"Example from train split after normalization (first 500 chars):")
print("Article:", cleaned_dataset['train'][0]['article'][:500] + "...")
print("Highlights:", cleaned_dataset['train'][0]['highlights'])

In [None]:
from transformers import AutoTokenizer

# Assuming 'cleaned_dataset' is the result from Step 4

print("\n--- Step 5: Tokenization ---")

# Choose your pre-trained model checkpoint.
# This choice dictates the tokenizer and the model architecture you'll use for fine-tuning.
# For summarization, common choices are "facebook/bart-large-cnn", "t5-base", or "google/pegasus-cnn_dailymail"
model_checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Define maximum input and target lengths.
# These are typically derived from the model's architecture limits and dataset statistics.
# For BART/T5, max_input_length is often 1024. For CNN/DM highlights, 128-150 is usually sufficient.
max_input_length = 1024
max_target_length = 128

def tokenize_function(examples):
    # Tokenize the input articles
    # truncation=True: cuts off text longer than max_input_length
    # padding="max_length": pads shorter texts to max_input_length with zeros
    model_inputs = tokenizer(
        examples["article"],
        max_length=max_input_length,
        truncation=True,
        padding="max_length"
    )

    # Tokenize the target summaries (highlights)
    # The 'labels' key is specifically expected by the Hugging Face Trainer for sequence-to-sequence models.
    labels = tokenizer(
        examples["highlights"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length"
    )

    # Assign the tokenized labels (input_ids) to the 'labels' key in model_inputs
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

# Apply the tokenization function to all splits of your dataset.
# 'remove_columns' removes the original text columns to save memory, as they are no longer needed.
tokenized_dataset = cleaned_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4, # Use multiple processes for faster tokenization (adjust based on CPU cores)
    remove_columns=cleaned_dataset['train'].column_names # Removes 'article', 'highlights', 'id'
)

print("\nTokenized Dataset Structure:")
print(tokenized_dataset)
print("\nFirst Tokenized Example (training split - showing input_ids and labels):")
print("Article Input IDs (first 20):", tokenized_dataset['train'][0]['input_ids'][:20])
print("Article Attention Mask (first 20):", tokenized_dataset['train'][0]['attention_mask'][:20])
print("Highlights Labels (first 20):", tokenized_dataset['train'][0]['labels'][:20])