In [None]:
# ============================================================================
# PEGASUS MODEL FINE-TUNING CONFIGURATION
# ============================================================================
# This notebook fine-tunes a pre-trained Pegasus checkpoint for summarization.
# NOTE(review): an earlier version of this header named the
# google/pegasus-cnn_dailymail checkpoint, but MODEL below is set to
# 'google/pegasus-large'. Change MODEL if the CNN/DailyMail variant is wanted.
#
# Configuration Parameters:
# - MODEL: Pre-trained Pegasus checkpoint to fine-tune (currently google/pegasus-large)
# - OUT_DIR: Directory to save model checkpoints and results
# - DRIVE_DATA_PATH: Path to dataset in Google Drive (UPDATE THIS!)
# - CLEAN_TEXT_COLUMN: Column name in CSV containing article text
# - SUMMARY_COLUMN: Column name in CSV containing reference summaries
# ============================================================================

MODEL = 'google/pegasus-large'
OUT_DIR = 'pegasus/10k_samples'
DRIVE_DATA_PATH = "/content/drive/MyDrive/processed/10k_samples"
CLEAN_TEXT_COLUMN = 'article'  
SUMMARY_COLUMN = 'highlights'

In [None]:
# ============================================================================
# OPTIONAL: GOOGLE CLOUD STORAGE (GCS) SETUP
# ============================================================================
# This cell is optional - only needed if you want to save/load models from GCS.
# If you're only using Google Drive, you can skip this cell.
#
# This sets up gcsfuse to mount a Google Cloud Storage bucket for model storage.
# The bucket is mounted at ./pegasus, which is the first path component of
# OUT_DIR ('pegasus/10k_samples'), so anything written under OUT_DIR lands in
# the bucket when this cell has been run.
# ============================================================================

# Authenticate the Colab user before accessing GCS
from google.colab import auth
auth.authenticate_user()

# Install gcsfuse (Google Cloud Storage FUSE - allows mounting GCS buckets as filesystem)
# NOTE(review): the apt channel 'gcsfuse-bionic' is hard-coded; confirm it matches
# the distribution of the runtime image in use.
!echo "deb http://packages.cloud.google.com/apt gcsfuse-bionic main" > /etc/apt/sources.list.d/gcsfuse.list
# Register the repository signing key, then refresh the package index and install
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
!apt -qq update
!apt -qq install gcsfuse

# Create a local directory for mounting the GCS bucket
!mkdir -p pegasus

# Mount the GCS bucket (bucket name 'pegasus_large_10k_2nd' is hard-coded - UPDATE for your project)
!gcsfuse --implicit-dirs pegasus_large_10k_2nd pegasus

In [None]:
# ============================================================================
# MOUNT GOOGLE DRIVE
# ============================================================================
# This cell mounts your Google Drive to access your dataset files.
# You'll be prompted to authorize access - follow the instructions.
#
# The mount point must stay '/content/drive' because DRIVE_DATA_PATH in the
# configuration cell is an absolute path below '/content/drive/MyDrive'.
# ============================================================================

from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install -U transformers
!pip install -U datasets
!pip install tensorboard
!pip install sentencepiece
!pip install accelerate
!pip install evaluate
!pip install rouge_score
!pip install tqdm

In [None]:
import torch
import pprint
import evaluate
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.tensorboard import SummaryWriter

from transformers import (
    PegasusForConditionalGeneration,  Trainer, TrainingArguments,
    PegasusTokenizer,EarlyStoppingCallback,T5ForConditionalGeneration, T5Tokenizer,
    PegasusXForConditionalGeneration,
    Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
)

from datasets import load_dataset

# Shared pretty-printer instance for ad-hoc inspection (not referenced by the cells visible here)
pp = pprint.PrettyPrinter()


In [None]:

def prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset=None, freeze_encoder=False, output_dir=None):
  """
  Load the base Pegasus model and build a configured Seq2SeqTrainer.

  Args:
    model_name: Checkpoint name or path passed to
      PegasusForConditionalGeneration.from_pretrained.
    tokenizer: Tokenizer handed to the trainer.
    train_dataset: Dataset used for training.
    val_dataset: Optional dataset used for evaluation. When None, per-epoch
      evaluation, best-model reloading and early stopping are all disabled
      (each of them needs evaluation results to work).
    freeze_encoder: If True, encoder parameters are excluded from training.
    output_dir: Directory for checkpoints and logs. Defaults to the
      notebook-level OUT_DIR, which is what this function always used before
      the parameter was honoured.

  Returns:
    A Seq2SeqTrainer ready for `.train()`.
  """
  # Local imports keep this cell self-contained; both are standard library.
  import inspect
  import os

  if output_dir is None:
    output_dir = OUT_DIR

  cuda_available = torch.cuda.is_available()
  torch_device = 'cuda' if cuda_available else 'cpu'
  model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

  if freeze_encoder:
    for param in model.model.encoder.parameters():
      param.requires_grad = False

  # bf16 / tf32 are only requested when the hardware reports support, so the
  # notebook also runs on pre-Ampere GPUs or CPU instead of failing at
  # argument validation. On A100-class GPUs both stay enabled as before.
  use_bf16 = cuda_available and torch.cuda.is_bf16_supported()
  use_tf32 = cuda_available and torch.cuda.get_device_capability()[0] >= 8

  has_eval = val_dataset is not None

  args_kwargs = dict(
      output_dir=output_dir,              # Where to save checkpoints and outputs
      num_train_epochs=10,                # Total number of training epochs

      # Batch sizes (adjust based on GPU memory)
      per_device_train_batch_size=20,     # Training batch size per GPU
      per_device_eval_batch_size=32,      # Evaluation batch size per GPU

      # Checkpointing
      save_strategy="epoch",              # Save model at the end of each epoch

      # Learning rate and optimization
      learning_rate=1e-4,                 # Learning rate for fine-tuning
      warmup_ratio=0.1,                   # Warmup over first 10% of training steps
      weight_decay=0.01,                  # Weight decay
      gradient_accumulation_steps=8,      # Effective batch size per device = 20 x 8 = 160

      # Logging (a step interval would be ignored with the "epoch" strategy,
      # so none is set)
      logging_dir=os.path.join(output_dir, "logs"),  # TensorBoard log directory
      logging_strategy="epoch",           # Log at the end of each epoch
      report_to="tensorboard",            # Enable TensorBoard logging

      # Generation settings (used during evaluation)
      predict_with_generate=True,         # Generate summaries during evaluation

      # Performance optimizations
      bf16=use_bf16,                      # bfloat16 when supported
      fp16=False,                         # fp16 stays disabled
      tf32=use_tf32,                      # TensorFloat-32 on Ampere or newer GPUs
      torch_compile=True,                 # Enable PyTorch compilation

      # Best-model reloading requires evaluation results
      load_best_model_at_end=has_eval,
  )

  if has_eval:
    args_kwargs["metric_for_best_model"] = "eval_loss"  # Select best model using validation loss
    args_kwargs["greater_is_better"] = False            # Lower loss is better

  # Newer transformers releases renamed `evaluation_strategy` to
  # `eval_strategy`; use whichever name the installed version accepts.
  arg_names = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
  eval_key = "eval_strategy" if "eval_strategy" in arg_names else "evaluation_strategy"
  args_kwargs[eval_key] = "epoch" if has_eval else "no"

  training_args = Seq2SeqTrainingArguments(**args_kwargs)

  trainer_kwargs = dict(
      model=model,                      # Pretrained model to be fine-tuned
      args=training_args,               # Training arguments defined above
      train_dataset=train_dataset,      # Dataset used for training
      eval_dataset=val_dataset,         # Dataset used for validation / evaluation
      compute_metrics=compute_metrics,  # Function to compute evaluation metrics
  )

  # Same idea for the trainer: `tokenizer` was renamed to `processing_class`.
  trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
  tokenizer_key = "processing_class" if "processing_class" in trainer_params else "tokenizer"
  trainer_kwargs[tokenizer_key] = tokenizer

  if has_eval:
    # Stop training if validation loss does not improve for 3 evaluations
    trainer_kwargs["callbacks"] = [EarlyStoppingCallback(early_stopping_patience=3)]

  trainer = Seq2SeqTrainer(**trainer_kwargs)

  return trainer

In [None]:
# ============================================================================
# TENSORBOARD LOGGING, DATASET CLASSES, AND UTILITY FUNCTIONS
# ============================================================================
# This cell sets up TensorBoard logging, custom dataset class, GPU monitoring,
# data preparation functions, and evaluation metrics computation.
# ============================================================================

# SummaryWriter is also imported in the main import cell; `time` is not used
# by the code visible in this cell.
from torch.utils.tensorboard import SummaryWriter
from transformers import TrainerCallback
import time

# Initialize TensorBoard writer (logs will be saved to OUT_DIR).
# This module-level writer is read as a global by compute_metrics and is passed
# to GpuLoggerCallback at the call site. Its event files go to OUT_DIR itself,
# whereas the trainer is configured to log to the `logs` subdirectory of OUT_DIR.
writer = SummaryWriter(log_dir=OUT_DIR)
print(f"TensorBoard logs will be saved to: {OUT_DIR}")

class PegasusDataset(torch.utils.data.Dataset):
    """
    Map-style PyTorch dataset over pre-tokenized articles and summaries.

    Holds the tokenizer output for the inputs and for the targets and
    materialises one example at a time as tensors.
    """

    def __init__(self, encodings, labels):
        """
        Args:
            encodings: Mapping of feature name -> per-example sequences
                (every key present is forwarded to the model input).
            labels: Mapping whose 'input_ids' entry holds the per-example
                target token ids.
        """
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the target token id list."""
        return len(self.labels['input_ids'])

    def __getitem__(self, idx):
        """
        Build the example at position `idx`.

        Returns:
            Dict with one tensor per key in `encodings`, plus a 'labels'
            tensor holding the target token ids.
        """
        example = {}
        for feature_name, per_example_values in self.encodings.items():
            example[feature_name] = torch.tensor(per_example_values[idx])
        target_ids = self.labels['input_ids'][idx]
        example['labels'] = torch.tensor(target_ids)
        return example

class GpuLoggerCallback(TrainerCallback):
    """
    Trainer callback that records allocated GPU memory after every step.

    The value is written to the supplied TensorBoard writer under the tag
    "gpu_memory_gb", which makes it easy to spot when batch size or other
    memory-heavy settings need to be reduced.
    """

    def __init__(self, writer):
        # Writer that receives the scalar; must provide `add_scalar`.
        self.writer = writer

    def on_step_end(self, args, state, control, **kwargs):
        """
        Log the memory currently allocated by torch on the GPU, in GiB,
        against the trainer's global step. Does nothing without CUDA.
        """
        if not torch.cuda.is_available():
            return control

        allocated_gb = torch.cuda.memory_allocated() / (1024 ** 3)
        self.writer.add_scalar("gpu_memory_gb", allocated_gb, state.global_step)
        return control

def prepare_data(model_name,
                 train_texts, train_labels,
                 val_texts=None, val_labels=None,
                 test_texts=None, test_labels=None):
  """
  Tokenize articles and summaries into datasets ready for fine-tuning.

  Args:
    model_name: Name of the Pegasus model (used to load the tokenizer)
    train_texts: Sequence of training article texts
    train_labels: Sequence of training summary texts
    val_texts: Sequence of validation article texts (optional)
    val_labels: Sequence of validation summary texts (optional)
    test_texts: Sequence of test article texts (optional)
    test_labels: Sequence of test summary texts (optional)

  Returns:
    Tuple of (train_dataset, val_dataset, test_dataset, tokenizer).
    A split is None when either its texts or its labels were not provided.
  """
  # Load Pegasus tokenizer
  tokenizer = PegasusTokenizer.from_pretrained(model_name)

  def tokenize_data(texts, labels):
    """
    Tokenize one split and wrap it in a PegasusDataset.

    Padding positions in the target sequences are replaced with -100 so the
    loss ignores them; without this the model is also trained (and scored)
    on predicting padding after every summary.
    """
    # Coerce to plain lists: the tokenizer expects list input, and newer
    # `datasets` releases appear to return a column object rather than a
    # list when a Dataset is indexed by column name.
    texts, labels = list(texts), list(labels)

    # Tokenize inputs (articles)
    encodings = tokenizer(texts, truncation=True, padding=True)
    # Tokenize targets (summaries)
    decodings = tokenizer(labels, truncation=True, padding=True)

    # Use the attention mask (not the pad id itself) to find padding, so a
    # genuine token is never masked by accident.
    masked_label_ids = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(decodings['input_ids'], decodings['attention_mask'])
    ]

    # PegasusDataset only reads labels['input_ids']
    return PegasusDataset(encodings, {'input_ids': masked_label_ids})

  def tokenize_optional(texts, labels):
    """Tokenize a split only when both its texts and labels are provided."""
    if texts is None or labels is None:
      return None
    return tokenize_data(texts, labels)

  # Tokenize all datasets
  train_dataset = tokenize_data(train_texts, train_labels)
  val_dataset = tokenize_optional(val_texts, val_labels)
  test_dataset = tokenize_optional(test_texts, test_labels)

  return train_dataset, val_dataset, test_dataset, tokenizer

def compute_metrics(eval_pred):
    """
    Compute ROUGE metrics for model evaluation.

    Called by the trainer during evaluation. Decodes generated token ids and
    reference token ids to text, scores them with ROUGE, and mirrors the
    scores to the notebook-level TensorBoard `writer`.

    Relies on notebook globals: `tokenizer` and `writer` must exist; `trainer`
    is used for the current global step when it exists (step 0 otherwise).

    Args:
        eval_pred: Object exposing `predictions` and `label_ids`

    Returns:
        Dictionary of metric scores (ROUGE-1, ROUGE-2, ROUGE-L, ROUGE-Lsum, gen_len)
    """
    # Load ROUGE metric
    rouge = evaluate.load("rouge")

    # Extract predictions and labels
    predictions, labels = eval_pred.predictions, eval_pred.label_ids

    # Some model outputs arrive as a tuple; the generated ids come first
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored/padded positions and cannot be decoded. It can occur
    # in the predictions as well as the labels (batches of generated ids may
    # be padded with it when gathered), so both are mapped back to the pad id.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Convert token IDs back to text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE scores
    rouge_result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,  # Use stemming for better matching
        rouge_types=["rouge1", "rouge2", "rougeL", "rougeLsum"]  # Compute these ROUGE variants
    )

    # Average generated summary length, counted in non-padding tokens
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    rouge_result["gen_len"] = np.mean(prediction_lens)

    # Log metrics to TensorBoard. Look the trainer up defensively so the
    # function also works when no global named `trainer` has been created.
    active_trainer = globals().get("trainer")
    step = active_trainer.state.global_step if active_trainer is not None else 0
    for name, value in rouge_result.items():
        writer.add_scalar(f"eval/{name}", value, step)

    # Return metrics dictionary
    return dict(rouge_result)

In [None]:
from datasets import Dataset
import pandas as pd

# Load the train/validation CSVs from Drive, capped at 10,000 / 2,000 rows
print("Loading data...")
train_df = pd.read_csv(f"{DRIVE_DATA_PATH}/train.csv").head(10000)
val_df = pd.read_csv(f"{DRIVE_DATA_PATH}/val.csv").head(2000)


dataset_train = Dataset.from_pandas(train_df)
dataset_valid = Dataset.from_pandas(val_df)

# list(...) guarantees plain Python lists for the tokenizer regardless of what
# type the installed `datasets` version returns for column access.
train_texts, train_labels = list(dataset_train[CLEAN_TEXT_COLUMN]), list(dataset_train[SUMMARY_COLUMN])
valid_texts, valid_labels = list(dataset_valid[CLEAN_TEXT_COLUMN]), list(dataset_valid[SUMMARY_COLUMN])


print("Train:", len(train_texts))
print("Val:", len(valid_texts))


train_dataset, val_dataset, test_dataset, tokenizer = prepare_data(MODEL, train_texts, train_labels, valid_texts, valid_labels)
trainer = prepare_fine_tuning(MODEL, tokenizer, train_dataset, val_dataset=val_dataset, output_dir=OUT_DIR)

# The GPU logger must be registered BEFORE training starts; adding it after
# trainer.train() returns means it is never invoked.
trainer.add_callback(GpuLoggerCallback(writer))
trainer.train()

# Make sure buffered TensorBoard events are written to disk
writer.flush()
