In [None]:
# ============================================================================
# PEGASUS CNN/DAILYMAIL MODEL FINE-TUNING CONFIGURATION
# ============================================================================
# This notebook fine-tunes a Pegasus model pre-trained on CNN/DailyMail dataset.
# The CNN/DailyMail variant is optimized for news article summarization.
#
# Configuration Parameters:
# - MODEL: Pre-trained Pegasus model (google/pegasus-cnn_dailymail)
# - OUT_DIR: Directory to save model checkpoints and results
# - DRIVE_DATA_PATH: Path to dataset in Google Drive (UPDATE THIS!)
# - CLEAN_TEXT_COLUMN: Column name in CSV containing article text
# - SUMMARY_COLUMN: Column name in CSV containing reference summaries
# ============================================================================


# Hugging Face checkpoint to fine-tune.
MODEL = "google/pegasus-cnn_dailymail"
# Checkpoint / result directory.
OUT_DIR = "pegasus/50k_samples"
# Folder on the mounted Drive that holds train.csv / val.csv.
DRIVE_DATA_PATH = "/content/drive/MyDrive/processed/50k_samples_new"
# CSV column names for the source text and the reference summary.
CLEAN_TEXT_COLUMN = "article"
SUMMARY_COLUMN = "highlights"

In [None]:
# ============================================================================
# OPTIONAL: GOOGLE CLOUD STORAGE (GCS) SETUP
# ============================================================================
# This cell is optional - only needed if you want to save/load models from GCS.
# If you're only using Google Drive, you can skip this cell.
#
# This sets up gcsfuse to mount a Google Cloud Storage bucket for model storage.
# ============================================================================

from google.colab import auth
auth.authenticate_user()

# Install gcsfuse
!echo "deb http://packages.cloud.google.com/apt gcsfuse-bionic main" > /etc/apt/sources.list.d/gcsfuse.list
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
!apt -qq update
!apt -qq install gcsfuse


# Create a local directory for mounting
!mkdir pegasus

# Mount the GCS bucket
!gcsfuse --implicit-dirs pegasus_cnn_daily_mail_50k_3rd_run pegasus

In [None]:
# ============================================================================
# MOUNT GOOGLE DRIVE
# ============================================================================
# This cell mounts your Google Drive to access your dataset files.
# You'll be prompted to authorize access - follow the instructions.
# ============================================================================

from google.colab import drive
# Mount point must match the "/content/drive" prefix used by DRIVE_DATA_PATH.
drive.mount('/content/drive')

In [None]:
!pip install -U transformers
!pip install -U datasets
!pip install tensorboard
!pip install sentencepiece
!pip install accelerate
!pip install evaluate
!pip install rouge_score
!pip install tqdm

In [None]:
import torch
import pprint
import evaluate
import numpy as np

from transformers import (
    PegasusForConditionalGeneration,
    PegasusTokenizer,EarlyStoppingCallback,
    Seq2SeqTrainingArguments, Seq2SeqTrainer
)

# Module-level pretty-printer for ad-hoc inspection; not used by the cells shown here.
pp = pprint.PrettyPrinter()


In [None]:

def prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset=None, freeze_encoder=False, output_dir=None):
  """
  Prepare configurations and base model for fine-tuning.

  Args:
    model_name: Hugging Face model id or local path of the Pegasus checkpoint.
    tokenizer: Tokenizer matching ``model_name``; handed to the trainer.
    train_dataset: Dataset used for training.
    val_dataset: Optional validation dataset. When ``None``, per-epoch
      evaluation, early stopping and best-model reloading are disabled,
      because all three require an evaluation set.
    freeze_encoder: If True, encoder parameters are excluded from training.
    output_dir: Where checkpoints are written. ``None`` (default) falls back
      to the notebook-level ``OUT_DIR``.

  Returns:
    A configured ``Seq2SeqTrainer`` (training is not started here).
  """
  import inspect

  torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
  model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

  if freeze_encoder:
    for param in model.model.encoder.parameters():
      param.requires_grad = False

  has_validation = val_dataset is not None

  # The notebook installs transformers with -U, and newer releases renamed
  # `evaluation_strategy` -> `eval_strategy` (TrainingArguments) and
  # `tokenizer` -> `processing_class` (Trainer). Pick whichever keyword the
  # installed version accepts instead of hard-coding one spelling.
  args_fields = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
  eval_strategy_key = "eval_strategy" if "eval_strategy" in args_fields else "evaluation_strategy"
  trainer_fields = inspect.signature(Seq2SeqTrainer.__init__).parameters
  tokenizer_key = "processing_class" if "processing_class" in trainer_fields else "tokenizer"

  # Training configuration with validation
  # (setup used for the 50K-sample run)
  training_args = Seq2SeqTrainingArguments(
    output_dir=OUT_DIR if output_dir is None else output_dir,  # Where to save checkpoints and outputs
    num_train_epochs=10,               # Total number of training epochs

    # Batch sizes (adjusted for dataset size and GPU memory)
    per_device_train_batch_size=16,    # Training batch size per GPU
    per_device_eval_batch_size=16,     # Evaluation batch size per GPU

    # Checkpointing
    save_total_limit=10,               # Keep only the most recent checkpoints
    save_strategy="epoch",             # Save model at the end of each epoch

    # Learning rate and optimization
    learning_rate=2e-5,                # Learning rate for fine-tuning
    warmup_ratio=0.1,                  # Warmup ratio (10% of training steps)
    weight_decay=0.01,                 # Weight decay
    gradient_accumulation_steps=8,     # Effective batch size = 16 x 8 = 128 per device

    # Logging and evaluation
    logging_dir="./pegasus/logs",      # TensorBoard log directory
    logging_strategy="epoch",          # Log at the end of each epoch
    report_to="tensorboard",           # Log metrics to TensorBoard

    # Generation settings (used during evaluation)
    predict_with_generate=True,        # Use text generation during evaluation

    # Performance optimizations (modern GPUs, e.g. A100)
    bf16=True,                         # Use bfloat16 precision
    fp16=False,                        # Disable fp16
    tf32=True,                         # Enable TensorFloat-32 on Ampere GPUs
    torch_compile=True,                # Enable PyTorch compilation

    # Model selection
    metric_for_best_model="eval_loss", # Select best model using validation loss
    greater_is_better=False,           # Lower loss is better
    load_best_model_at_end=has_validation,  # Needs an eval set to pick the best checkpoint

    # Evaluate after each epoch only when there is something to evaluate on
    **{eval_strategy_key: "epoch" if has_validation else "no"},
  )

  # Early stopping watches the evaluation metric, so it is only meaningful
  # (and only accepted by the trainer) when evaluation is enabled.
  callbacks = [EarlyStoppingCallback(early_stopping_patience=3)] if has_validation else []

  trainer = Seq2SeqTrainer(
    model=model,                      # Pretrained transformer model to be fine-tuned
    args=training_args,               # Training arguments defined above
    train_dataset=train_dataset,      # Dataset used for training
    eval_dataset=val_dataset,         # Dataset used for validation / evaluation
    compute_metrics=compute_metrics,  # Function to compute evaluation metrics
    callbacks=callbacks,
    **{tokenizer_key: tokenizer},     # Tokenizer used for encoding/decoding text
  )
  return trainer

In [None]:
from torch.utils.tensorboard import SummaryWriter
from transformers import TrainerCallback
import time
# Shared TensorBoard writer used by GpuLoggerCallback and compute_metrics.
# It writes event files under OUT_DIR, separate from the Trainer's own
# logging_dir ("./pegasus/logs").
writer = SummaryWriter(log_dir=OUT_DIR)

class PegasusDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized inputs with tokenized target summaries.

    Args:
        encodings: Mapping of field name -> per-example sequences for the
            source texts; every field is converted to a tensor per item.
        labels: Mapping whose ``'input_ids'`` entry holds the per-example
            target token ids.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, with targets under ``'labels'``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        """Number of examples, taken from the target ``'input_ids'``."""
        return len(self.labels['input_ids'])

    def on_step_end(self, args, state, control, **kwargs):
        """Pass-through kept only so the class surface is unchanged.

        This is a callback-style hook that ended up on the dataset; nothing in
        this notebook calls it. The unused GPU-memory computation it used to
        perform has been removed. GPU logging lives in ``GpuLoggerCallback``.
        """
        return control

class GpuLoggerCallback(TrainerCallback):
    """Trainer callback that records allocated CUDA memory after every step.

    Args:
        writer: Object exposing ``add_scalar(tag, value, step)``; the scalar
            is written under the tag ``"gpu_memory_gb"``.
    """

    def __init__(self, writer):
        self.writer = writer

    def on_step_end(self, args, state, control, **kwargs):
        """Log currently allocated CUDA memory (bytes / 1024**3) at the global step."""
        # Nothing to report without CUDA; hand the control object straight back.
        if not torch.cuda.is_available():
            return control

        allocated = torch.cuda.memory_allocated() / 1024 ** 3
        self.writer.add_scalar("gpu_memory_gb", allocated, state.global_step)
        return control


def prepare_data(model_name,
                 train_texts, train_labels,
                 val_texts=None, val_labels=None,
                 test_texts=None, test_labels=None):
  """
  Prepare input data for model fine-tuning.

  Args:
    model_name: Model id / path used to load the ``PegasusTokenizer``.
    train_texts, train_labels: Source texts and reference summaries for training.
    val_texts, val_labels: Optional validation pair; both must be given.
    test_texts, test_labels: Optional test pair; both must be given.

  Returns:
    ``(train_dataset, val_dataset, test_dataset, tokenizer)``. A split is
    ``None`` when either its texts or its labels were not provided.
  """
  tokenizer = PegasusTokenizer.from_pretrained(model_name)

  prepare_val = val_texts is not None and val_labels is not None
  prepare_test = test_texts is not None and test_labels is not None

  def tokenize_data(texts, labels):
    # list(...) lets callers pass any sequence-like column, not only a list.
    encodings = tokenizer(list(texts), truncation=True, padding=True)
    decodings = tokenizer(list(labels), truncation=True, padding=True)

    # Mask label padding with -100 so the loss ignores it. Leaving the pad id
    # in place makes the model train on (and eval_loss be dominated by)
    # padding positions.
    pad_id = tokenizer.pad_token_id
    masked_labels = {
      'input_ids': [
        [-100 if token == pad_id else token for token in sequence]
        for sequence in decodings['input_ids']
      ]
    }
    return PegasusDataset(encodings, masked_labels)

  train_dataset = tokenize_data(train_texts, train_labels)
  val_dataset = tokenize_data(val_texts, val_labels) if prepare_val else None
  test_dataset = tokenize_data(test_texts, test_labels) if prepare_test else None

  return train_dataset, val_dataset, test_dataset, tokenizer

def compute_metrics(eval_pred):
    """Compute ROUGE scores and mean generated length for one evaluation pass.

    Relies on the notebook-level ``tokenizer``, ``writer`` and ``trainer``
    objects being defined before evaluation runs.

    Args:
        eval_pred: Object with ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, possibly containing -100).

    Returns:
        dict with ``rouge1``, ``rouge2``, ``rougeL``, ``rougeLsum`` and
        ``gen_len``. The same values are also written to TensorBoard under
        ``eval/<name>``.
    """
    rouge = evaluate.load("rouge")
    predictions, labels = eval_pred.predictions, eval_pred.label_ids

    # Some configurations return a tuple whose first element holds the token ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    # -100 is a padding/ignore marker, not a vocabulary id: map it back to the
    # pad token in BOTH arrays before decoding, otherwise batch_decode can fail.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE
    rouge_result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        rouge_types=["rouge1", "rouge2", "rougeL", "rougeLsum"]
    )

    # Average number of non-pad tokens per generated sequence.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    rouge_result["gen_len"] = np.mean(prediction_lens)

    # Log to TensorBoard at the trainer's current global step.
    for k, v in rouge_result.items():
        writer.add_scalar(f"eval/{k}", v, trainer.state.global_step)

    # The trainer expects a plain dict of metric name -> value.
    return dict(rouge_result)

In [None]:
from datasets import Dataset
import pandas as pd

print("Loading data...")

# Rows missing either the article or the summary would reach the tokenizer as
# NaN floats and fail there, so drop them up front. head() is applied first so
# the same leading rows are selected as before.
text_columns = [CLEAN_TEXT_COLUMN, SUMMARY_COLUMN]
train_df = (
    pd.read_csv(f"{DRIVE_DATA_PATH}/train.csv")
    .head(50000)
    .dropna(subset=text_columns)
    .reset_index(drop=True)
)
val_df = (
    pd.read_csv(f"{DRIVE_DATA_PATH}/val.csv")
    .head(10000)
    .dropna(subset=text_columns)
    .reset_index(drop=True)
)

# Kept for any later cell that works with the datasets.Dataset objects.
dataset_train = Dataset.from_pandas(train_df)
dataset_valid = Dataset.from_pandas(val_df)

# Take the texts straight from the DataFrames as plain lists of str, which is
# what the tokenizer call in prepare_data expects.
train_texts = train_df[CLEAN_TEXT_COLUMN].astype(str).tolist()
train_labels = train_df[SUMMARY_COLUMN].astype(str).tolist()
valid_texts = val_df[CLEAN_TEXT_COLUMN].astype(str).tolist()
valid_labels = val_df[SUMMARY_COLUMN].astype(str).tolist()

print("Train:", len(train_texts))
print("Val:", len(valid_texts))


train_dataset, val_dataset, test_dataset, tokenizer = prepare_data(MODEL, train_texts, train_labels, valid_texts, valid_labels)



In [None]:

# Build the trainer and attach per-step GPU memory logging before timing starts.
trainer = prepare_fine_tuning(MODEL, tokenizer, train_dataset, val_dataset=val_dataset)
trainer.add_callback(GpuLoggerCallback(writer))

start = time.time()
trainer.train()
end = time.time()

# Make sure scalars buffered by the shared SummaryWriter reach disk.
writer.flush()

# Elapsed wall-clock seconds (end - start; the reverse order printed a negative value).
print(f'time:  {end - start}')
