In [None]:
# ============================================================================
# T5-BASE SUMMARIZATION FINE-TUNING CONFIGURATION
# ============================================================================
# This notebook fine-tunes a pre-trained T5 model (t5-base) for abstractive
# summarization, using article/summary pairs stored as CSV files.
#
# Configuration Parameters:
# - MODEL: Pre-trained checkpoint to fine-tune (t5-base)
# - OUT_DIR: Directory to save model checkpoints, logs and results
# - DRIVE_DATA_PATH: Path to dataset in Google Drive (UPDATE THIS!)
# - CLEAN_TEXT_COLUMN: Column name in CSV containing article text
# - SUMMARY_COLUMN: Column name in CSV containing reference summaries
# ============================================================================

# Checkpoint that is fine-tuned
MODEL = "t5-base"
# Where checkpoints, TensorBoard events and metric CSVs are written
OUT_DIR = "t5_base_50k_final/50k_samples"
# Folder on Google Drive holding train.csv / val.csv
DRIVE_DATA_PATH = "/content/drive/MyDrive/processed/50k_samples_new"
# CSV column with the input article text
CLEAN_TEXT_COLUMN = "article"
# CSV column with the reference summary
SUMMARY_COLUMN = "highlights"

In [None]:
# ============================================================================
# MOUNT GOOGLE DRIVE
# ============================================================================
# Mounts Google Drive at /content/drive so that the dataset files under
# DRIVE_DATA_PATH can be read.
# You'll be prompted to authorize access - follow the instructions.
# ============================================================================

from google.colab import drive
drive.mount('/content/drive')

In [None]:
# ============================================================================
# OPTIONAL: GOOGLE CLOUD STORAGE (GCS) SETUP
# ============================================================================
# This cell is optional - only needed if you want to save/load models from GCS.
# If you're only using Google Drive, you can skip this cell.
#
# This sets up gcsfuse to mount a Google Cloud Storage bucket for model storage.
# ============================================================================

from google.colab import auth
auth.authenticate_user()

# Install gcsfuse
!echo "deb http://packages.cloud.google.com/apt gcsfuse-bionic main" > /etc/apt/sources.list.d/gcsfuse.list
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
!apt -qq update
!apt -qq install gcsfuse

# Create a local directory for mounting
!mkdir t5_base_50k_final

# Mount the GCS bucket
!gcsfuse --implicit-dirs t5_base_50k_final t5_base_50k_final

In [None]:
!pip install -U transformers
!pip install -U datasets
!pip install tensorboard
!pip install sentencepiece
!pip install accelerate
!pip install evaluate
!pip install rouge_score
!pip install tqdm
!pip install tensorboard-data-server
!pip install tbparse

In [None]:
import torch
import pprint
import evaluate
import numpy as np
import pandas as pd

from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer,
    Seq2SeqTrainingArguments,
    EarlyStoppingCallback,
    T5Config
)
from datasets import load_dataset

# Shared pretty-printer instance for readable display of nested structures
pp = pprint.PrettyPrinter()


In [None]:
# Load the train/validation CSV splits from Google Drive
from datasets import Dataset

# Upper bound on the number of validation rows that are kept
MAX_VAL_ROWS = 8000


def _load_split(csv_name, max_rows=None):
    """Read one CSV split and drop rows lacking an article or a summary.

    Args:
        csv_name: File name inside DRIVE_DATA_PATH (e.g. "train.csv").
        max_rows: If given, keep only the first ``max_rows`` rows; this is
            applied before the missing-value filter.

    Returns:
        pandas.DataFrame without missing values in SUMMARY_COLUMN and
        CLEAN_TEXT_COLUMN.

    Raises:
        KeyError: If one of the configured columns is absent from the CSV.
    """
    frame = pd.read_csv(f"{DRIVE_DATA_PATH}/{csv_name}")
    if max_rows is not None:
        frame = frame.head(max_rows)

    # Use the configured column names instead of hard-coded ones, and fail
    # with an actionable message when they do not match the file.
    required = [SUMMARY_COLUMN, CLEAN_TEXT_COLUMN]
    missing = [col for col in required if col not in frame.columns]
    if missing:
        raise KeyError(
            f"{csv_name} is missing column(s) {missing}; "
            f"available columns: {list(frame.columns)}. "
            "Update CLEAN_TEXT_COLUMN / SUMMARY_COLUMN in the configuration cell."
        )
    return frame.dropna(subset=required)


print("Loading data...")
train_df = _load_split("train.csv")
val_df = _load_split("val.csv", max_rows=MAX_VAL_ROWS)
dataset_train = Dataset.from_pandas(train_df)
dataset_valid = Dataset.from_pandas(val_df)
print("Train:", len(train_df))
print("Val:", len(val_df))

In [None]:
# Use the GPU when one is available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer matching the checkpoint. It is loaded here because its
# special-token ids are needed below and no earlier cell defines it.
tokenizer = T5Tokenizer.from_pretrained(MODEL)

# 1. Load the default configuration of the checkpoint
config = T5Config.from_pretrained(MODEL)

# 2. Set the desired dropout rate
# Default for T5 is typically 0.1 (10%). To fight overfitting, you might increase it to 0.2 or 0.3.
# T5Config has a single `dropout_rate` that the T5 layers (attention included)
# read; there is no separate `attention_dropout_rate` field, so none is set.
NEW_DROPOUT_RATE = 0.2
config.dropout_rate = NEW_DROPOUT_RATE

print(f"New dropout rate set to: {config.dropout_rate}")

# 3. Initialize the model from the pre-trained weights with the modified configuration
model = T5ForConditionalGeneration.from_pretrained(
    MODEL,
    config=config
)

model.to(device)

# Align the model's special-token ids with the tokenizer
model.config.eos_token_id = tokenizer.eos_token_id
model.config.decoder_start_token_id = tokenizer.pad_token_id

# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

# ROUGE scorer used by compute_metrics
rouge = evaluate.load("rouge")


In [None]:
def preprocess_logits_for_metrics(logits, labels):
    """
    Reduce raw model outputs to predicted token ids before metrics are computed.

    Only the argmax ids over the last (vocabulary) dimension are kept, which
    avoids storing tensors that are not needed for the metrics.

    Args:
        logits: Either the logits tensor itself, or a tuple/list whose first
            element is the logits tensor (additional model outputs are ignored).
        labels: Label ids; passed through unchanged.

    Returns:
        Tuple ``(pred_ids, labels)``; ``pred_ids`` has the shape of the logits
        tensor without its last dimension.
    """
    # Only unwrap when the model returned several outputs; indexing a bare
    # tensor with [0] would silently drop the batch dimension instead.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    pred_ids = torch.argmax(logits, dim=-1)
    return pred_ids, labels

In [None]:
# ------------------------------------------------------------
# TensorBoard logging utilities
# ------------------------------------------------------------
from torch.utils.tensorboard import SummaryWriter
from transformers import TrainerCallback
import torch
import time

# TensorBoard writer initialization
# Shared by the custom logging code in this notebook (GPU memory usage,
# evaluation metrics, total training time); events are written to OUT_DIR
writer = SummaryWriter(log_dir=OUT_DIR)

# Custom Trainer callback for GPU memory monitoring
class GpuLoggerCallback(TrainerCallback):
    """
    Trainer callback that records the currently allocated GPU memory
    in TensorBoard after every training step.
    """

    def __init__(self, writer):
        # Writer object used for logging; only its `add_scalar` method is called
        self.writer = writer

    def on_step_end(self, args, state, control, **kwargs):
        """
        Hook invoked when a training step finishes.

        Writes the allocated CUDA memory (bytes / 1024**3) under the tag
        "gpu_memory_gb" at the current global step. Does nothing when CUDA
        is unavailable. Always returns `control` unchanged.
        """
        # Nothing to report on CPU-only runs
        if not torch.cuda.is_available():
            return control

        bytes_in_use = torch.cuda.memory_allocated()
        allocated_gb = bytes_in_use / (1024 ** 3)
        self.writer.add_scalar("gpu_memory_gb", allocated_gb, state.global_step)

        return control


In [None]:
# ------------------------------------------------------------
# Metric computation for evaluation (ROUGE + logging)
# ------------------------------------------------------------
def compute_metrics(eval_pred):
    """
    Compute ROUGE metrics for the predicted summaries and log the results
    to TensorBoard and to a per-step CSV file for later analysis.

    Relies on the notebook-level objects `tokenizer`, `rouge`, `writer`,
    `trainer` and `OUT_DIR`.

    Args:
        eval_pred: Evaluation output whose `predictions[0]` holds predicted
            token ids (the first element returned by
            `preprocess_logits_for_metrics`) and whose `label_ids` holds the
            reference token ids.

    Returns:
        dict mapping "rouge1", "rouge2", "rougeL" and "gen_len" to values
        rounded to 4 decimals.

    NOTE(review): the ids come from an argmax over the logits of a regular
    forward pass, not from `generate()` — confirm this is the intended metric.
    """
    pad_id = tokenizer.pad_token_id

    # Extract predictions and labels
    predictions, labels = eval_pred.predictions[0], eval_pred.label_ids

    # -100 is the padding/ignore value used when batches of different length
    # are gathered; it is not a valid token id, so replace it with the pad
    # token in BOTH arrays before decoding.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode token ids back to text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE metrics
    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        rouge_types=["rouge1", "rouge2", "rougeL"],
    )

    # Average number of non-padding tokens per prediction
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    step = trainer.state.global_step

    # Log metrics to TensorBoard
    for metric_name, metric_value in result.items():
        writer.add_scalar(f"eval/{metric_name}", metric_value, step)

    # Persist metrics to CSV (for offline analysis)
    pd.DataFrame([result]).to_csv(
        f"{OUT_DIR}/rouge_results_step_{step}.csv",
        index=False,
    )

    # Return rounded metrics to Trainer
    return {
        metric_name: round(metric_value, 4)
        for metric_name, metric_value in result.items()
    }


In [None]:
# NOTE(review): EPOCHS, tokenized_train and tokenized_valid are not defined in
# the cells shown above; they presumably come from a tokenization/config cell —
# confirm it runs before this one.
# NOTE(review): predict_with_generate / generation_num_beams are only acted on
# by Seq2SeqTrainer; with the plain Trainer used below, evaluation does not
# call generate() — confirm whether that is intended.
training_args = Seq2SeqTrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=EPOCHS,

    # Batch sizes (TODO: tune; larger batches are faster if memory allows)
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=8,
    dataloader_num_workers=4,

    # Optimization
    learning_rate=0.00005,
    warmup_ratio=0.1,
    weight_decay=0.01,

    # Logging / evaluation / checkpointing all happen once per epoch.
    # (logging_steps / eval_steps are not passed: they only apply to the
    # "steps" strategies and were ignored here.)
    logging_dir=OUT_DIR,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    report_to="tensorboard",

    # Generation settings (see NOTE above)
    predict_with_generate=True,
    generation_num_beams=4,

    # Precision / performance
    bf16=True,
    fp16=False,
    tf32=True,
    torch_compile=True,

    # Best-model selection on validation loss (lower is better)
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=3),
        GpuLoggerCallback(writer),
    ],
)

# Run training and measure wall-clock duration
start = time.time()
history = trainer.train()
end = time.time()

writer.add_scalar("total_training_time_seconds", end - start, 0)
writer.flush()  # make sure the custom scalars reach disk

# Persist the Trainer log history. This must happen AFTER training:
# before train() the history is empty and the CSV would contain no rows.
pd.DataFrame(trainer.state.log_history).to_csv(f"{OUT_DIR}/training_history.csv")

In [None]:
# NOTE(review): this cell repeats the training cell directly above (same
# arguments, same Trainer); running both trains the model twice — keep one.
# NOTE(review): EPOCHS, tokenized_train and tokenized_valid are not defined in
# the cells shown above; they presumably come from a tokenization/config cell —
# confirm it runs before this one.

# Training arguments configuration
training_args = Seq2SeqTrainingArguments(
    output_dir=OUT_DIR,                 # Directory to save checkpoints and logs
    num_train_epochs=EPOCHS,            # Total number of training epochs

    # Batch sizes and data loading
    per_device_train_batch_size=8,      # Training batch size per GPU (tune for speed/memory)
    per_device_eval_batch_size=16,      # Evaluation batch size per GPU
    gradient_accumulation_steps=8,      # Effective batch size per device = 8 × 8 = 64
    dataloader_num_workers=4,           # Parallel data loading workers

    # Learning rate and optimization
    learning_rate=5e-5,                 # Learning rate for fine-tuning
    warmup_ratio=0.1,                   # Warmup over first 10% of training steps
    weight_decay=0.01,                  # Weight decay

    # Logging and evaluation
    logging_dir=OUT_DIR,                # TensorBoard log directory
    logging_strategy="epoch",           # Log at the end of each epoch
    eval_strategy="epoch",              # Evaluate after each epoch (current name of `evaluation_strategy`)
    report_to="tensorboard",            # Enable TensorBoard logging
    # NOTE(review): predict_with_generate / generation_num_beams are only acted
    # on by Seq2SeqTrainer; the plain Trainer below does not call generate().
    predict_with_generate=True,

    # Checkpointing
    save_strategy="epoch",              # Save model at the end of each epoch
    save_total_limit=2,                 # Limit the number of checkpoints kept on disk

    # Generation configuration
    generation_num_beams=4,             # Beam size used during generation

    # Performance optimizations (modern GPUs)
    bf16=True,                          # Use bfloat16 precision
    fp16=False,                         # Disable fp16
    tf32=True,                          # Enable TensorFloat-32
    torch_compile=True,                 # Enable PyTorch compilation

    # Best model selection
    metric_for_best_model="eval_loss",  # Select best model using validation loss
    greater_is_better=False,            # Lower loss is better
    load_best_model_at_end=True,        # Load best model after training
)

# Trainer initialization
trainer = Trainer(
    model=model,                               # Model to be fine-tuned
    args=training_args,                        # Training configuration
    train_dataset=tokenized_train,             # Tokenized training dataset
    eval_dataset=tokenized_valid,              # Tokenized validation dataset
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Custom callbacks and logging
trainer.add_callback(GpuLoggerCallback(writer))

# Training execution and timing
start = time.time()
history = trainer.train()
end = time.time()

# Log total training time to TensorBoard
writer.add_scalar("total_training_time_seconds", end - start, 0)
writer.flush()  # make sure the custom scalars reach disk

# Persist Trainer log history to CSV. This must happen AFTER training:
# before train() the history is empty and the CSV would contain no rows.
pd.DataFrame(trainer.state.log_history).to_csv(
    f"{OUT_DIR}/training_history.csv",
    index=False,
)
