---
title: Pegasus Training Loop
author: Josh Fernando
---

## Setup

In [1]:
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, set_seed
from peft import get_peft_model, LoraConfig, TaskType

import warnings
warnings.filterwarnings('ignore')

# Report the runtime environment. The device-name query is guarded so the
# cell still runs (and says so) on a machine without a visible CUDA device,
# instead of raising from torch.cuda.get_device_name(0).
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("GPU: none detected (running on CPU)")
print(f"CUDA Version: {torch.version.cuda}")
print(f"PyTorch Version: {torch.__version__}")

# --- CONFIGURATION ---
# Hyperparameters and paths shared by the training runs below.
MODEL_ID = "google/pegasus-large"
MAX_INPUT_LEN = 256   # max_length used when tokenizing model inputs
MAX_TARGET_LEN = 64   # max_length used when tokenizing target headlines
LR = 2e-4
BATCH_SIZE = 8  # Adjust based on GPU VRAM
EPOCHS = 5
OUTPUT_DIR_BASE = "../output/baseline_model_PEGASUS"
# OUTPUT_DIR_STYLE = "../output/style_model_PEGASUS"

# --- REPRODUCIBILITY ---
SEED = 42

# Ask cuDNN / PyTorch for deterministic kernels. With warn_only=True an op
# that has no deterministic implementation emits a warning rather than an error.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.use_deterministic_algorithms(True, warn_only=True)

# Seed the torch CPU and CUDA generators, then apply the transformers
# global seed helper.
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
set_seed(SEED)  # <-- Set global seed here!

# 1. Load Data
# Train / validation splits are read from the processed CSV files.
data_files = dict(
    train="../data/processed/train.csv",
    validation="../data/processed/dev.csv",
)
dataset = load_dataset("csv", data_files=data_files)

# 2. Tokenizer Setup & Special Tokens
# The two style markers are registered as extra tokens on the tokenizer.
special_tokens = ["<neutral>", "<punchy>"]
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.add_tokens(special_tokens)

  from .autonotebook import tqdm as notebook_tqdm


GPU: NVIDIA GeForce RTX 4070 Laptop GPU
CUDA Version: 12.8
PyTorch Version: 2.9.1+cu128


2

## Preprocessing & LoRA Setup Functions

In [2]:
# --- PREPROCESSING FUNCTIONS ---

def preprocess_baseline(examples):
    """Tokenize a batch for the baseline model (no style token).

    The ``snippet`` column is the model input and the ``headline`` column is
    the target. Both are truncated, to MAX_INPUT_LEN and MAX_TARGET_LEN
    tokens respectively.

    Returns the tokenizer output for the snippets, with the headline token
    ids stored under ``labels``.
    """
    # Using actual headline as the default target for baseline
    snippets, headlines = examples['snippet'], examples['headline']

    encoded = tokenizer(snippets, max_length=MAX_INPUT_LEN, truncation=True)
    encoded_targets = tokenizer(headlines, max_length=MAX_TARGET_LEN, truncation=True)

    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

def preprocess_style(examples):
    """Tokenize a batch for the style-controlled model.

    Every snippet is emitted once per style: the input is
    ``"<style-token> snippet"`` and the target is the value of the column
    named after that style (``neutral`` or ``punchy``) on the same row.
    The returned batch therefore has two entries per input row.

    Returns the tokenizer output for the prefixed inputs, with the target
    token ids stored under ``labels``.
    """
    # Style name (also the target column name) -> prefix token.
    style_tokens = {'neutral': '<neutral>', 'punchy': '<punchy>'}

    prompts = []
    headlines = []
    for row, snippet in enumerate(examples['snippet']):
        for style, token in style_tokens.items():
            prompts.append(f"{token} {snippet}")
            headlines.append(examples[style][row])

    encoded = tokenizer(prompts, max_length=MAX_INPUT_LEN, truncation=True)
    encoded_targets = tokenizer(headlines, max_length=MAX_TARGET_LEN, truncation=True)
    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

# 3. Create Processed Datasets
# Baseline: apply preprocess_baseline to every split, one batch at a time.
tokenized_baseline = dataset.map(preprocess_baseline, batched=True)
# Note: Style preprocessing expands the dataset (1 story -> 2 pairs, one per
# style), so the mapped batches have a different row count than the input.
# For simplicity this is done in two steps below: first flatten each story
# into (input, target) text pairs, then tokenize the flattened rows.
def flatten_style_data(batch):
    """Expand each row of ``batch`` into one (input, target) pair per style.

    For every snippet and every style, the input is the style token followed
    by a space and the snippet; the target is the value of the column named
    after the style on the same row. Pairs are ordered row by row, with
    ``neutral`` before ``punchy``.

    Returns a dict with two parallel lists, ``input`` and ``target``.
    """
    # (target column name, prefix token) for each supported style.
    style_tokens = (('neutral', '<neutral>'), ('punchy', '<punchy>'))

    pairs = [
        (f"{token} {snippet}", batch[column][row])
        for row, snippet in enumerate(batch['snippet'])
        for column, token in style_tokens
    ]
    return {
        'input': [text for text, _ in pairs],
        'target': [headline for _, headline in pairs],
    }

def _tokenize_style_batch(batch):
    """Convert a batch of flattened text pairs into input ids and label ids."""
    source_ids = tokenizer(batch['input'], max_length=MAX_INPUT_LEN, truncation=True)['input_ids']
    target_ids = tokenizer(batch['target'], max_length=MAX_TARGET_LEN, truncation=True)['input_ids']
    return {'input_ids': source_ids, 'labels': target_ids}


# Flatten every split into per-style text pairs. The original columns are
# removed because the flattened batches have a different number of rows.
style_dataset = dataset.map(
    flatten_style_data,
    batched=True,
    remove_columns=dataset['train'].column_names,
)
# Tokenize the flattened pairs; only input ids and label ids are produced here.
tokenized_style = style_dataset.map(_tokenize_style_batch, batched=True)

# 4. LoRA Setup Function
def get_lora_model(rank=8, target_modules=None):
    """Load a fresh MODEL_ID checkpoint and wrap it with LoRA adapters.

    Args:
        rank: LoRA rank ``r``. ``lora_alpha`` is set to ``2 * rank``.
        target_modules: Names of the modules to adapt, passed through to
            ``LoraConfig``. ``None`` (the default) selects
            ``['q_proj', 'v_proj']``.

    Returns:
        The PEFT-wrapped model. Its trainable-parameter summary is printed
        as a side effect.
    """
    if target_modules is None:
        # Build the default per call instead of sharing one list object
        # between calls through a mutable default argument.
        target_modules = ['q_proj', 'v_proj']

    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)
    # Resize for new style tokens (the tokenizer was extended after loading).
    # NOTE(review): only `target_modules` receive LoRA weights, so the rows
    # added to the embedding matrix here presumably keep their initial
    # values during training -- confirm whether `modules_to_save` is wanted.
    model.resize_token_embeddings(len(tokenizer))

    peft_config = LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,
        inference_mode=False,
        r=rank,
        lora_alpha=2 * rank,
        lora_dropout=0.1,
        target_modules=target_modules,
    )
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    return model

## Train Baseline Model

In [3]:
# 5. Train Baseline
print("\n--- Training Baseline Model ---")
# Re-seed immediately before the model is built (as the style-model loop
# does) so re-running this cell on its own starts from the same RNG state.
set_seed(SEED)
model_baseline = get_lora_model()
args_base = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR_BASE,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    seed=SEED,
    data_seed=SEED,  # same data-sampling seed the style runs use
)

trainer_base = Seq2SeqTrainer(
    model=model_baseline,
    args=args_base,
    train_dataset=tokenized_baseline["train"],
    eval_dataset=tokenized_baseline["validation"],
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model_baseline),
)
trainer_base.train()
model_baseline.save_pretrained(OUTPUT_DIR_BASE)
# The baseline's embeddings were resized to len(tokenizer) in get_lora_model,
# so store the extended tokenizer next to the adapter, as the style runs do.
tokenizer.save_pretrained(OUTPUT_DIR_BASE)


--- Training Baseline Model ---


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


trainable params: 1,572,864 || all params: 572,371,968 || trainable%: 0.2748


Epoch,Training Loss,Validation Loss
1,4.3786,3.476131
2,3.0913,2.685733
3,2.7603,2.471124
4,2.8383,2.410931
5,2.6517,2.392241


## Train Style Models

In [4]:
# Grid of style-model runs: two sets of LoRA target modules ("A" = attention
# q/v projections only, "AFFN" = all attention projections plus fc1/fc2),
# each trained at ranks 8, 16, 32 and 64. Every entry gets its own list of
# module names and an output directory named after its rank and module set.
style_configs = [
    {
        "directory": f"../output/Pegasus/style_model_PEGASUS_r{rank}{tag}",
        "rank": rank,
        "target_modules": list(modules),
    }
    for tag, modules in (
        ("A", ("q_proj", "v_proj")),
        ("AFFN", ("q_proj", "v_proj", "k_proj", "out_proj", "fc1", "fc2")),
    )
    for rank in (8, 16, 32, 64)
]

# 6. Train Style Models
import gc

print("\n--- Training Style-Controlled Models ---")
model_style = trainer_style = None
for config in style_configs:
    # Release the previous run before loading the next model.
    # torch.cuda.empty_cache() can only return memory that no live tensor
    # occupies, so the references held by the previous iteration's model and
    # trainer are dropped (and garbage-collected) first. The names are
    # rebound below, so the last run's objects remain available after the loop.
    model_style = trainer_style = None
    gc.collect()
    torch.cuda.empty_cache()

    # Same seed for every configuration so runs differ only in the LoRA setup.
    set_seed(SEED)
    model_style = get_lora_model(rank=config["rank"], target_modules=config["target_modules"]) # Re-initialize fresh model
    args_style = Seq2SeqTrainingArguments(
        output_dir=config["directory"],
        learning_rate=LR,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_steps=10,
        seed=SEED,
        data_seed=SEED,
    )

    trainer_style = Seq2SeqTrainer(
        model=model_style,
        args=args_style,
        train_dataset=tokenized_style["train"],
        eval_dataset=tokenized_style["validation"],
        data_collator=DataCollatorForSeq2Seq(tokenizer, model=model_style),
    )
    trainer_style.train()
    model_style.save_pretrained(config["directory"])
    tokenizer.save_pretrained(config["directory"]) # Save tokenizer with new tokens

print("Training Complete!")


--- Training Style-Controlled Models ---


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,572,864 || all params: 572,371,968 || trainable%: 0.2748


Epoch,Training Loss,Validation Loss
1,3.7763,3.073614
2,3.1458,2.642569
3,2.9016,2.507086
4,2.8171,2.457183
5,2.9002,2.442826


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 3,145,728 || all params: 573,944,832 || trainable%: 0.5481


Epoch,Training Loss,Validation Loss
1,3.2663,2.805567
2,2.9387,2.491304
3,2.6563,2.398459
4,2.5886,2.356173
5,2.7545,2.349295


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 6,291,456 || all params: 577,090,560 || trainable%: 1.0902


Epoch,Training Loss,Validation Loss
1,2.9869,2.567326
2,2.7137,2.361155
3,2.4675,2.284897
4,2.4041,2.250947
5,2.5488,2.245611


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 12,582,912 || all params: 583,382,016 || trainable%: 2.1569


Epoch,Training Loss,Validation Loss
1,2.7256,2.428195
2,2.5889,2.296451
3,2.3417,2.227006
4,2.2638,2.209273
5,2.4396,2.206008


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 5,767,168 || all params: 576,566,272 || trainable%: 1.0003


Epoch,Training Loss,Validation Loss
1,3.6057,2.696144
2,2.8385,2.389969
3,2.5435,2.298279
4,2.4118,2.258296
5,2.579,2.251112


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 11,534,336 || all params: 582,333,440 || trainable%: 1.9807


Epoch,Training Loss,Validation Loss
1,3.1894,2.486214
2,2.5919,2.280167
3,2.3103,2.204924
4,2.2208,2.185128
5,2.3689,2.181582


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 23,068,672 || all params: 593,867,776 || trainable%: 3.8845


Epoch,Training Loss,Validation Loss
1,2.6867,2.378242
2,2.4463,2.207739
3,2.1178,2.161018
4,2.0615,2.145142
5,2.1634,2.1484


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 46,137,344 || all params: 616,936,448 || trainable%: 7.4785


Epoch,Training Loss,Validation Loss
1,2.5987,2.307817
2,2.331,2.181708
3,2.0533,2.144907
4,1.9563,2.138778
5,2.0132,2.152333


Training Complete!
