In [1]:
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, set_seed
from peft import get_peft_model, LoraConfig, TaskType

import warnings
warnings.filterwarnings('ignore')  # silences every Python warning for the remainder of the session
# --- CONFIGURATION ---
SEED = 42 
set_seed(SEED) # <-- Set global seed here! The same SEED is also passed to both training-argument objects.
MODEL_ID = "google/pegasus-large"
MAX_INPUT_LEN = 256  # token cap for source text; longer inputs are truncated
MAX_TARGET_LEN = 64  # token cap for target headlines; longer targets are truncated
LR = 2e-4
BATCH_SIZE = 8 # Adjust based on GPU VRAM
EPOCHS = 5
OUTPUT_DIR_BASE = "../output/baseline_model_PEGASUS"
OUTPUT_DIR_STYLE = "../output/style_model_PEGASUS"

# 1. Load Data
# The code further down reads the columns 'snippet', 'headline', 'neutral'
# and 'punchy' from these CSV files.
data_files = {
    "train": "../data/processed/train.csv",
    "validation": "../data/processed/dev.csv"
}
dataset = load_dataset("csv", data_files=data_files)

# 2. Tokenizer Setup & Special Tokens
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Style-control markers that get prepended to the snippet text. Adding them
# grows the vocabulary, which is why get_lora_model() resizes the model's
# token embeddings to len(tokenizer).
special_tokens = ["<neutral>", "<punchy>"]
tokenizer.add_tokens(special_tokens)

# --- PREPROCESSING FUNCTIONS ---

def preprocess_baseline(examples):
    """Tokenize one batch for the baseline model (no style token).

    Args:
        examples: Batch mapping that provides a 'snippet' column (source
            text) and a 'headline' column (target text).

    Returns:
        The tokenizer output for the snippets, with an extra "labels" entry
        holding the token ids of the headlines.
    """
    source_texts = examples['snippet']
    headline_texts = examples['headline']

    # Sources and targets are truncated to their own length limits.
    encoded = tokenizer(source_texts, max_length=MAX_INPUT_LEN, truncation=True)
    encoded["labels"] = tokenizer(
        headline_texts, max_length=MAX_TARGET_LEN, truncation=True
    )["input_ids"]
    return encoded

def preprocess_style(examples):
    """Tokenize one batch for the style-controlled model.

    Every snippet is emitted twice, once per style: the input is
    "<style-token> snippet" and the target is the text found in the column
    named after that style ('neutral' or 'punchy'). The returned batch
    therefore has two rows for each input row.

    Args:
        examples: Batch mapping with 'snippet', 'neutral' and 'punchy'
            columns.

    Returns:
        The tokenizer output for the prefixed snippets, with an extra
        "labels" entry holding the token ids of the matching headlines.
    """
    # Column name -> control token; dict order fixes the per-snippet order
    # (neutral first, then punchy).
    token_for_style = {'neutral': '<neutral>', 'punchy': '<punchy>'}

    prefixed_snippets = []
    headlines = []
    for row, snippet in enumerate(examples['snippet']):
        for style_name, style_token in token_for_style.items():
            prefixed_snippets.append(f"{style_token} {snippet}")
            headlines.append(examples[style_name][row])

    encoded = tokenizer(prefixed_snippets, max_length=MAX_INPUT_LEN, truncation=True)
    encoded["labels"] = tokenizer(
        headlines, max_length=MAX_TARGET_LEN, truncation=True
    )["input_ids"]
    return encoded

# 3. Create Processed Datasets
tokenized_baseline = dataset.map(preprocess_baseline, batched=True)
# Note: Style preprocessing expands the dataset (1 story -> 2 pairs, one per
# style token), so the mapped batch has more rows than the input batch.
# The style data is therefore built in two steps: first flatten every story
# into (input, target) text pairs while dropping the original columns, then
# tokenize the flattened pairs.
def flatten_style_data(batch):
    """Expand each story in a batch into one text pair per style.

    Args:
        batch: Mapping with a 'snippet' column plus 'neutral' and 'punchy'
            columns holding the headline for each style.

    Returns:
        A dict with two equally long lists: 'input' ("<style-token> snippet")
        and 'target' (the headline from the matching style column). Pairs are
        ordered story by story, neutral before punchy.
    """
    token_by_column = {'neutral': '<neutral>', 'punchy': '<punchy>'}

    pairs = [
        (f"{style_token} {snippet}", batch[column][row])
        for row, snippet in enumerate(batch['snippet'])
        for column, style_token in token_by_column.items()
    ]

    return {
        'input': [source for source, _ in pairs],
        'target': [headline for _, headline in pairs],
    }

# Step 1: replace every story by its (input, target) text pairs. The original
# columns are dropped because the flattened batch has a different row count.
style_dataset = dataset.map(
    flatten_style_data,
    batched=True,
    remove_columns=dataset['train'].column_names,
)


def _tokenize_style_batch(batch):
    """Turn flattened text pairs into token ids for the encoder and the labels."""
    source_ids = tokenizer(batch['input'], max_length=MAX_INPUT_LEN, truncation=True)['input_ids']
    label_ids = tokenizer(batch['target'], max_length=MAX_TARGET_LEN, truncation=True)['input_ids']
    return {'input_ids': source_ids, 'labels': label_ids}


# Step 2: tokenize the flattened pairs.
tokenized_style = style_dataset.map(_tokenize_style_batch, batched=True)

# 4. LoRA Setup Function
def get_lora_model():
    """Load a fresh MODEL_ID checkpoint and wrap it with LoRA adapters.

    The embedding matrix is resized to len(tokenizer) so the added style
    tokens have rows, then a LoRA configuration is applied and the
    trainable-parameter summary is printed.

    Returns:
        The PEFT-wrapped sequence-to-sequence model.
    """
    base_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)
    # Resize for the new style tokens added to the tokenizer.
    base_model.resize_token_embeddings(len(tokenizer))

    # Adapted modules, matched by name: the attention projections
    # (q/k/v/out) plus the feed-forward layers (fc1/fc2).
    # NOTE(review): no embedding module is listed here, so the rows created
    # for the style tokens presumably keep their resize-time initialisation
    # during training -- confirm this is intended.
    adapted_modules = ["q_proj", "v_proj", "k_proj", "out_proj", "fc1", "fc2"]

    lora_settings = LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,
        inference_mode=False,
        r=64,
        lora_alpha=128,
        lora_dropout=0.1,
        target_modules=adapted_modules,
    )

    lora_model = get_peft_model(base_model, lora_settings)
    lora_model.print_trainable_parameters()
    return lora_model

# 5. Train Baseline
print("\n--- Training Baseline Model ---")
model_baseline = get_lora_model()
# Epoch-level evaluation and checkpointing; SEED is passed again so the
# trainer's own seeding matches the global one.
args_base = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR_BASE,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    seed=SEED
)

trainer_base = Seq2SeqTrainer(
    model=model_baseline,
    args=args_base,
    train_dataset=tokenized_baseline["train"],
    eval_dataset=tokenized_baseline["validation"],
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model_baseline),
)
trainer_base.train()
model_baseline.save_pretrained(OUTPUT_DIR_BASE)
# The baseline model was also resized to the extended vocabulary inside
# get_lora_model(), so store the matching tokenizer next to the adapter,
# exactly as is done for the style model.
tokenizer.save_pretrained(OUTPUT_DIR_BASE)

# 6. Train Style Model
print("\n--- Training Style-Controlled Model ---")
# Start from a freshly loaded, freshly wrapped model so nothing carries over
# from the baseline run.
model_style = get_lora_model()

# Same hyper-parameters as the baseline; only the output directory differs.
style_run_settings = dict(
    output_dir=OUTPUT_DIR_STYLE,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    seed=SEED,
)
args_style = Seq2SeqTrainingArguments(**style_run_settings)

style_collator = DataCollatorForSeq2Seq(tokenizer, model=model_style)
trainer_style = Seq2SeqTrainer(
    model=model_style,
    args=args_style,
    train_dataset=tokenized_style["train"],
    # Note: In real research, ensure validation doesn't overlap with training data.
    eval_dataset=tokenized_style["validation"],
    data_collator=style_collator,
)
trainer_style.train()

# NOTE(review): the adapter written below is the model state after the final
# epoch; these arguments do not request best-checkpoint selection, so a lower
# validation loss at an earlier epoch is not taken into account -- confirm
# that is the intended protocol.
model_style.save_pretrained(OUTPUT_DIR_STYLE)
# Save the tokenizer too, since it carries the added style tokens.
tokenizer.save_pretrained(OUTPUT_DIR_STYLE)

print("Training Complete!")

  from .autonotebook import tqdm as notebook_tqdm



--- Training Baseline Model ---


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


trainable params: 46,137,344 || all params: 616,936,448 || trainable%: 7.4785


Epoch,Training Loss,Validation Loss
1,2.8603,2.251956
2,2.3221,2.142944
3,2.0271,2.102365
4,2.0206,2.063111
5,1.9588,2.062903



--- Training Style-Controlled Model ---


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 46,137,344 || all params: 616,936,448 || trainable%: 7.4785


Epoch,Training Loss,Validation Loss
1,2.5467,2.280951
2,2.3609,2.17741
3,1.9827,2.118946
4,1.9005,2.11726
5,2.0106,2.129208


Training Complete!
