---
title: Pegasus Evaluation Loop
author: "Josh Fernando"
---

## Setup

In [1]:
import torch
import pandas as pd
import evaluate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from peft import PeftModel, PeftConfig
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# --- Runtime configuration ---
# Hugging Face id of the base checkpoint that each adapter is loaded on top of.
BASE_MODEL_ID = "google/pegasus-large"

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
# Kept as a plain string because a later cell compares it with "cuda".
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# --- Evaluation data ---
# Later cells read the 'snippet', 'neutral' and 'punchy' columns of this frame.
TEST_CSV_PATH = '../data/processed/test.csv'
test_df = pd.read_csv(TEST_CSV_PATH)

  from .autonotebook import tqdm as notebook_tqdm


## Generate Headlines

In [2]:
# Adapter checkpoints to evaluate. The display names suggest LoRA rank
# (8/16/32/64) and target modules (attention only vs. attention + FFN);
# the name is also used as the key of `results_pred`.
models = [
    {"name": "LoRA-8 Attention", "path": "../output/Pegasus/style_model_PEGASUS_r8A"},
    {"name": "LoRA-16 Attention", "path": "../output/Pegasus/style_model_PEGASUS_r16A"},
    {"name": "LoRA-32 Attention", "path": "../output/Pegasus/style_model_PEGASUS_r32A"},
    {"name": "LoRA-64 Attention", "path": "../output/Pegasus/style_model_PEGASUS_r64A"},
    {"name": "LoRA-8 Attention+FFN", "path": "../output/Pegasus/style_model_PEGASUS_r8AFFN"},
    {"name": "LoRA-16 Attention+FFN", "path": "../output/Pegasus/style_model_PEGASUS_r16AFFN"},
    {"name": "LoRA-32 Attention+FFN", "path": "../output/Pegasus/style_model_PEGASUS_r32AFFN"},
    {"name": "LoRA-64 Attention+FFN", "path": "../output/Pegasus/style_model_PEGASUS_r64AFFN"},
]

# Requested styles and the control token prepended to the snippet for each.
# Defined at cell level (not inside the loop) because the evaluation cells
# further down iterate over `styles`.
styles = ['neutral', 'punchy']
style_tokens = {'neutral': '<neutral>', 'punchy': '<punchy>'}

# Decoding settings per requested style.
generation_kwargs = {
    # Neutral: beam search.
    'neutral': dict(max_new_tokens=64, num_beams=4),
    # Punchy: nucleus sampling with temperature; a single sequence is returned.
    'punchy': dict(
        max_new_tokens=64,
        do_sample=True,
        top_p=0.9,
        temperature=0.8,
        num_return_sequences=1,
    ),
}

# The punchy branch samples, so the RNG is re-seeded before every adapter's
# run; this makes each adapter's outputs reproducible and independent of the
# order in which adapters are evaluated.
GENERATION_SEED = 42


def generate_headline(generator, tokenizer, text, **generate_kwargs):
    """Generate one headline for `text` and return it as a decoded string.

    Args:
        generator: object exposing `.generate(...)` (the PEFT-wrapped model).
        tokenizer: tokenizer used both to encode `text` and decode the output.
        text: model input (snippet, optionally prefixed with a style token).
        **generate_kwargs: forwarded unchanged to `generator.generate`.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    # truncation=True caps the input at the tokenizer's configured maximum
    # length so an over-long snippet cannot overflow the position embeddings.
    inputs = tokenizer(text, return_tensors="pt", truncation=True).to(DEVICE)
    with torch.no_grad():
        outputs = generator.generate(**inputs, **generate_kwargs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


results_pred = dict()

print("Generating headlines...")
for pegasus in models:
    # Setup: tokenizer comes from the adapter directory, weights from the base
    # checkpoint, then the adapter is attached on top.
    STYLE_ADAPTER_PATH = pegasus["path"]
    tokenizer = AutoTokenizer.from_pretrained(STYLE_ADAPTER_PATH)
    base_model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL_ID)
    # NOTE(review): resizing initialises embedding rows for any tokens the
    # adapter tokenizer adds; whether the adapter checkpoint restores trained
    # values for those rows depends on how it was saved -- TODO confirm.
    base_model.resize_token_embeddings(len(tokenizer))
    model = PeftModel.from_pretrained(base_model, STYLE_ADAPTER_PATH)
    model.to(DEVICE)
    model.eval()

    torch.manual_seed(GENERATION_SEED)

    # Generation loop: one record per test row.
    results = []
    for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc=pegasus["name"]):
        snippet = row['snippet']

        entry = {
            'snippet': snippet,
            'ref_neutral': row['neutral'],
            'ref_punchy': row['punchy'],
        }

        # --- 1. Style-controlled generation (adapter active) ---
        for style in styles:
            entry[f'gen_{style}'] = generate_headline(
                model,
                tokenizer,
                f"{style_tokens[style]} {snippet}",
                **generation_kwargs[style],
            )

        # --- 2. Baseline generation (no style token, adapter disabled) ---
        # PeftModel.from_pretrained injects the LoRA layers into `base_model`
        # in place, so calling `base_model.generate` directly would still run
        # the adapted weights. The adapter has to be switched off explicitly
        # to obtain a true base-model baseline.
        with model.disable_adapter():
            entry['gen_baseline'] = generate_headline(
                model, tokenizer, snippet, max_new_tokens=64
            )

        results.append(entry)

    results_pred[pegasus["name"]] = pd.DataFrame(results)

Generating headlines...


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
100%|██████████| 64/64 [02:28<00:00,  2.32s/it]
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 64/64

In [3]:
# Write one predictions CSV per adapter; the positional row index carries no
# information here, so it is not written.
for model_name, predictions in results_pred.items():
    predictions.to_csv(f"../data/predictions/Pegasus_{model_name}.csv", index=False)

## ROUGE Evaluation

In [4]:
# 3. Metric 1: ROUGE Score
# For every adapter, score four prediction/reference pairings:
#   - each style-controlled output against the reference of the same style
#   - the baseline output against the neutral and the punchy reference
rouge = evaluate.load('rouge')
results_rouge = dict()
print("\n--- ROUGE Scores ---")

ROUGE_COLUMNS = ["rouge_1", "rouge_2", "rouge_L", "rouge_Lsum"]

# (row label, prediction column, reference column), in output row order.
# `styles` is the list defined in the generation cell.
comparisons = [
    (f"<{style.upper()}> vs. REF_{style.upper()}", f"gen_{style}", f"ref_{style}")
    for style in styles
]
comparisons.append(("Baseline Model vs. REF_NEUTRAL", "gen_baseline", "ref_neutral"))
comparisons.append(("Baseline Model vs. REF_PUNCHY", "gen_baseline", "ref_punchy"))

for model, results_df in results_pred.items():
    rouge_scores = pd.DataFrame(columns=ROUGE_COLUMNS)

    for row_label, pred_col, ref_col in comparisons:
        score = rouge.compute(
            predictions=results_df[pred_col].tolist(),
            references=results_df[ref_col].tolist(),
        )
        # The metric's values are stored positionally, in the order the metric
        # returns them, under ROUGE_COLUMNS.
        rouge_scores.loc[row_label] = list(score.values())

    results_rouge[model] = rouge_scores


--- ROUGE Scores ---


In [5]:
# Write one ROUGE table per adapter. The row index holds the comparison label
# (e.g. "<NEUTRAL> vs. REF_NEUTRAL", "Baseline Model vs. REF_PUNCHY"), which is
# the only thing identifying each row, so it must be written to the file
# (index=True), matching how the other metric tables are saved.
for model_name, rouge_table in results_rouge.items():
    rouge_table.to_csv(f"../data/rouge/Pegasus_{model_name}.csv", index=True)

## Classification

In [6]:
# 4. Metric 2: Style Accuracy (Simple Proxy Classifier)
# A TF-IDF + logistic-regression classifier is fitted on the reference
# headlines of the TRAIN set and then used to label generated headlines.
from collections import Counter

print("\n--- Training Proxy Style Classifier ---")
train_df = pd.read_csv('../data/processed/train.csv')

# Flatten the training frame into interleaved (text, label) lists:
# neutral headline first, then the punchy headline of the same row.
train_texts = []
train_labels = []
for neutral_text, punchy_text in zip(train_df['neutral'], train_df['punchy']):
    train_texts += [neutral_text, punchy_text]
    train_labels += ['neutral', 'punchy']

# 1. Hold out 20% of the reference headlines to sanity-check the classifier.
X_classifier_train, X_classifier_test, y_classifier_train, y_classifier_test = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(ngram_range=(1,5))
X_train_vec = vectorizer.fit_transform(X_classifier_train)
X_test_vec = vectorizer.transform(X_classifier_test)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vec, y_classifier_train)

# 2. Accuracy of the classifier itself on the held-out reference headlines.
classifier_acc = accuracy_score(y_classifier_test, clf.predict(X_test_vec))

print(f"Proxy Style Classifier Accuracy (on human data): {classifier_acc:.2%}")


def predicted_style_shares(texts):
    """Label `texts` with the proxy classifier.

    Returns a tuple (labels, neutral_share, punchy_share) where `labels` is the
    classifier's prediction per text and the shares are the fractions of
    predictions equal to 'neutral' and 'punchy' respectively.
    """
    labels = clf.predict(vectorizer.transform(texts))
    counts = Counter(labels)
    n_texts = len(labels)
    return labels, counts.get('neutral', 0) / n_texts, counts.get('punchy', 0) / n_texts


# Evaluate Generated Headlines for All Models
results_classification = dict()
print("\n--- Evaluating Style Accuracy ---")

for model, results_df in results_pred.items():
    classification_scores = pd.DataFrame(columns=["Accuracy", "Neutral_Pct", "Punchy_Pct"])

    # Style-controlled outputs: accuracy = share of predictions that match the
    # style that was requested. `styles` comes from the generation cell.
    for style in styles:
        labels, neutral_pct, punchy_pct = predicted_style_shares(
            results_df[f'gen_{style}'].tolist()
        )
        hits = sum(1 for label in labels if label == style)
        classification_scores.loc[f"<{style.upper()}>"] = [
            hits / len(labels), neutral_pct, punchy_pct
        ]

    # Baseline outputs: there is no requested style, so only the predicted
    # distribution is recorded and the accuracy cell is left as None.
    baseline_labels, baseline_neutral_pct, baseline_punchy_pct = predicted_style_shares(
        results_df['gen_baseline'].tolist()
    )
    classification_scores.loc["Baseline"] = [None, baseline_neutral_pct, baseline_punchy_pct]

    results_classification[model] = classification_scores



--- Training Proxy Style Classifier ---
Proxy Style Classifier Accuracy (on human data): 59.32%

--- Evaluating Style Accuracy ---


In [7]:
# Write one style-classification table per adapter; the index (row label) is
# kept in the file.
for model_name, classification_table in results_classification.items():
    classification_table.to_csv(f"../data/classification/Pegasus_{model_name}.csv", index=True)

## Entailment

In [8]:
# 5. Metric 3: Factuality (NLI)
# An MNLI classifier judges whether each generated headline (hypothesis) is
# entailed by its source snippet (premise).
print("\n--- Evaluating Factuality (NLI) ---")
nli_pipeline = pipeline("text-classification", model="roberta-large-mnli", device=0 if DEVICE=="cuda" else -1)


def get_entailment_score(premise, hypothesis):
    """Score how strongly `premise` entails `hypothesis`.

    The pair is passed to the NLI pipeline as text / text_pair. The pipeline
    returns only its top label and that label's score, so the result is:
      - the reported score, when the top label is 'ENTAILMENT'
      - 0.0 otherwise
    i.e. this is a thresholded score, not the raw entailment probability.
    """
    # truncation=True keeps an over-long premise/hypothesis pair within the
    # tokenizer's maximum input length instead of failing inside the model.
    result = nli_pipeline({'text': premise, 'text_pair': hypothesis}, truncation=True)
    return result['score'] if result['label'] == 'ENTAILMENT' else 0.0


def mean_entailment(results_df, generated_column):
    """Average `get_entailment_score` of snippet -> `generated_column` over all rows.

    Returns NaN for an empty frame rather than raising ZeroDivisionError.
    """
    scores = [
        get_entailment_score(snippet, headline)
        for snippet, headline in zip(results_df['snippet'], results_df[generated_column])
    ]
    return sum(scores) / len(scores) if scores else float('nan')


# Evaluate Generated Headlines for All Models
results_entailment = dict()

for model, results_df in results_pred.items():
    entailment_scores = pd.DataFrame(columns=["Avg_Entailment_Score"])

    # Style-controlled outputs (`styles` comes from the generation cell).
    for style in styles:
        entailment_scores.loc[f"<{style.upper()}>"] = [mean_entailment(results_df, f'gen_{style}')]

    # Baseline output.
    entailment_scores.loc["Baseline"] = [mean_entailment(results_df, 'gen_baseline')]

    results_entailment[model] = entailment_scores

print("\nEvaluation Complete!")


--- Evaluating Factuality (NLI) ---


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0



--- Evaluating Factuality (NLI) ---


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



Evaluation Complete!


In [9]:
# Write one entailment table per adapter; the index (row label) is kept in the
# file.
for model_name, entailment_table in results_entailment.items():
    entailment_table.to_csv(f"../data/entailment/Pegasus_{model_name}.csv", index=True)

## Identical Outputs

In [10]:
# 6. Metric 4: Identical Outputs
# For each adapter, count the rows where the neutral and the punchy generation
# are exactly the same string.
print("\n--- Evaluating Identical Outputs ---")

results_identical = dict()

for model, results_df in results_pred.items():
    # Element-wise string equality between the two generated columns.
    same_mask = results_df["gen_neutral"] == results_df["gen_punchy"]

    identical_count = sum(same_mask)
    total_count = len(results_df)
    identical_percentage = (identical_count / total_count) * 100

    # Single-row summary table for this adapter.
    summary_row = {
        "Count": identical_count,
        "Total": total_count,
        "Percentage": identical_percentage,
    }
    results_identical[model] = pd.DataFrame([summary_row], index=["Identical Outputs"])

    print(f"{model}: {identical_count}/{total_count} ({identical_percentage:.2f}%)")

print("\nIdentical Outputs Evaluation Complete!")



--- Evaluating Identical Outputs ---
LoRA-8 Attention: 41/64 (64.06%)
LoRA-16 Attention: 44/64 (68.75%)
LoRA-32 Attention: 39/64 (60.94%)
LoRA-64 Attention: 46/64 (71.88%)
LoRA-8 Attention+FFN: 41/64 (64.06%)
LoRA-16 Attention+FFN: 42/64 (65.62%)
LoRA-32 Attention+FFN: 48/64 (75.00%)
LoRA-64 Attention+FFN: 47/64 (73.44%)

Identical Outputs Evaluation Complete!


In [11]:
# Write one identical-output summary per adapter; the index (row label) is kept
# in the file.
for model_name, identical_table in results_identical.items():
    identical_table.to_csv(f"../data/identical_outputs/Pegasus_{model_name}.csv", index=True)