In [3]:
import torch
import pandas as pd
import evaluate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from peft import PeftModel, PeftConfig
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Setup
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BASE_MODEL_ID = "google/pegasus-large"
STYLE_ADAPTER_PATH = "../output/style_model_PEGASUS"

# 1. Load Resources
tokenizer = AutoTokenizer.from_pretrained(STYLE_ADAPTER_PATH)
base_model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL_ID)
base_model.resize_token_embeddings(len(tokenizer))
model = PeftModel.from_pretrained(base_model, STYLE_ADAPTER_PATH)
model.to(DEVICE)
model.eval()

# Load Test Data
test_df = pd.read_csv('../data/processed/test.csv')

# 2. Generation Loop
print("Generating headlines...")
results = []
styles = ['neutral', 'punchy']
style_tokens = {'neutral': '<neutral>', 'punchy': '<punchy>'}

for idx, row in tqdm(test_df.iterrows(), total=len(test_df)):
    snippet = row['snippet']
    
    entry = {'snippet': snippet, 'ref_neutral': row['neutral'], 
             'ref_punchy': row['punchy']}
    
    # --- 1. Style-Controlled Model Generation ---
    for style in styles:
        input_text = f"{style_tokens[style]} {snippet}"
        inputs = tokenizer(input_text, return_tensors="pt").to(DEVICE)
        
        with torch.no_grad():
            if style == 'punchy':
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=64,
                    do_sample=True,             # Use sampling
                    top_p=0.9,                  # Nucleus sampling
                    temperature=0.8,            # High temperature for creativity
                    num_return_sequences=1      # Ensure you only get one output
                )
            else: # neutral style, stick to high-fidelity generation
                outputs = model.generate(
                    **inputs, 
                    max_new_tokens=64,
                    num_beams=4,
                )
        
        gen_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        entry[f'gen_{style}'] = gen_text
        
    # --- 2. Baseline Model Generation ---
    # Baseline uses just the snippet as input, no style token
    baseline_inputs = tokenizer(snippet, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        baseline_outputs = base_model.generate(**baseline_inputs, max_new_tokens=64) # Uses base_model
    
    baseline_text = tokenizer.decode(baseline_outputs[0], skip_special_tokens=True)
    entry['gen_baseline'] = baseline_text
    
    results.append(entry)

results_df = pd.DataFrame(results)

# 3. Metric 1: ROUGE Score
rouge = evaluate.load('rouge')
print("\n--- ROUGE Scores ---")

# Style-Controlled Model Scores (Existing)
for style in styles:
    refs = results_df[f'ref_{style}'].tolist()
    preds = results_df[f'gen_{style}'].tolist()
    score = rouge.compute(predictions=preds, references=refs)
    print(f"Style Model <{style.upper()}>: {score}")

# Baseline Model Scores (NEW)
baseline_preds = results_df['gen_baseline'].tolist()

# Compare Baseline against NEUTRAL reference
refs_neutral = results_df['ref_neutral'].tolist()
score_baseline_neutral = rouge.compute(predictions=baseline_preds, references=refs_neutral)
print(f"Baseline Model vs. REF_NEUTRAL: {score_baseline_neutral}")

# Compare Baseline against PUNCHY reference
refs_punchy = results_df['ref_punchy'].tolist()
score_baseline_punchy = rouge.compute(predictions=baseline_preds, references=refs_punchy)
print(f"Baseline Model vs. REF_PUNCHY: {score_baseline_punchy}")


# 4. Metric 2: Style Accuracy (Simple Proxy Classifier)
# We train a quick classifier on the TRAIN set to act as our evaluator
print("\n--- Training Proxy Style Classifier ---")
train_df = pd.read_csv('../data/processed/train.csv')
train_texts = []
train_labels = []

# Unpack training data for classifier
for _, row in train_df.iterrows():
    train_texts.extend([row['neutral'], row['punchy']])
    train_labels.extend(['neutral', 'punchy'])

# 1. Split the human data for Classifier Training/Testing
X_classifier_train, X_classifier_test, y_classifier_train, y_classifier_test = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(ngram_range=(1,5))
X_train_vec = vectorizer.fit_transform(X_classifier_train)
X_test_vec = vectorizer.transform(X_classifier_test)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vec, y_classifier_train)

# 2. Calculate Classifier Accuracy on its own test set
y_pred = clf.predict(X_test_vec)
classifier_acc = accuracy_score(y_classifier_test, y_pred)

print(f"Proxy Style Classifier Accuracy (on human data): {classifier_acc:.2%}")

# Evaluate Generated Headlines
# Evaluate Generated Headlines
print("\n--- Evaluating Style Accuracy ---")

# Style-Controlled Model Evaluation (Existing)
for style in styles:
    gen_texts = results_df[f'gen_{style}'].tolist()
    X_test = vectorizer.transform(gen_texts)
    preds = clf.predict(X_test)
    
    # Calculate how many matched the requested style
    matches = [1 if p == style else 0 for p in preds]
    acc = sum(matches) / len(matches)
    print(f"Style Model Accuracy for requested style <{style}>: {acc:.2%}")

# Baseline Model Evaluation (NEW)
baseline_gen_texts = results_df['gen_baseline'].tolist()
X_test_baseline = vectorizer.transform(baseline_gen_texts)
preds_baseline = clf.predict(X_test_baseline)

# Calculate the distribution of predicted styles for the baseline
from collections import Counter
baseline_style_distribution = Counter(preds_baseline)
total_count = len(preds_baseline)

print("\nBaseline Model Style Distribution:")
for style, count in baseline_style_distribution.items():
    print(f"  Predicted <{style}>: {count / total_count:.2%}")

# The baseline model isn't requested to be a specific style, so you evaluate its bias.

# 5. Metric 3: Factuality (NLI)
# Using a small NLI model to check entailment
print("\n--- Evaluating Factuality (NLI) ---")
nli_pipeline = pipeline("text-classification", model="roberta-large-mnli", device=0 if DEVICE=="cuda" else -1)

def get_entailment_score(premise, hypothesis):
    # NLI input format: "Premise </s></s> Hypothesis" (model specific, but pipeline handles pairs usually)
    # roberta-large-mnli labels: CONTRADICTION, NEUTRAL, ENTAILMENT
    # We pass text_pair to pipeline
    result = nli_pipeline({'text': premise, 'text_pair': hypothesis})
    # We want to know if it is NOT contradiction, or strictly entailment
    # Let's track Entailment probability
    return result['score'] if result['label'] == 'ENTAILMENT' else 0.0

print("\n--- Evaluating Factuality (NLI) ---")

# Style-Controlled Model Evaluation (Existing)
for style in styles:
    scores = []
    for _, row in results_df.iterrows():
        score = get_entailment_score(row['snippet'], row[f'gen_{style}'])
        scores.append(score)
    avg_fact = sum(scores)/len(scores)
    print(f"Style Model Average Entailment Score for <{style}>: {avg_fact:.4f}")

# Baseline Model Evaluation (NEW)
baseline_scores = []
for _, row in results_df.iterrows():
    score = get_entailment_score(row['snippet'], row['gen_baseline'])
    baseline_scores.append(score)
avg_fact_baseline = sum(baseline_scores)/len(baseline_scores)
print(f"Baseline Model Average Entailment Score: {avg_fact_baseline:.4f}")

print("\nEvaluation Complete!")

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Generating headlines...


100%|██████████| 64/64 [02:27<00:00,  2.31s/it]



--- ROUGE Scores ---
Style Model <NEUTRAL>: {'rouge1': np.float64(0.5225898104277085), 'rouge2': np.float64(0.2924609627749377), 'rougeL': np.float64(0.47700224503886224), 'rougeLsum': np.float64(0.47749520313651944)}
Style Model <PUNCHY>: {'rouge1': np.float64(0.3903088243572036), 'rouge2': np.float64(0.17477048559896916), 'rougeL': np.float64(0.34522829690827084), 'rougeLsum': np.float64(0.34697396355222204)}
Baseline Model vs. REF_NEUTRAL: {'rouge1': np.float64(0.5041873390431714), 'rouge2': np.float64(0.2900979513984313), 'rougeL': np.float64(0.4613982541553338), 'rougeLsum': np.float64(0.46130009518084536)}
Baseline Model vs. REF_PUNCHY: {'rouge1': np.float64(0.37547813489239795), 'rouge2': np.float64(0.1709033225922213), 'rougeL': np.float64(0.3420045958200083), 'rougeLsum': np.float64(0.34152744607297775)}

--- Training Proxy Style Classifier ---
Proxy Style Classifier Accuracy (on human data): 59.32%

--- Evaluating Style Accuracy ---
Style Model Accuracy for requested style <

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0



--- Evaluating Factuality (NLI) ---


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Style Model Average Entailment Score for <neutral>: 0.8971
Style Model Average Entailment Score for <punchy>: 0.8951
Baseline Model Average Entailment Score: 0.9250

Evaluation Complete!


In [4]:
results_df.sample(10)

Unnamed: 0,snippet,ref_neutral,ref_punchy,gen_neutral,gen_punchy,gen_baseline
28,Young gymnast Maxine Bondoc aims high as she g...,Maxine Bondoc prepares for Junior World Champi...,Dream big: Maxine Bondoc eyes gymnastics glory...,Maxine Bondoc gears up to compete at Junior Wo...,Maxine Bondoc gears up to compete at Junior Wo...,Young gymnast Maxine Bondoc aims high as she g...
31,The 2023 FIBA World Cup saw record-breaking at...,FIBA World Cup games in PH see high attendance,PH fans flood the arenas — World Cup crowds se...,PH’s FIBA World Cup fever continues: record-br...,Record-breaking attendance at FIBA World Cup 2023,Record-breaking World Cup attendance: Philippi...
11,Far Eastern University’s men’s volleyball team...,FEU men’s volleyball beats NU in big UAAP upset,FEU shocks champs: Tamaraws stun NU in straigh...,Far Eastern defeats NU in men’s volleyball,Far Eastern stuns NU in men’s volleyball,Far Eastern defeats NU in men’s volleyball
12,Sandro Muhlach filed a sexual assault complain...,Actor Sandro Muhlach accuses former GMA contra...,Sandro Muhlach breaks silence: Files shocking ...,Sandro Muhlach files sexual assault complaint ...,Sandro Muhlach files sexual assault complaint ...,Sandro Muhlach files sexual assault complaint ...
15,Actress Bea Alonzo opened up about how 2024 wa...,Bea Alonzo reflects on a challenging 2024,Bea Alonzo bares her soul — calls 2024 the har...,Bea Alonzo reflects on her toughest year yet,Bea Alonzo reflects on her toughest year yet,Actress Bea Alonzo opens up about how 2024 was...
38,Bella Belen of NU leads the MVP race in UAAP w...,Bella Belen tops stats for MVP in first round ...,Belen blazing: NU’s ace spiker leads MVP race ...,Bella Belen leads MVP race in UAAP women’s vol...,Bella Belen leads MVP race in UAAP women’s vol...,Bella Belen leads MVP race in UAAP women’s vol...
10,The UAAP named five “Athletes of the Year” for...,"UAAP announces five Athletes of the Year, tigh...",UAAP shakes things up: 5 Athletes of the Year ...,Five UAAP Athletes of the Year recognized for ...,Five UAAP athletes named “Athletes of the Year...,UAAP names five “Athletes of the Year” for Sea...
44,Alden Richards revealed he battled deep depres...,Alden Richards opens up about mental health st...,Alden Richards admits: “2024 was my lowest yea...,Alden Richards reveals he battled depression i...,Alden Richards reveals he battled depression i...,Alden Richards reveals he battled depression i...
53,The Police Criminal Investigation and Detecti...,College Dean Arrested in Maguindanao del Sur O...,Maguindanao Dean Nabbed for Murder of Village ...,College Dean Arrested in Maguindanao Murder Case,College Dean Arrested in Maguindanao Murder Case,CIDG Arrests College Dean in Maguindanao for M...
26,Vice Ganda expressed pride in his partner Ion ...,Vice Ganda proud of Ion Perez’s growth and con...,Vice beams over Ion’s new confidence—says part...,Vice Ganda gushes over partner Ion Perez’s gro...,Vice Ganda gushes over Ion Perez’s growth in s...,Vice also shared how Perez positively influenc...


In [5]:
sum(results_df["gen_neutral"] == results_df["gen_punchy"])

46