In [None]:
import torch
import pandas as pd
import evaluate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from peft import PeftModel, PeftConfig
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Setup
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BASE_MODEL_ID = "google/flan-t5-base"
STYLE_ADAPTER_PATH = "../output/style_model_FlanT5"

# 1. Load Resources
tokenizer = AutoTokenizer.from_pretrained(STYLE_ADAPTER_PATH)
base_model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL_ID)
base_model.resize_token_embeddings(len(tokenizer))
model = PeftModel.from_pretrained(base_model, STYLE_ADAPTER_PATH)
model.to(DEVICE)
model.eval()

# Load Test Data
test_df = pd.read_csv('../data/processed/test.csv')

# 2. Generation Loop
print("Generating headlines...")
results = []
styles = ['neutral', 'punchy']
style_tokens = {'neutral': '<neutral>', 'punchy': '<punchy>'}

for idx, row in tqdm(test_df.iterrows(), total=len(test_df)):
    snippet = row['snippet']
    
    entry = {'snippet': snippet, 'ref_neutral': row['neutral'], 
             'ref_punchy': row['punchy']}
    
    # --- 1. Style-Controlled Model Generation ---
    for style in styles:
        input_text = f"{style_tokens[style]} {snippet}"
        inputs = tokenizer(input_text, return_tensors="pt").to(DEVICE)
        
        with torch.no_grad():
            if style == 'punchy':
                outputs = model.generate(
                    **inputs, 
                    max_new_tokens=64,
                    do_sample=True,             # Turn on sampling
                    top_p=0.9,                  # Nucleus sampling (most common)
                    temperature=0.8,            # Use a high temperature for creativity
                )
            else: # neutral style, stick to high-fidelity generation
                outputs = model.generate(
                    **inputs, 
                    max_new_tokens=64,
                    num_beams=4,
                )
        
        gen_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        entry[f'gen_{style}'] = gen_text
        
    # --- 2. Baseline Model Generation ---
    # Baseline uses just the snippet as input, no style token
    baseline_inputs = tokenizer(snippet, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        baseline_outputs = base_model.generate(**baseline_inputs, max_new_tokens=64) # Uses base_model
    
    baseline_text = tokenizer.decode(baseline_outputs[0], skip_special_tokens=True)
    entry['gen_baseline'] = baseline_text
    
    results.append(entry)

results_df = pd.DataFrame(results)

# 3. Metric 1: ROUGE Score
rouge = evaluate.load('rouge')
print("\n--- ROUGE Scores ---")

# Style-Controlled Model Scores (Existing)
for style in styles:
    refs = results_df[f'ref_{style}'].tolist()
    preds = results_df[f'gen_{style}'].tolist()
    score = rouge.compute(predictions=preds, references=refs)
    print(f"Style Model <{style.upper()}>: {score}")

# Baseline Model Scores (NEW)
baseline_preds = results_df['gen_baseline'].tolist()

# Compare Baseline against NEUTRAL reference
refs_neutral = results_df['ref_neutral'].tolist()
score_baseline_neutral = rouge.compute(predictions=baseline_preds, references=refs_neutral)
print(f"Baseline Model vs. REF_NEUTRAL: {score_baseline_neutral}")

# Compare Baseline against PUNCHY reference
refs_punchy = results_df['ref_punchy'].tolist()
score_baseline_punchy = rouge.compute(predictions=baseline_preds, references=refs_punchy)
print(f"Baseline Model vs. REF_PUNCHY: {score_baseline_punchy}")


# 4. Metric 2: Style Accuracy (Simple Proxy Classifier)
# We train a quick classifier on the TRAIN set to act as our evaluator
print("\n--- Training Proxy Style Classifier ---")
train_df = pd.read_csv('../data/processed/train.csv')
train_texts = []
train_labels = []

# Unpack training data for classifier
for _, row in train_df.iterrows():
    train_texts.extend([row['neutral'], row['punchy']])
    train_labels.extend(['neutral', 'punchy'])

# 1. Split the human data for Classifier Training/Testing
X_classifier_train, X_classifier_test, y_classifier_train, y_classifier_test = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(ngram_range=(1,5))
X_train_vec = vectorizer.fit_transform(X_classifier_train)
X_test_vec = vectorizer.transform(X_classifier_test)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vec, y_classifier_train)

# 2. Calculate Classifier Accuracy on its own test set
y_pred = clf.predict(X_test_vec)
classifier_acc = accuracy_score(y_classifier_test, y_pred)

print(f"Proxy Style Classifier Accuracy (on human data): {classifier_acc:.2%}")

# Evaluate Generated Headlines
# Evaluate Generated Headlines
print("\n--- Evaluating Style Accuracy ---")

# Style-Controlled Model Evaluation (Existing)
for style in styles:
    gen_texts = results_df[f'gen_{style}'].tolist()
    X_test = vectorizer.transform(gen_texts)
    preds = clf.predict(X_test)
    
    # Calculate how many matched the requested style
    matches = [1 if p == style else 0 for p in preds]
    acc = sum(matches) / len(matches)
    print(f"Style Model Accuracy for requested style <{style}>: {acc:.2%}")

# Baseline Model Evaluation (NEW)
baseline_gen_texts = results_df['gen_baseline'].tolist()
X_test_baseline = vectorizer.transform(baseline_gen_texts)
preds_baseline = clf.predict(X_test_baseline)

# Calculate the distribution of predicted styles for the baseline
from collections import Counter
baseline_style_distribution = Counter(preds_baseline)
total_count = len(preds_baseline)

print("\nBaseline Model Style Distribution:")
for style, count in baseline_style_distribution.items():
    print(f"  Predicted <{style}>: {count / total_count:.2%}")

# The baseline model isn't requested to be a specific style, so you evaluate its bias.

# 5. Metric 3: Factuality (NLI)
# Using a small NLI model to check entailment
print("\n--- Evaluating Factuality (NLI) ---")
nli_pipeline = pipeline("text-classification", model="roberta-large-mnli", device=0 if DEVICE=="cuda" else -1)

def get_entailment_score(premise, hypothesis):
    # NLI input format: "Premise </s></s> Hypothesis" (model specific, but pipeline handles pairs usually)
    # roberta-large-mnli labels: CONTRADICTION, NEUTRAL, ENTAILMENT
    # We pass text_pair to pipeline
    result = nli_pipeline({'text': premise, 'text_pair': hypothesis})
    # We want to know if it is NOT contradiction, or strictly entailment
    # Let's track Entailment probability
    return result['score'] if result['label'] == 'ENTAILMENT' else 0.0

print("\n--- Evaluating Factuality (NLI) ---")

# Style-Controlled Model Evaluation (Existing)
for style in styles:
    scores = []
    for _, row in results_df.iterrows():
        score = get_entailment_score(row['snippet'], row[f'gen_{style}'])
        scores.append(score)
    avg_fact = sum(scores)/len(scores)
    print(f"Style Model Average Entailment Score for <{style}>: {avg_fact:.4f}")

# Baseline Model Evaluation (NEW)
baseline_scores = []
for _, row in results_df.iterrows():
    score = get_entailment_score(row['snippet'], row['gen_baseline'])
    baseline_scores.append(score)
avg_fact_baseline = sum(baseline_scores)/len(baseline_scores)
print(f"Baseline Model Average Entailment Score: {avg_fact_baseline:.4f}")

print("\nEvaluation Complete!")

Generating headlines...


  0%|          | 0/64 [00:01<?, ?it/s]


AttributeError: type object 'T5ForConditionalGeneration' has no attribute 'transformers-community/group-beam-search'

In [4]:
results_df.sample(10)

Unnamed: 0,snippet,ref_neutral,ref_punchy,gen_neutral,gen_punchy,gen_baseline
5,The Senate has approved a 10-fold increase in ...,Senate Approves P1 Billion Budget Increase for...,Senate Boosts Tourism: P1 Billion for DOT Bran...,Senate approves 10-fold increase in DOT budget...,"DOT budget increases 10-fold for branding, mar...",Senate approves 10-fold increase in DOT budget...
62,The Philippine men’s volleyball team swept Afg...,PH men’s volleyball beats Afghanistan for firs...,Spikers make history: PH sweeps Afghanistan in...,Philippine men’s volleyball sweeps Afghanistan...,Philippines volleyball team wins Asiad in Afgh...,Philippine men’s volleyball sweeps Afghanistan...
0,By the time he stepped down from the country’s...,Former President Duterte's Net Worth Increased...,P13.2M Richer: Duterte's Net Worth Jumps After...,Rodrigo Duterte’s net worth stands at P37.305 ...,Rodrigo Duterte’s Net Worth Hits P37.305 Milli...,Rodrigo Duterte’s net worth rises to P37.305 m...
22,Organizers called on the Filipino spirit and u...,Campaign encourages national pride for PH FIBA...,PH rallies together — “Unite” for basketball’s...,FIBA World Cup organizers call on Filipino spi...,"PH 'Unite': Filipino spirit, unity boost FIBA ...",PH’s Unite: Filipinos to shine in FIBA World C...
61,Senator Jinggoy Estrada believes the K to 12 ...,Senator Estrada Expresses View that K to 12 Pr...,Estrada Slams K to 12: 'It's a Failure!',Jinggoy Estrada says K to 12 education program...,Estrada dreads K-12 education program: No sanity,Estrada: K to 12 education program is a failure
31,The 2023 FIBA World Cup saw record-breaking at...,FIBA World Cup games in PH see high attendance,PH fans flood the arenas — World Cup crowds se...,2023 FIBA World Cup sees record attendance,PH fans hold record attendance in FIBA World C...,PH’s FIBA World Cup attendance surges: PH’s PH...
43,An activist fishers’ group criticized the gove...,Fishers' Group Criticizes Three-Month Closed F...,"Fishers Slam Visayan Sea Ban, Warn of Hunger a...",Activist fishers’ group criticizes Visayan Sea...,Activist fishers’ group criticizes Visayan Sea...,Activist fishers’ group criticizes Visayan Sea...
56,ABS-CBN and GMA stars made surprising crossove...,ABS-CBN and GMA actors now cross networks in n...,Network rivals no more — ABS-CBN & GMA stars u...,ABS-CBN and GMA stars make surprising crossove...,ABS-CBN & GMA stars make dramatic crossovers a...,ABS-CBN and GMA stars make surprising crossove...
27,The “Homestretch” episode highlighted the 2025...,“Homestretch” episode covers Batang Pinoy 2025...,Youth in motion: Batang Pinoy 2025 featured in...,Homestretch spotlights Batang Pinoy sports in ...,Homestretch highlights Batang Pinoy sports in ...,Homestretch highlights Batang Pinoy sports in ...
1,Kanlaon Volcano on Negros Island released ash ...,Kanlaon Volcano Records New Ash Emission,Kanlaon Rumbles Anew: Ash Emission Reported,Kanlaon Volcano Releases Ash Again in Negros I...,Kanlaon Volcano Releases Ash Again in Negros I...,Kanlaon Volcano Releases Ash Again in Negros I...


In [5]:
sum(results_df["gen_neutral"] == results_df["gen_punchy"])

5