In [1]:
import torch
import pandas as pd
import evaluate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from peft import PeftModel, PeftConfig
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Setup
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BASE_MODEL_ID = "facebook/bart-base"
STYLE_ADAPTER_PATH = "../output/style_model"

# 1. Load Resources
tokenizer = AutoTokenizer.from_pretrained(STYLE_ADAPTER_PATH)
base_model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL_ID)
base_model.resize_token_embeddings(len(tokenizer))
model = PeftModel.from_pretrained(base_model, STYLE_ADAPTER_PATH)
model.to(DEVICE)
model.eval()

# Load Test Data
test_df = pd.read_csv('../data/processed/test.csv')

# 2. Generation Loop
print("Generating headlines...")
results = []
styles = ['neutral', 'punchy']
style_tokens = {'neutral': '<neutral>', 'punchy': '<punchy>'}

for idx, row in tqdm(test_df.iterrows(), total=len(test_df)):
    snippet = row['snippet']
    
    entry = {'snippet': snippet, 'ref_neutral': row['neutral'], 
             'ref_punchy': row['punchy']}
    
    # --- 1. Style-Controlled Model Generation ---
    for style in styles:
        input_text = f"{style_tokens[style]} {snippet}"
        inputs = tokenizer(input_text, return_tensors="pt").to(DEVICE)
        
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=64) # Uses PeftModel 'model'
        
        gen_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        entry[f'gen_{style}'] = gen_text
        
    # --- 2. Baseline Model Generation ---
    # Baseline uses just the snippet as input, no style token
    baseline_inputs = tokenizer(snippet, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        baseline_outputs = base_model.generate(**baseline_inputs, max_new_tokens=64) # Uses base_model
    
    baseline_text = tokenizer.decode(baseline_outputs[0], skip_special_tokens=True)
    entry['gen_baseline'] = baseline_text
    
    results.append(entry)

results_df = pd.DataFrame(results)

# 3. Metric 1: ROUGE Score
rouge = evaluate.load('rouge')
print("\n--- ROUGE Scores ---")

# Style-Controlled Model Scores (Existing)
for style in styles:
    refs = results_df[f'ref_{style}'].tolist()
    preds = results_df[f'gen_{style}'].tolist()
    score = rouge.compute(predictions=preds, references=refs)
    print(f"Style Model <{style.upper()}>: {score}")

# Baseline Model Scores (NEW)
baseline_preds = results_df['gen_baseline'].tolist()

# Compare Baseline against NEUTRAL reference
refs_neutral = results_df['ref_neutral'].tolist()
score_baseline_neutral = rouge.compute(predictions=baseline_preds, references=refs_neutral)
print(f"Baseline Model vs. REF_NEUTRAL: {score_baseline_neutral}")

# Compare Baseline against PUNCHY reference
refs_punchy = results_df['ref_punchy'].tolist()
score_baseline_punchy = rouge.compute(predictions=baseline_preds, references=refs_punchy)
print(f"Baseline Model vs. REF_PUNCHY: {score_baseline_punchy}")


# 4. Metric 2: Style Accuracy (Simple Proxy Classifier)
# We train a quick classifier on the TRAIN set to act as our evaluator
print("\n--- Training Proxy Style Classifier ---")
train_df = pd.read_csv('../data/processed/train.csv')
train_texts = []
train_labels = []

# Unpack training data for classifier
for _, row in train_df.iterrows():
    train_texts.extend([row['neutral'], row['punchy']])
    train_labels.extend(['neutral', 'punchy'])

# 1. Split the human data for Classifier Training/Testing
X_classifier_train, X_classifier_test, y_classifier_train, y_classifier_test = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(ngram_range=(1,3))
X_train_vec = vectorizer.fit_transform(X_classifier_train)
X_test_vec = vectorizer.transform(X_classifier_test)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vec, y_classifier_train)

# 2. Calculate Classifier Accuracy on its own test set
y_pred = clf.predict(X_test_vec)
classifier_acc = accuracy_score(y_classifier_test, y_pred)

print(f"Proxy Style Classifier Accuracy (on human data): {classifier_acc:.2%}")

# Evaluate Generated Headlines
# Evaluate Generated Headlines
print("\n--- Evaluating Style Accuracy ---")

# Style-Controlled Model Evaluation (Existing)
for style in styles:
    gen_texts = results_df[f'gen_{style}'].tolist()
    X_test = vectorizer.transform(gen_texts)
    preds = clf.predict(X_test)
    
    # Calculate how many matched the requested style
    matches = [1 if p == style else 0 for p in preds]
    acc = sum(matches) / len(matches)
    print(f"Style Model Accuracy for requested style <{style}>: {acc:.2%}")

# Baseline Model Evaluation (NEW)
baseline_gen_texts = results_df['gen_baseline'].tolist()
X_test_baseline = vectorizer.transform(baseline_gen_texts)
preds_baseline = clf.predict(X_test_baseline)

# Calculate the distribution of predicted styles for the baseline
from collections import Counter
baseline_style_distribution = Counter(preds_baseline)
total_count = len(preds_baseline)

print("\nBaseline Model Style Distribution:")
for style, count in baseline_style_distribution.items():
    print(f"  Predicted <{style}>: {count / total_count:.2%}")

# The baseline model isn't requested to be a specific style, so you evaluate its bias.

# 5. Metric 3: Factuality (NLI)
# Using a small NLI model to check entailment
print("\n--- Evaluating Factuality (NLI) ---")
nli_pipeline = pipeline("text-classification", model="roberta-large-mnli", device=0 if DEVICE=="cuda" else -1)

def get_entailment_score(premise, hypothesis):
    # NLI input format: "Premise </s></s> Hypothesis" (model specific, but pipeline handles pairs usually)
    # roberta-large-mnli labels: CONTRADICTION, NEUTRAL, ENTAILMENT
    # We pass text_pair to pipeline
    result = nli_pipeline({'text': premise, 'text_pair': hypothesis})
    # We want to know if it is NOT contradiction, or strictly entailment
    # Let's track Entailment probability
    return result['score'] if result['label'] == 'ENTAILMENT' else 0.0

print("\n--- Evaluating Factuality (NLI) ---")

# Style-Controlled Model Evaluation (Existing)
for style in styles:
    scores = []
    for _, row in results_df.iterrows():
        score = get_entailment_score(row['snippet'], row[f'gen_{style}'])
        scores.append(score)
    avg_fact = sum(scores)/len(scores)
    print(f"Style Model Average Entailment Score for <{style}>: {avg_fact:.4f}")

# Baseline Model Evaluation (NEW)
baseline_scores = []
for _, row in results_df.iterrows():
    score = get_entailment_score(row['snippet'], row['gen_baseline'])
    baseline_scores.append(score)
avg_fact_baseline = sum(baseline_scores)/len(baseline_scores)
print(f"Baseline Model Average Entailment Score: {avg_fact_baseline:.4f}")

print("\nEvaluation Complete!")

  from .autonotebook import tqdm as notebook_tqdm
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Generating headlines...


100%|██████████| 64/64 [00:45<00:00,  1.39it/s]



--- ROUGE Scores ---
Style Model <NEUTRAL>: {'rouge1': np.float64(0.502532105701431), 'rouge2': np.float64(0.2956227165714742), 'rougeL': np.float64(0.4693998035584849), 'rougeLsum': np.float64(0.4721227249340175)}
Style Model <PUNCHY>: {'rouge1': np.float64(0.363915789401218), 'rouge2': np.float64(0.14117201790514883), 'rougeL': np.float64(0.330163377708813), 'rougeLsum': np.float64(0.33049958740082525)}
Baseline Model vs. REF_NEUTRAL: {'rouge1': np.float64(0.5066343227906458), 'rouge2': np.float64(0.2925035083656353), 'rougeL': np.float64(0.4757419219694454), 'rougeLsum': np.float64(0.4777165100941265)}
Baseline Model vs. REF_PUNCHY: {'rouge1': np.float64(0.3594548868017611), 'rouge2': np.float64(0.13046980835093194), 'rougeL': np.float64(0.3261771492790007), 'rougeLsum': np.float64(0.32527948570223325)}

--- Training Proxy Style Classifier ---
Proxy Style Classifier Accuracy (on human data): 57.63%

--- Evaluating Style Accuracy ---
Style Model Accuracy for requested style <neutral

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



--- Evaluating Factuality (NLI) ---
Style Model Average Entailment Score for <neutral>: 0.9050
Style Model Average Entailment Score for <punchy>: 0.9050
Baseline Model Average Entailment Score: 0.9415

Evaluation Complete!


In [15]:
results_df.sample(10)

Unnamed: 0,snippet,ref_neutral,ref_punchy,gen_neutral,gen_punchy,gen_baseline
8,Rappler highlights the rise of Filipina athlet...,Filipina athletes shine in 2022 sports season,Reigning queens: Filipina athletes dominate 2022,"Filipinas rise: Hidilyn Diaz dominates, Alex E...","Filipinas rise: Hidilyn Diaz dominates, Alex E...",Rappler highlights the rise of Filipina athlet...
55,Tropical Depression Verbena made landfall over...,Tropical Depression Verbena Makes Landfall in ...,Verbena Hits Surigao del Sur,V Verbena Makes landfall in Agusan del Norte,V Verbena Makes landfall in Agusan del Norte,Tropical Depression Verbena makes landfall ove...
47,"In a knockout-phase clash in the PVL, Myla Pab...",Bagyong Pablo leads Petro Gazz past Creamline ...,Bagyong Pablo storms in: Petro Gazz knocks out...,Myla Pablo shines in PVL knockout-phase clash ...,Myla Pablo shines in PVL knockout-phase clash ...,Myla Pablo shines in PVL knockout-phase clash
51,Former Eat Bulaga co-hosts such as Cindy Kurle...,A look at former Eat Bulaga hosts and their ca...,Dabarkads then and now: Former Eat Bulaga host...,"Cindy Kurleto, Nova Villa, and Mickey Ferriols...","Cindy Kurleto, Nova Villa, and Mickey Ferriols...",Former Eat Bulaga co-hosts reminisce about the...
18,A town councilor died in a road crash along th...,Zamboanga del Norte Town Councilor Killed in R...,Tragedy: Town Councilor Dies in Zamboanga Road...,Town Councilor Dies in Road Crash in Zamboanga...,Town Councilor Dies in Road Crash in Zamboanga...,A town councilor dies in road crash in Zamboan...
53,The Police Criminal Investigation and Detecti...,College Dean Arrested in Maguindanao del Sur O...,Maguindanao Dean Nabbed for Murder of Village ...,CIDG arrests college dean in Maguindanao del S...,CIDG arrests college dean in Maguindanao del S...,CIDG Arrests College Dean in Maguindanao del Sur
9,The Trillion Peso March Movement (TPMM) is set...,Organizers Expect Higher Turnout for November ...,Corruption Outrage: Bigger Turnout Expected fo...,TPMM Set to Hold Another Rally Against Corruption,TPMM Set to Hold Another Rally Against Corruption,TPMM to hold another rally against corruption
21,FEU’s middle blockers and role players stepped...,FEU climbs to second in UAAP women’s volleybal...,FEU surges: middle blockers power them to No. ...,FEU’s middle blockers and role players shine i...,FEU’s middle blockers and role players shine i...,FEU’s middle blockers and role players shine i...
43,An activist fishers’ group criticized the gove...,Fishers' Group Criticizes Three-Month Closed F...,"Fishers Slam Visayan Sea Ban, Warn of Hunger a...",Fishers’ Groups Slam Visayan Sea Fishing Season,Fishers’ Groups Slam Visayan Sea Fishing Season,An activist fishers’ group slams the Visayan S...
25,The Philippines has been awarded hosting right...,PH named host for 2027 FIBA Women’s Asia Cup,Women’s hoops power: PH to host FIBA Asia Cup ...,PH gets hosting rights for FIBA Women’s Asia C...,PH gets hosting rights for FIBA Women’s Asia C...,The Philippines has been awarded hosting right...


In [2]:
sum(results_df["gen_neutral"] == results_df["gen_punchy"])

64