# NL2Declare: External Validation — Part B (Prompting Baselines vs Fine-Tuned)

This notebook compares our fine-tuned approach (QA format) against prompting baselines on the same set of templates as in the paper (Init, End, AtMostOne, AtLeastOne, Response, Precedence, ChainResponse, ChainPrecedence, RespondedExistence, CoExistence, NotCoExistence).

We report template accuracy (and F1 where needed); the paper emphasizes template accuracy for a clean comparison.



In [None]:
# %pip install -e ..[train]
import os, sys, csv
from pathlib import Path
# Locate the repository root so the local `src` directory can be put on the
# import path. `__file__` is not defined when this code runs as a notebook
# cell, so fall back to the working directory, assumed to be the notebook's
# folder one level below the repo root (consistent with the `..` in the pip
# hint above).
try:
    repo_root = Path(__file__).resolve().parents[1]
except NameError:
    repo_root = Path.cwd().resolve().parent
_src_dir = str(repo_root / "src")
if _src_dir not in sys.path:  # avoid duplicate entries when the cell is re-run
    sys.path.append(_src_dir)

from text2declare.inference import load_peft_model, predict_constraints_qa
from text2declare.parsing import extract_first_constraint
from text2declare.evaluation import evaluate

# Hugging Face token read from the environment; empty string when unset.
HF_TOKEN = os.getenv("HF_TOKEN", "")
# Model id and adapter directory handed to load_peft_model below.
MODEL_ID = "google/gemma-7b"
ADAPTER_DIR = str(repo_root.joinpath("outputs/external_ft/adapter"))  # change to your adapter path

# External validation test set (11 templates) used for the comparison.
TEST_CSV = str(repo_root.joinpath("data/external_validation/baselines_prompting/test_data.csv"))

# Directory that receives our LLM predictions; created up front if missing.
RESULTS_DIR = repo_root.joinpath("external_validation/part_b/results")
RESULTS_DIR.mkdir(parents=True, exist_ok=True)


In [None]:
# Load the adapter, run QA-format inference over the test set, and write the
# parsed predictions to RESULTS_DIR.
if not ADAPTER_DIR:
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    raise ValueError("ADAPTER_DIR must be set to the fine-tuned adapter path")
model, tokenizer = load_peft_model(MODEL_ID, ADAPTER_DIR, hf_token=HF_TOKEN)

# Column 0 is read as the sentence; column 1, when present, as the gold label.
sentences, gold = [], []
# Explicit UTF-8 so decoding does not depend on the platform's default locale.
with open(TEST_CSV, newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    header = next(reader, None)  # skip the header row; tolerate an empty file
    for row in reader:
        if not row:  # csv.reader yields [] for blank lines
            continue
        sentences.append(row[0])
        gold.append(row[1] if len(row) > 1 else "")

raw_outputs = predict_constraints_qa(model, tokenizer, sentences, max_new_tokens=15)
# A falsy result from extract_first_constraint is normalised to "".
parsed = [extract_first_constraint(o) or "" for o in raw_outputs]

# Write predictions to part_b/results per repo convention
out_name = "results_test_data_LLM.csv"
with open(RESULTS_DIR / out_name, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["Text description", "Output (raw)"])
    for s, p in zip(sentences, parsed):
        w.writerow([s, p])

# Notebook cell display: number of predictions and a short preview.
len(parsed), parsed[:5]


In [None]:
# Quick template accuracy evaluation: a prediction counts as correct when the
# text before its first "(" equals that of the gold constraint.
from collections import defaultdict


def _template_of(constraint):
    """Return the text preceding the first "(" ("" for an empty string)."""
    return constraint.partition("(")[0]


per_templ_total = defaultdict(int)
per_templ_correct = defaultdict(int)
for gold_item, pred_item in zip(gold, parsed):
    gold_name = _template_of(gold_item)
    if not gold_name:
        # Rows without a gold template are left out of the statistics.
        continue
    per_templ_total[gold_name] += 1
    if _template_of(pred_item) == gold_name:
        per_templ_correct[gold_name] += 1

overall_total = sum(per_templ_total.values())
overall_correct = sum(per_templ_correct.values())

per_templ_acc = {}
for name, seen in per_templ_total.items():
    hits = per_templ_correct[name]
    per_templ_acc[name] = hits / seen if seen else 0.0

if overall_total:
    overall_acc = overall_correct / overall_total
else:
    overall_acc = 0.0

print(f"Template Accuracy: {overall_acc:.3f} ({overall_correct}/{overall_total})")
print("\nPer-template accuracy:")
for template in sorted(per_templ_acc):
    acc = per_templ_acc[template]
    count = per_templ_total[template]
    correct = per_templ_correct[template]
    print(f"  {template}: {acc:.3f} ({correct}/{count})")

print(f"\nPredictions saved to: {RESULTS_DIR / 'results_test_data_LLM.csv'}")
print("For comprehensive metrics comparison, use: python analysis/external_part_b_eval.py")
