# NL2Declare: External Validation — Part A (Comparison to van der Aa 2019)

This notebook evaluates our fine-tuned model against the state-of-the-art approach by van der Aa (2019) on two datasets (V1 and V2), following the paper:
- Templates considered: Init, End, Precedence, Response
- Reported metric: Template accuracy (per-template and overall)
- Inference format: QA-style prompt with `max_new_tokens=15`
- Only fine-tuned adapters are supported here



In [None]:
# %pip install -e ..[train]
import os, sys, csv
from pathlib import Path
# Locate the repository root (one level above the directory holding this
# notebook/script) and make `src/` importable.
# `__file__` is not defined inside a Jupyter kernel, so fall back to the
# current working directory, which is typically the notebook's own folder.
try:
    _notebook_dir = Path(__file__).resolve().parent
except NameError:
    _notebook_dir = Path.cwd().resolve()
repo_root = _notebook_dir.parent
_src_dir = str(repo_root / "src")
if _src_dir not in sys.path:  # re-running the cell must not pile up duplicates
    sys.path.append(_src_dir)

from text2declare.inference import load_peft_model, predict_constraints_qa
from text2declare.parsing import extract_first_constraint

# --- Run configuration -------------------------------------------------------
# Hugging Face access token, read from the environment ("" when unset).
HF_TOKEN = os.getenv("HF_TOKEN", "")
# Base model the adapter is loaded on top of.
MODEL_ID = "google/gemma-7b"
# Fine-tuned adapter location; change to your adapter path.
_adapter_path = repo_root / "outputs/external_ft/adapter"
ADAPTER_DIR = str(_adapter_path)

# Choose dataset: V1 or V2 (both contain only Init, End, Precedence, Response).
# Swap the active TEST_CSV line to switch datasets.
_baseline_dir = repo_root / "data/external_validation/baseline_declare_extraction_van_der_Aa_2019"
TEST_CSV = str(_baseline_dir / "test_set_V1.csv")
# TEST_CSV = str(_baseline_dir / "test_set_V2.csv")

# Output directory for our LLM predictions (Gemma7), mirroring the repo
# structure; created up front so later cells can write into it.
RESULTS_DIR = repo_root / "external_validation/part_a/results"
RESULTS_DIR.mkdir(exist_ok=True, parents=True)


In [None]:
# Load the fine-tuned adapter on top of the base model. An explicit raise is
# used instead of `assert` so the check is not stripped under `python -O`.
if not ADAPTER_DIR:
    raise ValueError("ADAPTER_DIR must point to a fine-tuned adapter directory")
model, tokenizer = load_peft_model(MODEL_ID, ADAPTER_DIR, hf_token=HF_TOKEN)

# Read sentences (column 0) and gold constraints (column 1) from the test set.
# The explicit encoding keeps the read independent of the platform locale
# (assumes the CSV is UTF-8 — confirm against the dataset files).
sentences, gold = [], []
with open(TEST_CSV, newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    header = next(reader, None)  # header row; None instead of StopIteration on an empty file
    for row in reader:
        if not row:
            continue  # csv.reader yields [] for blank lines; nothing to evaluate
        sentences.append(row[0])
        gold.append(row[1])

raw_outputs = predict_constraints_qa(model, tokenizer, sentences, max_new_tokens=15)
# A falsy parser result is stored as "" so `parsed` stays aligned with `sentences`.
parsed = [extract_first_constraint(o) or "" for o in raw_outputs]

# Write predictions for archival under part_a/results.
# NOTE(review): the second column is labelled "Output (raw)" but holds the
# parsed constraint; the header is kept as-is in case downstream tooling reads it.
out_name = "results_V1.csv" if TEST_CSV.endswith("V1.csv") else "results_V2.csv"
with open(RESULTS_DIR / out_name, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["Text description", "Output (raw)"])
    w.writerows(zip(sentences, parsed))

# Notebook display: number of predictions and a short preview.
len(parsed), parsed[:5]


In [None]:
# Template accuracy (Init, End, Response, Precedence), per template and overall.
# A prediction is correct when the text before its first "(" equals the text
# before the first "(" of the gold constraint; an empty prediction never matches
# a non-empty gold template.
from collections import defaultdict

per_templ_total = defaultdict(int)
per_templ_correct = defaultdict(int)

for gold_constraint, predicted_constraint in zip(gold, parsed):
    gold_template = gold_constraint.partition("(")[0]
    predicted_template = predicted_constraint.partition("(")[0]
    per_templ_total[gold_template] += 1
    if predicted_template == gold_template:
        per_templ_correct[gold_template] += 1

overall_total = sum(per_templ_total.values())
overall_correct = sum(per_templ_correct.values())

per_templ_acc = {
    templ: (per_templ_correct[templ] / seen if seen else 0.0)
    for templ, seen in per_templ_total.items()
}
if overall_total:
    overall_acc = overall_correct / overall_total
else:
    overall_acc = 0.0

# Report, labelled with the dataset version inferred from the CSV file name.
if TEST_CSV.endswith("V1.csv"):
    dataset_version = "V1"
else:
    dataset_version = "V2"
print(f"=== Template Accuracy Results (Dataset {dataset_version}) ===")
print(f"Overall Template Accuracy: {overall_acc:.3f} ({overall_correct}/{overall_total})")
print("\nPer-template accuracy:")
for templ_name in sorted(per_templ_acc):
    hits = per_templ_correct[templ_name]
    seen_count = per_templ_total[templ_name]
    print(f"  {templ_name}: {per_templ_acc[templ_name]:.3f} ({hits}/{seen_count})")

print(f"\nPredictions saved to: {RESULTS_DIR / out_name}")
print("For comparison with saved outputs, use: python analysis/external_part_a_eval.py")
