# NL2Declare: Internal Validation

This notebook performs 10-fold cross-validation on `data/internal_validation/data_crossvalidation.csv` and compares multiple decoder-only LLMs (7–8B class plus small Llama-3.2 1B/3B) with identical training hyperparameters, as described in the paper. Training is executed per fold and per model.

In [None]:
# %pip install -e ..[train]
import os, sys
from pathlib import Path
from statistics import mean

# Locate the repository root and expose its `src/` directory for imports.
# `__file__` exists when this code runs as a script, but is undefined inside a
# notebook kernel; in that case fall back to the parent of the working
# directory (assumes the notebook is launched from its own folder, one level
# below the repo root — the same layout `%pip install -e ..[train]` relies on).
try:
    repo_root = Path(__file__).resolve().parents[1]
except NameError:
    repo_root = Path.cwd().resolve().parent

# Avoid stacking duplicate entries when the cell is re-executed.
_src_dir = str(repo_root / "src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

from text2declare.training import TrainConfig, train
from text2declare.inference import load_model, load_peft_model, predict_constraints
from text2declare.parsing import extract_first_constraint
from text2declare.evaluation import evaluate


In [None]:
# Credentials and input location only; training/evaluation settings live in the CV cell.
# HF_TOKEN falls back to an empty string when the environment variable is unset.
HF_TOKEN = os.getenv("HF_TOKEN", "")
CSV_PATH = str(repo_root / "data" / "internal_validation" / "data_crossvalidation.csv")


In [None]:
# 10-fold cross-validation (training per model and fold)
# Identical hyperparameters across models. Writes per-fold outputs to outputs/internal_cv/.

import csv
from sklearn.model_selection import KFold
import pandas as pd

# Number of cross-validation folds.
NUM_FOLDS = 10

# Hugging Face model identifiers evaluated in turn, each with the same settings.
MODEL_IDS = [
    "google/gemma-7b",
    "mistralai/Mistral-7B-v0.1",
    "tiiuae/falcon-7b",
    "meta-llama/Llama-2-7b-hf",
    "meta-llama/Meta-Llama-3-8B",
    "meta-llama/Llama-3.2-3B-Instruct",
    "meta-llama/Llama-3.2-1B-Instruct",
]

# Training hyperparameters forwarded as keyword arguments to TrainConfig.
HP = {
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 4,
    "warmup_steps": 2,
    "max_steps": 200,
    "learning_rate": 2e-4,
}

# Load the dataset; the first column supplies the texts and the second the labels.
_df = pd.read_csv(CSV_PATH)
texts, labels = (_df.iloc[:, col].tolist() for col in (0, 1))

# Shuffled splitter with a fixed seed so the fold assignment is reproducible.
kf = KFold(shuffle=True, random_state=42, n_splits=NUM_FOLDS)

def _write_csv(path, header, rows):
    """Write ``header`` followed by every row in ``rows`` to ``path`` as CSV.

    Args:
        path: Destination file; it is overwritten if it already exists.
        header: Sequence of column names written as the first row.
        rows: Iterable of row sequences written after the header.
    """
    # newline="" is what the csv module expects for files it writes; the
    # explicit UTF-8 encoding keeps the output independent of the platform
    # locale (NOTE(review): confirm the project loaders read UTF-8).
    with open(path, "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(header)
        w.writerows(rows)


# Column headers shared by the per-fold train and ground-truth test files.
_GT_HEADER = ["Text Description", "Declare Constraint"]

# The splitter is seeded, so every call to split() yields the same folds;
# compute them once and reuse them for all models.
_fold_splits = list(kf.split(texts))

# One entry per model: (model_id, mean F1 at alpha=2, mean F1 at alpha=4,
# mean template accuracy), averaged over the folds.
results = []
for model_id in MODEL_IDS:
    fold_f1_a2 = []
    fold_f1_a4 = []
    fold_tacc = []
    for fold_idx, (train_idx, test_idx) in enumerate(_fold_splits):
        # Per-model, per-fold working directory holding data, adapter and predictions.
        fold_dir = repo_root / f"outputs/internal_cv/{model_id.split('/')[-1]}_fold_{fold_idx}"
        train_csv = fold_dir / "train.csv"
        test_csv = fold_dir / "test.csv"
        pred_csv = fold_dir / "pred.csv"
        fold_dir.mkdir(parents=True, exist_ok=True)

        # Training split (headers expected by the loader) and ground truth for evaluation.
        _write_csv(train_csv, _GT_HEADER, ([texts[i], labels[i]] for i in train_idx))
        _write_csv(test_csv, _GT_HEADER, ([texts[i], labels[i]] for i in test_idx))

        cfg_fold = TrainConfig(model_id=model_id, train_csv=str(train_csv), output_dir=str(fold_dir), hf_token=HF_TOKEN, **HP)
        train(cfg_fold)

        # Load the fine-tuned adapter and run few-shot-style inference on the held-out sentences.
        model, tokenizer = load_peft_model(model_id, str(fold_dir / "adapter"), hf_token=HF_TOKEN)
        test_sentences = [texts[i] for i in test_idx]
        raw_out = predict_constraints(model, tokenizer, test_sentences, style="few_shot")
        # An output with no extractable constraint is recorded as an empty string.
        parsed = [extract_first_constraint(o) or "" for o in raw_out]

        # Drop the references so the weights can be reclaimed before the next
        # fold starts training instead of lingering until they are rebound.
        del model, tokenizer

        _write_csv(pred_csv, ["Text description", "Output (raw)"], zip(test_sentences, parsed))

        # Metrics for alpha=2 and alpha=4. Template accuracy is derived from the
        # alpha=2 run (presumably te2 is the template-error count and n2 the
        # number of evaluated items — verify against `evaluate`).
        _, _, f12, n2, te2 = evaluate(str(test_csv), str(pred_csv), alpha=2.0)
        _, _, f14, _, _ = evaluate(str(test_csv), str(pred_csv), alpha=4.0)
        tacc = 1.0 - (te2 / n2 if n2 else 0.0)
        fold_f1_a2.append(f12)
        fold_f1_a4.append(f14)
        fold_tacc.append(tacc)

    # Guard against an empty fold list so mean() never raises.
    avg_f1_a2 = mean(fold_f1_a2) if fold_f1_a2 else 0.0
    avg_f1_a4 = mean(fold_f1_a4) if fold_f1_a4 else 0.0
    avg_tacc = mean(fold_tacc) if fold_tacc else 0.0
    results.append((model_id, avg_f1_a2, avg_f1_a4, avg_tacc))

results
