# LLM Security Results Viewer

This notebook loads evaluation summaries from `outputs/` (`summary_table.csv` when available, otherwise the `summary_*.json` files) and shows tables and plots.
Run cells from top to bottom. It does **not** run training or evaluation; it only visualizes existing results.


In [None]:
from pathlib import Path
import json
from IPython.display import display

try:
    import pandas as pd
except Exception:
    pd = None

# Load evaluation results into `df`.
#
# Source priority: `outputs/summary_table.csv` (only when pandas is importable),
# otherwise every `outputs/summary_*.json` flattened to one row per model.
# `df` is always assigned -- a DataFrame when pandas is available, a plain list
# of row dicts otherwise -- so later cells never hit a NameError on Run All.
base = Path(".")
outputs = base / "outputs"
summary_csv = outputs / "summary_table.csv"

# Column layout of the rows built from the JSON summaries. Passing it to the
# DataFrame constructor keeps the column names present even with zero rows.
SUMMARY_COLUMNS = [
    "experiment_tag",
    "summary_file",
    "model",
    "defense",
    "config",
    "injection_success_rate",
    "benign_refusal_rate",
    "avg_latency_s",
]
_SUMMARY_PREFIX = "summary_"


def _rows_from_summary(path, data):
    """Flatten one parsed summary document into a list of per-model row dicts.

    Args:
        path: Path of the summary file; used for the fallback experiment tag
            and recorded in the ``summary_file`` column.
        data: Parsed JSON object of that file.

    Returns:
        One dict per entry of ``data["models"]`` (empty list when there is none).
    """
    # Strip only the leading prefix; str.replace would also remove any later
    # occurrence of "summary_" inside the experiment name.
    tag = data.get("experiment_tag") or path.stem[len(_SUMMARY_PREFIX):]
    defense = data.get("defense", "")
    config = data.get("config", "")
    result = []
    # `or {}` tolerates keys that are present but null in the JSON.
    for model_name, stats in (data.get("models") or {}).items():
        stats = stats or {}
        inj = stats.get("injection") or {}
        ben = stats.get("benign") or {}
        overall = stats.get("overall") or {}
        result.append(
            {
                "experiment_tag": tag,
                "summary_file": str(path),
                "model": model_name,
                "defense": defense,
                "config": config,
                # NOTE(review): absent metrics are reported as 0.0, which cannot
                # be told apart from a measured zero in the tables below.
                "injection_success_rate": inj.get("attack_success_rate", 0.0),
                "benign_refusal_rate": ben.get("refusal_rate", 0.0),
                "avg_latency_s": overall.get("avg_latency_s", 0.0),
            }
        )
    return result


rows = []
if not outputs.exists():
    print("No outputs directory found. Run evaluations first.")
    df = pd.DataFrame(rows, columns=SUMMARY_COLUMNS) if pd is not None else rows
else:
    if summary_csv.exists() and pd is not None:
        df = pd.read_csv(summary_csv)
    else:
        for path in sorted(outputs.glob(f"{_SUMMARY_PREFIX}*.json")):
            try:
                with path.open("r", encoding="utf-8") as f:
                    data = json.load(f)
            except (OSError, json.JSONDecodeError) as exc:
                # A summary may be unreadable or half-written; report it and
                # keep the remaining results viewable.
                print(f"Skipping unreadable summary {path}: {exc}")
                continue
            rows.extend(_rows_from_summary(path, data))
        if not rows:
            print("No summary files found in outputs/. Run evaluations first.")
        df = pd.DataFrame(rows, columns=SUMMARY_COLUMNS) if pd is not None else rows

    display(df)


In [None]:
# Summary tables: per-model metrics sorted by experiment/model, followed by the
# mean of each metric per (experiment_tag, defense), highest injection success
# rate first. Every precondition is checked up front so the cell prints a
# message instead of raising NameError/KeyError when there is nothing to show.
metric_cols = ["injection_success_rate", "benign_refusal_rate", "avg_latency_s"]
cols = ["experiment_tag", "model", "defense", *metric_cols]

if pd is None:
    print("pandas not available; table displayed as raw rows above.")
elif "df" not in globals() or df is None or len(df) == 0:
    # `df` is missing when the loading cell found no outputs directory, and
    # empty when it found no summaries.
    print("No results to summarize. Run evaluations first.")
else:
    missing = [c for c in cols if c not in df.columns]
    if missing:
        # e.g. a summary_table.csv written with a different column layout.
        print(f"Cannot build summary tables; missing columns: {missing}")
    else:
        display(df[cols].sort_values(["experiment_tag", "model"]))

        display(
            df.groupby(["experiment_tag", "defense"])[metric_cols]
            .mean()
            .sort_values("injection_success_rate", ascending=False)
        )


In [None]:
from IPython.display import Image, Markdown

# Plot gallery: render every PNG under outputs/plots, each preceded by a
# heading with its file name. A missing directory and a directory without any
# PNG are reported the same way, so the cell never finishes silently.
plot_dir = outputs / "plots"
plot_files = sorted(plot_dir.glob("*.png")) if plot_dir.is_dir() else []

if not plot_files:
    print("No plots found. Run: python scripts/plot_results.py")
else:
    for img in plot_files:
        display(Markdown(f"### {img.name}"))
        display(Image(filename=str(img)))


## Run new experiments (GPU)

Use Slurm on Triton (V100 16G):

```
sbatch --export=ALL,EVAL_CONFIG=configs/eval_small_phi3.yaml,DEFENSE=none scripts/run_eval_v100.sh
sbatch --export=ALL,EVAL_CONFIG=configs/eval_small_phi3.yaml,DEFENSE=filter_prefix scripts/run_eval_v100.sh
```

Then refresh plots:

```
python scripts/plot_results.py
```
