In [None]:
#!/usr/bin/env python3

import argparse
import json
from pathlib import Path
from typing import Optional

import pandas as pd
from deepeval.metrics import FaithfulnessMetric, GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

from toxtempass import LLM_API_KEY, config


def build_faithfulness_metric(
    *, model: str, threshold: float, verbose_mode: bool
) -> FaithfulnessMetric:
    """Create the deepeval ``FaithfulnessMetric`` used as the base judge.

    Args:
        model: Judge model identifier forwarded to the metric.
        threshold: Threshold forwarded to the metric.
        verbose_mode: Forwarded to the metric's ``verbose_mode`` option.

    Returns:
        A ``FaithfulnessMetric`` created with ``include_reason=True``.
    """
    # Per the original author's note, the LLMTestCase handed to this metric
    # must carry a retrieval_context
    # (https://deepeval.com/docs/metrics-faithfulness), and an
    # ``evaluation_template`` argument can be added here to tweak the default
    # prompt.
    metric_options = {
        "model": model,
        "threshold": threshold,
        "verbose_mode": verbose_mode,
        "include_reason": True,
    }
    return FaithfulnessMetric(**metric_options)


def build_groundedness_policy_geval(
    *, model: str, threshold: float, verbose_mode: bool
) -> GEval:
    """Create the GEval metric that acts as a strict groundedness "policy gate".

    The evaluation steps instruct the judge to extract atomic factual claims
    from the actual output, label each one (ENTAILED / CONTRADICTED /
    UNSUPPORTED / ABSTAINED) against the retrieval context, and answer with
    JSON counts so the final score can be computed outside the judge.

    Args:
        model: Judge model identifier forwarded to ``GEval``.
        threshold: Forwarded to ``GEval``; per the original author's note it
            still drives GEval's own pass/fail next to the custom scores.
        verbose_mode: Forwarded to ``GEval``.

    Returns:
        The configured ``GEval`` instance.
    """
    # Blank entries are handed to GEval as list items of their own.
    separator = ""

    task_step = (
        "Task: evaluate whether ACTUAL_OUTPUT is grounded ONLY in RETRIEVAL_CONTEXT. No outside knowledge."
    )
    # One single step, split over two literals only for readability.
    extraction_step = (
        "Step 1 — Extract atomic factual claims from ACTUAL_OUTPUT limited to: entities, numbers, durations, units, methods, readouts, materials, cell lines, species, instruments, and endpoints. "
        + "Do NOT extract vague/subjective claims (e.g., 'robust', 'appropriate')."
    )
    labelling_steps = [
        "Step 2 — For each claim, assign exactly one label:",
        "  - ENTAILED: explicitly supported by RETRIEVAL_CONTEXT (directly stated or unambiguous paraphrase).",
        "  - CONTRADICTED: conflicts with RETRIEVAL_CONTEXT.",
        "  - UNSUPPORTED: not found in RETRIEVAL_CONTEXT and not contradicted.",
        "  - ABSTAINED: ACTUAL_OUTPUT explicitly states the information is not in the context / cannot be determined, and does not assert the missing fact.",
    ]
    output_steps = [
        "Step 3 — Return VALID JSON ONLY (no markdown, no extra text) with integer counts and an optional short audit list.",
        'Required JSON keys: {"entailed": int, "unsupported": int, "contradicted": int, "abstained": int, "total_claims": int}.',
        'Optional key: "claims": a list of up to 10 items, each with {"claim": str, "label": str, "evidence": str}. Evidence should be a short phrase from context or "N/A".',
    ]
    consistency_step = (
        "Step 4 — Ensure: total_claims = entailed + unsupported + contradicted + abstained."
    )

    policy_steps = [
        task_step,
        separator,
        extraction_step,
        separator,
        *labelling_steps,
        separator,
        *output_steps,
        separator,
        consistency_step,
    ]
    inspected_fields = [
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.RETRIEVAL_CONTEXT,
    ]

    return GEval(
        name="ToxTemp Groundedness Policy (counts)",
        evaluation_steps=policy_steps,
        evaluation_params=inspected_fields,
        model=model,
        threshold=threshold,
        verbose_mode=verbose_mode,
    )


def groundedness_policy_score(counts: dict) -> float:
    """Score groundedness with a strict contradiction gate.

    Rules:
    - Any contradicted claim => 0.0.
    - Otherwise: entailed / (entailed + unsupported).
    - Abstained claims are never read, so they stay out of the denominator.
    - No asserted claims at all (denominator 0) => 1.0 (fully compliant).

    Args:
        counts: Mapping with optional ``"entailed"``, ``"unsupported"`` and
            ``"contradicted"`` entries. Missing or null entries count as 0.

    Returns:
        The score as a float; it lies in [0.0, 1.0] for non-negative counts.
    """
    # ``or 0`` also covers an explicit JSON null, which int() would reject
    # with a TypeError.
    entailed = int(counts.get("entailed") or 0)
    unsupported = int(counts.get("unsupported") or 0)
    contradicted = int(counts.get("contradicted") or 0)

    if contradicted > 0:
        return 0.0

    asserted = entailed + unsupported
    if not asserted:
        return 1.0
    return entailed / asserted


def groundedness_herman_style_score(counts: dict) -> float:
    """Score groundedness with a soft penalty for contradictions.

    The raw value is ``(entailed - 0.5 * contradicted) / total`` where
    ``total = entailed + unsupported + contradicted``; it is then clamped to
    [0.0, 1.0]. Abstained claims are not read at all. When there are no
    asserted claims (``total == 0``) the score is 1.0.

    Args:
        counts: Mapping with optional ``"entailed"``, ``"unsupported"`` and
            ``"contradicted"`` entries. Missing or null entries count as 0.

    Returns:
        The clamped score as a float.
    """
    # ``or 0`` also covers an explicit JSON null, which int() would reject
    # with a TypeError.
    entailed = int(counts.get("entailed") or 0)
    unsupported = int(counts.get("unsupported") or 0)
    contradicted = int(counts.get("contradicted") or 0)

    total = entailed + unsupported + contradicted
    if total == 0:
        return 1.0

    raw = (entailed - 0.5 * contradicted) / total
    return max(0.0, min(1.0, raw))


def _parse_counts(reason: str | None) -> Optional[dict]:
    if not reason:
        return None
    try:
        data = json.loads(reason)
    except json.JSONDecodeError:
        return None
    return data if isinstance(data, dict) else None


def pdf_to_full_text(pdf_path: Path, *, max_pages: Optional[int] = None) -> str:
    """Convert a PDF into a single text string with per-page provenance.

    Every page that yields text is emitted as ``[<filename> p.<N>] <text>``
    and the pages are joined with a ``---`` separator line. Pages without
    extractable text are skipped.

    Args:
        pdf_path: Location of the PDF. A plain string path is tolerated and
            converted to ``Path``.
        max_pages: If given, only the first ``max_pages`` pages are read.

    Returns:
        The concatenated page texts, or ``[<filename>] (no extractable text)``
        when no page produced any text.

    Raises:
        ImportError: If ``pypdf`` is not installed.
    """
    try:
        from pypdf import PdfReader
    except ImportError as e:
        raise ImportError(
            "pypdf is required for pdf_to_full_text(). Install with: pip install pypdf"
        ) from e

    # ``.name`` is used below, so a str argument must become a Path first.
    pdf_path = Path(pdf_path)

    reader = PdfReader(str(pdf_path))
    n_pages = len(reader.pages)
    if max_pages is not None:
        n_pages = min(n_pages, max_pages)

    pages: list[str] = []
    for index in range(n_pages):
        # extract_text() may give None/empty output; normalise before testing.
        text = (reader.pages[index].extract_text() or "").strip()
        if text:
            pages.append(f"[{pdf_path.name} p.{index + 1}] {text}")

    if not pages:
        return f"[{pdf_path.name}] (no extractable text)"

    return "\n\n---\n\n".join(pages)


def match_pdf_to_csv(csv_path: Path, pdf_dir: Path) -> Optional[Path]:
    """Find the PDF that corresponds to a comparison CSV.

    The expected PDF stem is the CSV stem with one leading
    ``tier1_comparison_`` removed. An exact ``<stem>.pdf`` in ``pdf_dir`` is
    preferred; otherwise the ``*.pdf`` files in ``pdf_dir`` are compared
    case-insensitively by stem.

    Args:
        csv_path: Path of the comparison CSV (only its stem is used).
        pdf_dir: Directory that is searched for the PDF.

    Returns:
        The matching PDF path, or ``None`` if there is none.
    """
    # removeprefix strips only the leading marker; str.replace would also
    # delete later occurrences inside the document name.
    base_name = csv_path.stem.removeprefix("tier1_comparison_")

    exact_match = pdf_dir / f"{base_name}.pdf"
    if exact_match.exists():
        return exact_match

    wanted = base_name.lower()
    # Sorted so the result is deterministic if several files differ only in case.
    for pdf_file in sorted(pdf_dir.glob("*.pdf")):
        if pdf_file.stem.lower() == wanted:
            return pdf_file

    return None


def run_case(
    *,
    user_input: str,
    actual_output: str,
    retrieval_context: list[str] | str,
    judge_model: str = "gpt-5-nano",
    faithfulness_threshold: float = 0.5,
    use_geval_policy: bool = True,
    geval_threshold: float = 0.5,
    verbose_mode: bool = False,
) -> dict:
    """Evaluate one example with the faithfulness metric and the optional policy gate.

    Args:
        user_input: Stored as ``input`` of the test case.
        actual_output: Stored as ``actual_output`` of the test case.
        retrieval_context: Context passages; a single string is wrapped in a list.
        judge_model: Judge model used for both metrics.
        faithfulness_threshold: Threshold for the faithfulness metric.
        use_geval_policy: When false, the GEval policy gate is not run and
            its result entries are ``None``.
        geval_threshold: Threshold for the GEval policy gate.
        verbose_mode: Forwarded to both metrics; also prints the result dict.

    Returns:
        A dict with the keys ``faithfulness_score``, ``faithfulness_reason``,
        ``geval_policy_score``, ``geval_policy_reason``,
        ``groundedness_policy_score``, ``groundedness_herman_score`` and
        ``groundedness_counts``. The three groundedness entries are ``None``
        unless non-empty counts could be parsed from the policy reason.
    """
    contexts = (
        [retrieval_context] if isinstance(retrieval_context, str) else retrieval_context
    )
    case = LLMTestCase(
        input=user_input,
        actual_output=actual_output,
        retrieval_context=contexts,
    )

    faith_metric = build_faithfulness_metric(
        model=judge_model,
        threshold=faithfulness_threshold,
        verbose_mode=verbose_mode,
    )
    faith_metric.measure(case)
    outcome: dict = {
        "faithfulness_score": float(faith_metric.score),
        "faithfulness_reason": faith_metric.reason or "",
    }

    parsed_counts: Optional[dict] = None
    if not use_geval_policy:
        outcome["geval_policy_score"] = None
        outcome["geval_policy_reason"] = None
    else:
        gate = build_groundedness_policy_geval(
            model=judge_model,
            threshold=geval_threshold,
            verbose_mode=verbose_mode,
        )
        gate.measure(case)
        outcome["geval_policy_score"] = float(gate.score)
        outcome["geval_policy_reason"] = gate.reason or ""
        parsed_counts = _parse_counts(gate.reason)

    # An empty dict is treated like "no counts": all derived entries are None.
    if parsed_counts:
        outcome.update(
            groundedness_policy_score=groundedness_policy_score(parsed_counts),
            groundedness_herman_score=groundedness_herman_style_score(parsed_counts),
            groundedness_counts=parsed_counts,
        )
    else:
        outcome.update(
            groundedness_policy_score=None,
            groundedness_herman_score=None,
            groundedness_counts=None,
        )

    if verbose_mode:
        print("\n=== Eval Results ===")
        for key, value in outcome.items():
            print(f"{key}: {value}")

    return outcome


def add_groundedness_columns(
    *,
    csv_path: Path,
    pdf_path: Path,
    output_path: Optional[Path] = None,
    judge_model: str = "gpt-4o-mini",
    faithfulness_threshold: float = 0.5,
    geval_threshold: float = 0.5,
    verbose: bool = False,
) -> Path:
    """Add groundedness columns to a tier1_comparison CSV using a PDF as context.

    Every row is evaluated with ``run_case`` using the row's ``question`` and
    ``llm_answer`` cells and the full PDF text as retrieval context. The
    result entries are appended as columns and the frame is written back.

    Args:
        csv_path: CSV to read. A plain string path is tolerated.
        pdf_path: PDF whose text is the retrieval context. A plain string
            path is tolerated.
        output_path: Where to write the result; falls back to ``csv_path``
            (i.e. the input is overwritten) when not given.
        judge_model: Judge model forwarded to ``run_case``.
            NOTE(review): this default differs from the "gpt-5-nano" default
            of the other entry points in this file - confirm it is intended.
        faithfulness_threshold: Forwarded to ``run_case``.
        geval_threshold: Forwarded to ``run_case``.
        verbose: Forwarded to ``run_case`` as ``verbose_mode``.

    Returns:
        The path the CSV was written to.
    """
    # The annotations ask for Path objects but str paths are passed in
    # practice; normalise so downstream ``.name`` access cannot fail.
    csv_path = Path(csv_path)
    pdf_path = Path(pdf_path)
    final_path = Path(output_path) if output_path else csv_path

    def as_text(value: object) -> str:
        # Empty CSV cells are read by pandas as float NaN, not as "".
        return "" if pd.isna(value) else str(value)

    df = pd.read_csv(csv_path)
    pdf_text = pdf_to_full_text(pdf_path)

    results = [
        run_case(
            user_input=as_text(row.get("question", "")),
            actual_output=as_text(row.get("llm_answer", "")),
            retrieval_context=pdf_text,
            judge_model=judge_model,
            faithfulness_threshold=faithfulness_threshold,
            use_geval_policy=True,
            geval_threshold=geval_threshold,
            verbose_mode=verbose,
        )
        for _, row in df.iterrows()
    ]

    new_columns = (
        "faithfulness_score",
        "faithfulness_reason",
        "geval_policy_score",
        "geval_policy_reason",
        "groundedness_policy_score",
        "groundedness_herman_score",
        "groundedness_counts",
    )
    for column in new_columns:
        df[column] = [result[column] for result in results]

    df.to_csv(final_path, index=False)
    return final_path


def add_groundedness_to_directory(
    *,
    output_dir: Path,
    pdf_dir: Path,
    judge_model: str = "gpt-5-nano",
    faithfulness_threshold: float = 0.5,
    geval_threshold: float = 0.5,
    max_pdfs: Optional[int] = None,
    verbose: bool = False,
) -> None:
    """Add groundedness columns to every tier1_comparison CSV of a directory.

    Each ``tier1_comparison_*.csv`` in ``output_dir`` (in sorted order) is
    paired with a PDF from ``pdf_dir`` and updated in place. CSVs without a
    matching PDF are reported and skipped.

    Args:
        output_dir: Directory containing the comparison CSVs.
        pdf_dir: Directory containing the source PDFs.
        judge_model: Judge model forwarded to ``add_groundedness_columns``.
        faithfulness_threshold: Forwarded to ``add_groundedness_columns``.
        geval_threshold: Forwarded to ``add_groundedness_columns``.
        max_pdfs: If truthy, only the first ``max_pdfs`` CSVs are processed;
            ``None`` and ``0`` both mean "no limit".
        verbose: Forwarded to ``add_groundedness_columns``.

    Raises:
        FileNotFoundError: If ``output_dir`` has no ``tier1_comparison_*.csv``.
    """
    candidates = sorted(output_dir.glob("tier1_comparison_*.csv"))
    if len(candidates) == 0:
        raise FileNotFoundError(f"No tier1_comparison_*.csv files found in: {output_dir}")

    selected = candidates[:max_pdfs] if max_pdfs else candidates

    for comparison_csv in selected:
        source_pdf = match_pdf_to_csv(comparison_csv, pdf_dir)
        if source_pdf:
            # The CSV is rewritten in place: output_path is the input path.
            written_to = add_groundedness_columns(
                csv_path=comparison_csv,
                pdf_path=source_pdf,
                output_path=comparison_csv,
                judge_model=judge_model,
                faithfulness_threshold=faithfulness_threshold,
                geval_threshold=geval_threshold,
                verbose=verbose,
            )
            print(f"Updated: {written_to}")
        else:
            print(f"WARNING: No matching PDF found for {comparison_csv.name}, skipping")


def main(argv: Optional[list[str]] = None) -> None:
    """Command-line entry point: single-file mode or directory (batch) mode.

    Single-file mode needs ``--csv`` and ``--pdf`` and rewrites that CSV in
    place. Batch mode needs ``--output-dir`` and ``--pdf-dir``. If neither
    pair is complete the parser exits with a usage error.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``; passing a list allows calling ``main`` from
            a notebook or a test without touching the process arguments.
    """
    parser = argparse.ArgumentParser(
        description="Add groundedness columns to positive_control CSVs.",
        # Without this, argparse accepts unique prefixes of long options, so
        # a foreign flag such as a kernel launcher's ``--f=<file>`` is taken
        # as ``--faith-threshold`` and fails with "invalid float value".
        allow_abbrev=False,
    )
    parser.add_argument("--csv", type=Path, help="Path to a single tier1_comparison_*.csv")
    parser.add_argument("--pdf", type=Path, help="Path to the corresponding PDF")
    parser.add_argument("--output-dir", type=Path, help="Directory with tier1_comparison_*.csv files")
    parser.add_argument("--pdf-dir", type=Path, help="Directory with PDF inputs")
    parser.add_argument("--judge-model", default="gpt-5-nano", help="LLM judge to use")
    parser.add_argument("--faith-threshold", type=float, default=0.5, help="Faithfulness threshold")
    parser.add_argument("--geval-threshold", type=float, default=0.5, help="GEval threshold")
    parser.add_argument("--max-pdfs", type=int, default=None, help="Optional limit for batch runs")
    parser.add_argument("--verbose", action="store_true", help="Print per-row details")
    args = parser.parse_args(argv)

    # Single-file mode takes precedence when both pairs are supplied.
    if args.csv and args.pdf:
        out_path = add_groundedness_columns(
            csv_path=args.csv,
            pdf_path=args.pdf,
            output_path=args.csv,
            judge_model=args.judge_model,
            faithfulness_threshold=args.faith_threshold,
            geval_threshold=args.geval_threshold,
            verbose=args.verbose,
        )
        print(f"Updated: {out_path}")
        return

    if args.output_dir and args.pdf_dir:
        add_groundedness_to_directory(
            output_dir=args.output_dir,
            pdf_dir=args.pdf_dir,
            judge_model=args.judge_model,
            faithfulness_threshold=args.faith_threshold,
            geval_threshold=args.geval_threshold,
            max_pdfs=args.max_pdfs,
            verbose=args.verbose,
        )
        return

    parser.error("Provide --csv and --pdf, or --output-dir and --pdf-dir")


# Script entry point.
# NOTE(review): the recorded output of this cell shows the usage line of
# ipykernel_launcher.py followed by SystemExit: 2, i.e. this guard also fires
# when the cell is executed inside a notebook kernel and main() then parses
# the kernel launcher's arguments.
if __name__ == "__main__":
    main()

usage: ipykernel_launcher.py [-h] [--csv CSV] [--pdf PDF]
                             [--output-dir OUTPUT_DIR] [--pdf-dir PDF_DIR]
                             [--judge-model JUDGE_MODEL]
                             [--faith-threshold FAITH_THRESHOLD]
                             [--geval-threshold GEVAL_THRESHOLD]
                             [--max-pdfs MAX_PDFS] [--verbose]
ipykernel_launcher.py: error: argument --faith-threshold: invalid float value: '/Users/johannehouweling/Library/Jupyter/runtime/kernel-v3b939bfd0d4c5cfe84cdb75f615675376d06f090f.json'


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
def build_faithfulness_metric(
    *, model: str, threshold: float, verbose_mode: bool
) -> FaithfulnessMetric:
    """Create the deepeval ``FaithfulnessMetric`` used as the base judge.

    Args:
        model: Judge model identifier forwarded to the metric.
        threshold: Threshold forwarded to the metric.
        verbose_mode: Forwarded to the metric's ``verbose_mode`` option.

    Returns:
        A ``FaithfulnessMetric`` created with ``include_reason=True``.
    """
    # Per the original author's note, the LLMTestCase handed to this metric
    # must carry a retrieval_context
    # (https://deepeval.com/docs/metrics-faithfulness), and an
    # ``evaluation_template`` argument can be added here to tweak the default
    # prompt.
    metric_options = {
        "model": model,
        "threshold": threshold,
        "verbose_mode": verbose_mode,
        "include_reason": True,
    }
    return FaithfulnessMetric(**metric_options)

In [None]:
import argparse
import json
from pathlib import Path
from typing import Optional

import pandas as pd
from deepeval.metrics import FaithfulnessMetric, GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

from toxtempass import LLM_API_KEY, config

In [None]:
#!/usr/bin/env python3

import argparse
import json
from pathlib import Path
from typing import Optional

import pandas as pd
from deepeval.metrics import FaithfulnessMetric, GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

from toxtempass import LLM_API_KEY, config


def build_faithfulness_metric(
    *, model: str, threshold: float, verbose_mode: bool
) -> FaithfulnessMetric:
    """Create the deepeval ``FaithfulnessMetric`` used as the base judge.

    Args:
        model: Judge model identifier forwarded to the metric.
        threshold: Threshold forwarded to the metric.
        verbose_mode: Forwarded to the metric's ``verbose_mode`` option.

    Returns:
        A ``FaithfulnessMetric`` created with ``include_reason=True``.
    """
    # Per the original author's note, the LLMTestCase handed to this metric
    # must carry a retrieval_context
    # (https://deepeval.com/docs/metrics-faithfulness), and an
    # ``evaluation_template`` argument can be added here to tweak the default
    # prompt.
    metric_options = {
        "model": model,
        "threshold": threshold,
        "verbose_mode": verbose_mode,
        "include_reason": True,
    }
    return FaithfulnessMetric(**metric_options)


def build_groundedness_policy_geval(
    *, model: str, threshold: float, verbose_mode: bool
) -> GEval:
    """Create the GEval metric that acts as a strict groundedness "policy gate".

    The evaluation steps instruct the judge to extract atomic factual claims
    from the actual output, label each one (ENTAILED / CONTRADICTED /
    UNSUPPORTED / ABSTAINED) against the retrieval context, and answer with
    JSON counts so the final score can be computed outside the judge.

    Args:
        model: Judge model identifier forwarded to ``GEval``.
        threshold: Forwarded to ``GEval``; per the original author's note it
            still drives GEval's own pass/fail next to the custom scores.
        verbose_mode: Forwarded to ``GEval``.

    Returns:
        The configured ``GEval`` instance.
    """
    # Blank entries are handed to GEval as list items of their own.
    separator = ""

    task_step = (
        "Task: evaluate whether ACTUAL_OUTPUT is grounded ONLY in RETRIEVAL_CONTEXT. No outside knowledge."
    )
    # One single step, split over two literals only for readability.
    extraction_step = (
        "Step 1 — Extract atomic factual claims from ACTUAL_OUTPUT limited to: entities, numbers, durations, units, methods, readouts, materials, cell lines, species, instruments, and endpoints. "
        + "Do NOT extract vague/subjective claims (e.g., 'robust', 'appropriate')."
    )
    labelling_steps = [
        "Step 2 — For each claim, assign exactly one label:",
        "  - ENTAILED: explicitly supported by RETRIEVAL_CONTEXT (directly stated or unambiguous paraphrase).",
        "  - CONTRADICTED: conflicts with RETRIEVAL_CONTEXT.",
        "  - UNSUPPORTED: not found in RETRIEVAL_CONTEXT and not contradicted.",
        "  - ABSTAINED: ACTUAL_OUTPUT explicitly states the information is not in the context / cannot be determined, and does not assert the missing fact.",
    ]
    output_steps = [
        "Step 3 — Return VALID JSON ONLY (no markdown, no extra text) with integer counts and an optional short audit list.",
        'Required JSON keys: {"entailed": int, "unsupported": int, "contradicted": int, "abstained": int, "total_claims": int}.',
        'Optional key: "claims": a list of up to 10 items, each with {"claim": str, "label": str, "evidence": str}. Evidence should be a short phrase from context or "N/A".',
    ]
    consistency_step = (
        "Step 4 — Ensure: total_claims = entailed + unsupported + contradicted + abstained."
    )

    policy_steps = [
        task_step,
        separator,
        extraction_step,
        separator,
        *labelling_steps,
        separator,
        *output_steps,
        separator,
        consistency_step,
    ]
    inspected_fields = [
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.RETRIEVAL_CONTEXT,
    ]

    return GEval(
        name="ToxTemp Groundedness Policy (counts)",
        evaluation_steps=policy_steps,
        evaluation_params=inspected_fields,
        model=model,
        threshold=threshold,
        verbose_mode=verbose_mode,
    )


def groundedness_policy_score(counts: dict) -> float:
    """Score groundedness with a strict contradiction gate.

    Rules:
    - Any contradicted claim => 0.0.
    - Otherwise: entailed / (entailed + unsupported).
    - Abstained claims are never read, so they stay out of the denominator.
    - No asserted claims at all (denominator 0) => 1.0 (fully compliant).

    Args:
        counts: Mapping with optional ``"entailed"``, ``"unsupported"`` and
            ``"contradicted"`` entries. Missing or null entries count as 0.

    Returns:
        The score as a float; it lies in [0.0, 1.0] for non-negative counts.
    """
    # ``or 0`` also covers an explicit JSON null, which int() would reject
    # with a TypeError.
    entailed = int(counts.get("entailed") or 0)
    unsupported = int(counts.get("unsupported") or 0)
    contradicted = int(counts.get("contradicted") or 0)

    if contradicted > 0:
        return 0.0

    asserted = entailed + unsupported
    if not asserted:
        return 1.0
    return entailed / asserted


def groundedness_herman_style_score(counts: dict) -> float:
    """Score groundedness with a soft penalty for contradictions.

    The raw value is ``(entailed - 0.5 * contradicted) / total`` where
    ``total = entailed + unsupported + contradicted``; it is then clamped to
    [0.0, 1.0]. Abstained claims are not read at all. When there are no
    asserted claims (``total == 0``) the score is 1.0.

    Args:
        counts: Mapping with optional ``"entailed"``, ``"unsupported"`` and
            ``"contradicted"`` entries. Missing or null entries count as 0.

    Returns:
        The clamped score as a float.
    """
    # ``or 0`` also covers an explicit JSON null, which int() would reject
    # with a TypeError.
    entailed = int(counts.get("entailed") or 0)
    unsupported = int(counts.get("unsupported") or 0)
    contradicted = int(counts.get("contradicted") or 0)

    total = entailed + unsupported + contradicted
    if total == 0:
        return 1.0

    raw = (entailed - 0.5 * contradicted) / total
    return max(0.0, min(1.0, raw))


def _parse_counts(reason: str | None) -> Optional[dict]:
    if not reason:
        return None
    try:
        data = json.loads(reason)
    except json.JSONDecodeError:
        return None
    return data if isinstance(data, dict) else None


def pdf_to_full_text(pdf_path: Path, *, max_pages: Optional[int] = None) -> str:
    """Convert a PDF into a single text string with per-page provenance.

    Every page that yields text is emitted as ``[<filename> p.<N>] <text>``
    and the pages are joined with a ``---`` separator line. Pages without
    extractable text are skipped.

    Args:
        pdf_path: Location of the PDF. A plain string path is tolerated and
            converted to ``Path``.
        max_pages: If given, only the first ``max_pages`` pages are read.

    Returns:
        The concatenated page texts, or ``[<filename>] (no extractable text)``
        when no page produced any text.

    Raises:
        ImportError: If ``pypdf`` is not installed.
    """
    try:
        from pypdf import PdfReader
    except ImportError as e:
        raise ImportError(
            "pypdf is required for pdf_to_full_text(). Install with: pip install pypdf"
        ) from e

    # ``.name`` is used below, so a str argument must become a Path first.
    pdf_path = Path(pdf_path)

    reader = PdfReader(str(pdf_path))
    n_pages = len(reader.pages)
    if max_pages is not None:
        n_pages = min(n_pages, max_pages)

    pages: list[str] = []
    for index in range(n_pages):
        # extract_text() may give None/empty output; normalise before testing.
        text = (reader.pages[index].extract_text() or "").strip()
        if text:
            pages.append(f"[{pdf_path.name} p.{index + 1}] {text}")

    if not pages:
        return f"[{pdf_path.name}] (no extractable text)"

    return "\n\n---\n\n".join(pages)


def match_pdf_to_csv(csv_path: Path, pdf_dir: Path) -> Optional[Path]:
    """Find the PDF that corresponds to a comparison CSV.

    The expected PDF stem is the CSV stem with one leading
    ``tier1_comparison_`` removed. An exact ``<stem>.pdf`` in ``pdf_dir`` is
    preferred; otherwise the ``*.pdf`` files in ``pdf_dir`` are compared
    case-insensitively by stem.

    Args:
        csv_path: Path of the comparison CSV (only its stem is used).
        pdf_dir: Directory that is searched for the PDF.

    Returns:
        The matching PDF path, or ``None`` if there is none.
    """
    # removeprefix strips only the leading marker; str.replace would also
    # delete later occurrences inside the document name.
    base_name = csv_path.stem.removeprefix("tier1_comparison_")

    exact_match = pdf_dir / f"{base_name}.pdf"
    if exact_match.exists():
        return exact_match

    wanted = base_name.lower()
    # Sorted so the result is deterministic if several files differ only in case.
    for pdf_file in sorted(pdf_dir.glob("*.pdf")):
        if pdf_file.stem.lower() == wanted:
            return pdf_file

    return None


def run_case(
    *,
    user_input: str,
    actual_output: str,
    retrieval_context: list[str] | str,
    judge_model: str = "gpt-5-nano",
    faithfulness_threshold: float = 0.5,
    use_geval_policy: bool = True,
    geval_threshold: float = 0.5,
    verbose_mode: bool = False,
) -> dict:
    """Evaluate one example with the faithfulness metric and the optional policy gate.

    Args:
        user_input: Stored as ``input`` of the test case.
        actual_output: Stored as ``actual_output`` of the test case.
        retrieval_context: Context passages; a single string is wrapped in a list.
        judge_model: Judge model used for both metrics.
        faithfulness_threshold: Threshold for the faithfulness metric.
        use_geval_policy: When false, the GEval policy gate is not run and
            its result entries are ``None``.
        geval_threshold: Threshold for the GEval policy gate.
        verbose_mode: Forwarded to both metrics; also prints the result dict.

    Returns:
        A dict with the keys ``faithfulness_score``, ``faithfulness_reason``,
        ``geval_policy_score``, ``geval_policy_reason``,
        ``groundedness_policy_score``, ``groundedness_herman_score`` and
        ``groundedness_counts``. The three groundedness entries are ``None``
        unless non-empty counts could be parsed from the policy reason.
    """
    contexts = (
        [retrieval_context] if isinstance(retrieval_context, str) else retrieval_context
    )
    case = LLMTestCase(
        input=user_input,
        actual_output=actual_output,
        retrieval_context=contexts,
    )

    faith_metric = build_faithfulness_metric(
        model=judge_model,
        threshold=faithfulness_threshold,
        verbose_mode=verbose_mode,
    )
    faith_metric.measure(case)
    outcome: dict = {
        "faithfulness_score": float(faith_metric.score),
        "faithfulness_reason": faith_metric.reason or "",
    }

    parsed_counts: Optional[dict] = None
    if not use_geval_policy:
        outcome["geval_policy_score"] = None
        outcome["geval_policy_reason"] = None
    else:
        gate = build_groundedness_policy_geval(
            model=judge_model,
            threshold=geval_threshold,
            verbose_mode=verbose_mode,
        )
        gate.measure(case)
        outcome["geval_policy_score"] = float(gate.score)
        outcome["geval_policy_reason"] = gate.reason or ""
        parsed_counts = _parse_counts(gate.reason)

    # An empty dict is treated like "no counts": all derived entries are None.
    if parsed_counts:
        outcome.update(
            groundedness_policy_score=groundedness_policy_score(parsed_counts),
            groundedness_herman_score=groundedness_herman_style_score(parsed_counts),
            groundedness_counts=parsed_counts,
        )
    else:
        outcome.update(
            groundedness_policy_score=None,
            groundedness_herman_score=None,
            groundedness_counts=None,
        )

    if verbose_mode:
        print("\n=== Eval Results ===")
        for key, value in outcome.items():
            print(f"{key}: {value}")

    return outcome


def add_groundedness_columns(
    *,
    csv_path: Path,
    pdf_path: Path,
    output_path: Optional[Path] = None,
    judge_model: str = "gpt-4o-mini",
    faithfulness_threshold: float = 0.5,
    geval_threshold: float = 0.5,
    verbose: bool = False,
) -> Path:
    """Add groundedness columns to a tier1_comparison CSV using a PDF as context.

    Every row is evaluated with ``run_case`` using the row's ``question`` and
    ``llm_answer`` cells and the full PDF text as retrieval context. The
    result entries are appended as columns and the frame is written back.

    Args:
        csv_path: CSV to read. A plain string path is tolerated.
        pdf_path: PDF whose text is the retrieval context. A plain string
            path is tolerated.
        output_path: Where to write the result; falls back to ``csv_path``
            (i.e. the input is overwritten) when not given.
        judge_model: Judge model forwarded to ``run_case``.
            NOTE(review): this default differs from the "gpt-5-nano" default
            of the other entry points in this file - confirm it is intended.
        faithfulness_threshold: Forwarded to ``run_case``.
        geval_threshold: Forwarded to ``run_case``.
        verbose: Forwarded to ``run_case`` as ``verbose_mode``.

    Returns:
        The path the CSV was written to.
    """
    # The annotations ask for Path objects but str paths are passed in
    # practice; normalise so downstream ``.name`` access cannot fail.
    csv_path = Path(csv_path)
    pdf_path = Path(pdf_path)
    final_path = Path(output_path) if output_path else csv_path

    def as_text(value: object) -> str:
        # Empty CSV cells are read by pandas as float NaN, not as "".
        return "" if pd.isna(value) else str(value)

    df = pd.read_csv(csv_path)
    pdf_text = pdf_to_full_text(pdf_path)

    results = [
        run_case(
            user_input=as_text(row.get("question", "")),
            actual_output=as_text(row.get("llm_answer", "")),
            retrieval_context=pdf_text,
            judge_model=judge_model,
            faithfulness_threshold=faithfulness_threshold,
            use_geval_policy=True,
            geval_threshold=geval_threshold,
            verbose_mode=verbose,
        )
        for _, row in df.iterrows()
    ]

    new_columns = (
        "faithfulness_score",
        "faithfulness_reason",
        "geval_policy_score",
        "geval_policy_reason",
        "groundedness_policy_score",
        "groundedness_herman_score",
        "groundedness_counts",
    )
    for column in new_columns:
        df[column] = [result[column] for result in results]

    df.to_csv(final_path, index=False)
    return final_path


def add_groundedness_to_directory(
    *,
    output_dir: Path,
    pdf_dir: Path,
    judge_model: str = "gpt-5-nano",
    faithfulness_threshold: float = 0.5,
    geval_threshold: float = 0.5,
    max_pdfs: Optional[int] = None,
    verbose: bool = False,
) -> None:
    """Add groundedness columns to every tier1_comparison CSV of a directory.

    Each ``tier1_comparison_*.csv`` in ``output_dir`` (in sorted order) is
    paired with a PDF from ``pdf_dir`` and updated in place. CSVs without a
    matching PDF are reported and skipped.

    Args:
        output_dir: Directory containing the comparison CSVs.
        pdf_dir: Directory containing the source PDFs.
        judge_model: Judge model forwarded to ``add_groundedness_columns``.
        faithfulness_threshold: Forwarded to ``add_groundedness_columns``.
        geval_threshold: Forwarded to ``add_groundedness_columns``.
        max_pdfs: If truthy, only the first ``max_pdfs`` CSVs are processed;
            ``None`` and ``0`` both mean "no limit".
        verbose: Forwarded to ``add_groundedness_columns``.

    Raises:
        FileNotFoundError: If ``output_dir`` has no ``tier1_comparison_*.csv``.
    """
    candidates = sorted(output_dir.glob("tier1_comparison_*.csv"))
    if len(candidates) == 0:
        raise FileNotFoundError(f"No tier1_comparison_*.csv files found in: {output_dir}")

    selected = candidates[:max_pdfs] if max_pdfs else candidates

    for comparison_csv in selected:
        source_pdf = match_pdf_to_csv(comparison_csv, pdf_dir)
        if source_pdf:
            # The CSV is rewritten in place: output_path is the input path.
            written_to = add_groundedness_columns(
                csv_path=comparison_csv,
                pdf_path=source_pdf,
                output_path=comparison_csv,
                judge_model=judge_model,
                faithfulness_threshold=faithfulness_threshold,
                geval_threshold=geval_threshold,
                verbose=verbose,
            )
            print(f"Updated: {written_to}")
        else:
            print(f"WARNING: No matching PDF found for {comparison_csv.name}, skipping")


def main(argv: Optional[list[str]] = None) -> None:
    """Command-line entry point: single-file mode or directory (batch) mode.

    Single-file mode needs ``--csv`` and ``--pdf`` and rewrites that CSV in
    place. Batch mode needs ``--output-dir`` and ``--pdf-dir``. If neither
    pair is complete the parser exits with a usage error.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``; passing a list allows calling ``main`` from
            a notebook or a test without touching the process arguments.
    """
    parser = argparse.ArgumentParser(
        description="Add groundedness columns to positive_control CSVs.",
        # Without this, argparse accepts unique prefixes of long options, so
        # a foreign flag such as a kernel launcher's ``--f=<file>`` is taken
        # as ``--faith-threshold`` and fails with "invalid float value".
        allow_abbrev=False,
    )
    parser.add_argument("--csv", type=Path, help="Path to a single tier1_comparison_*.csv")
    parser.add_argument("--pdf", type=Path, help="Path to the corresponding PDF")
    parser.add_argument("--output-dir", type=Path, help="Directory with tier1_comparison_*.csv files")
    parser.add_argument("--pdf-dir", type=Path, help="Directory with PDF inputs")
    parser.add_argument("--judge-model", default="gpt-5-nano", help="LLM judge to use")
    parser.add_argument("--faith-threshold", type=float, default=0.5, help="Faithfulness threshold")
    parser.add_argument("--geval-threshold", type=float, default=0.5, help="GEval threshold")
    parser.add_argument("--max-pdfs", type=int, default=None, help="Optional limit for batch runs")
    parser.add_argument("--verbose", action="store_true", help="Print per-row details")
    args = parser.parse_args(argv)

    # Single-file mode takes precedence when both pairs are supplied.
    if args.csv and args.pdf:
        out_path = add_groundedness_columns(
            csv_path=args.csv,
            pdf_path=args.pdf,
            output_path=args.csv,
            judge_model=args.judge_model,
            faithfulness_threshold=args.faith_threshold,
            geval_threshold=args.geval_threshold,
            verbose=args.verbose,
        )
        print(f"Updated: {out_path}")
        return

    if args.output_dir and args.pdf_dir:
        add_groundedness_to_directory(
            output_dir=args.output_dir,
            pdf_dir=args.pdf_dir,
            judge_model=args.judge_model,
            faithfulness_threshold=args.faith_threshold,
            geval_threshold=args.geval_threshold,
            max_pdfs=args.max_pdfs,
            verbose=args.verbose,
        )
        return

    parser.error("Provide --csv and --pdf, or --output-dir and --pdf-dir")


In [None]:
verbose=True)

SyntaxError: unmatched ')' (<ipython-input-5-209f11fe28e4>, line 1)

In [None]:
from pathlib import Path

# NOTE(review): this import raised ModuleNotFoundError when the cell was run
# (see the recorded traceback below). Presumably `myocyte` is not importable as
# a top-level package from the notebook's working directory -- confirm the
# correct import root before reusing this cell.
from myocyte.toxtempass.evaluation.post_processing.groundedness import add_groundedness_columns


# Inputs for a single-document run: one tier-1 comparison CSV and the PDF it
# is evaluated against. Both are plain strings here, not Path objects.
csv_path = "/Users/johannehouweling/ToxTempAssistant/myocyte/toxtempass/evaluation/positive_control/output/gpt-4.1-nano/tier1_comparison_cMINC(UKN2).csv"
pdf_path = "/Users/johannehouweling/ToxTempAssistant/myocyte/toxtempass/evaluation/positive_control/input/gpt-4.1-nano/cMINC(UKN2).pdf"

# Run the groundedness evaluation; the result goes to a demo file in the
# current working directory instead of overwriting the input CSV.
add_groundedness_columns(
            csv_path=csv_path,
            pdf_path=pdf_path,
            output_path=Path.cwd() / "groundedness_demo_output.csv",
            judge_model="gpt-5-nano",
            faithfulness_threshold=0.5,
            geval_threshold=0.5,
            verbose=True
            )

ModuleNotFoundError: No module named 'myocyte.toxtempass'

In [None]:

# Same single-document run without the import; relies on
# add_groundedness_columns already being available in the session.
csv_path = "/Users/johannehouweling/ToxTempAssistant/myocyte/toxtempass/evaluation/positive_control/output/gpt-4.1-nano/tier1_comparison_cMINC(UKN2).csv"
# NOTE: this PDF location does not exist -- the run recorded below fails with
# FileNotFoundError for exactly this path.
pdf_path = "/Users/johannehouweling/ToxTempAssistant/myocyte/toxtempass/evaluation/positive_control/input/gpt-4.1-nano/cMINC(UKN2).pdf"

# Output is written to a demo file in the current working directory.
add_groundedness_columns(
            csv_path=csv_path,
            pdf_path=pdf_path,
            output_path=Path.cwd() / "groundedness_demo_output.csv",
            judge_model="gpt-5-nano",
            faithfulness_threshold=0.5,
            geval_threshold=0.5,
            verbose=True
            )

FileNotFoundError: [Errno 2] No such file or directory: '/Users/johannehouweling/ToxTempAssistant/myocyte/toxtempass/evaluation/positive_control/input/gpt-4.1-nano/cMINC(UKN2).pdf'

In [None]:
# PDF path now points at the input_files/ directory.
# NOTE: both paths are still plain strings; the run recorded below fails with
# "AttributeError: 'str' object has no attribute 'name'", so
# add_groundedness_columns evidently needs pathlib.Path arguments.
csv_path = "/Users/johannehouweling/ToxTempAssistant/myocyte/toxtempass/evaluation/positive_control/output/gpt-4.1-nano/tier1_comparison_cMINC(UKN2).csv"
pdf_path = "/Users/johannehouweling/ToxTempAssistant/myocyte/toxtempass/evaluation/positive_control/input_files/cMINC(UKN2).pdf"

# Output is written to a demo file in the current working directory.
add_groundedness_columns(
            csv_path=csv_path,
            pdf_path=pdf_path,
            output_path=Path.cwd() / "groundedness_demo_output.csv",
            judge_model="gpt-5-nano",
            faithfulness_threshold=0.5,
            geval_threshold=0.5,
            verbose=True
            )

AttributeError: 'str' object has no attribute 'name'

In [None]:
# Paths wrapped in pathlib.Path so the evaluation can start.
csv_path = Path("/Users/johannehouweling/ToxTempAssistant/myocyte/toxtempass/evaluation/positive_control/output/gpt-4.1-nano/tier1_comparison_cMINC(UKN2).csv")
pdf_path = Path("/Users/johannehouweling/ToxTempAssistant/myocyte/toxtempass/evaluation/positive_control/input_files/cMINC(UKN2).pdf")

# Evaluates the complete comparison CSV with verbose judge logging. The run
# recorded below ended with KeyboardInterrupt, i.e. it did not complete.
add_groundedness_columns(
            csv_path=csv_path,
            pdf_path=pdf_path,
            output_path=Path.cwd() / "groundedness_demo_output.csv",
            judge_model="gpt-5-nano",
            faithfulness_threshold=0.5,
            geval_threshold=0.5,
            verbose=True
            )

Output()

KeyboardInterrupt: 

**************************************************
Faithfulness Verbose Logs
**************************************************

Truths (limit=None):
[
    "The cMINC UKN2 assay is an in vitro test method that assesses impairment of migration of human neural crest cells (NCCs) generated from human induced pluripotent stem cells (hiPSCs).",
    "Version 2.0 of the cMINC protocol (V2.0) was deposited in March 2023, and a previous version was assembled in 2019 in the context of the EU-ToxRisk project.",
    "The hiPSC line used is IMR90_clone_#4, which was obtained from WiCell (Wisconsin) in 2012 and has been maintained with a master stock and working stocks.",
    "ATCC number for the cell line is CCL-186; the cells are of human origin (Homo sapiens) and originate from lung tissue, described as fibroblasts.",
    "Undifferentiated hiPSCs are maintained as colonies on Laminin-521 in Essential 8 (E8) medium and can be passaged with weak splitting; the cells show self-renewal and pluripoten

In [None]:
df = pd.read_csv(csv_path)

In [None]:
df

Unnamed: 0,question,gtruth_answer,gtruth_answer_quality_score,gtruth_answer_quality_justification,llm_answer,cos_similarity,bert_precision,bert_recall,bert_f1
0,Provide a descriptive title using normal langu...,Assay to test impairment of migration of human...,Medium,The answer partially addresses the question by...,Answer not found in documents.,0.138902,-0.278100,-0.013682,-0.160179
1,Please describe in no more than 200 words the ...,,,,Answer not found in documents.,0.147642,-1.069993,-1.070517,-1.061937
2,"Which toxicological target (organ, tissue, phy...",,Low,No answer provided; treated as missing.,The assay models the impairment of neural cres...,0.020093,-1.069993,-1.070517,-1.061937
3,Which test system and readout(s) are used? (4....,,Low,No answer provided; treated as missing.,The test system used in the assay consists of ...,0.028660,-1.069993,-1.070517,-1.061937
4,Which biological process(es) (e.g. neurite out...,,Low,No answer provided; treated as missing.,The assay models the impairment of neural cres...,0.018484,-1.069993,-1.070517,-1.061937
...,...,...,...,...,...,...,...,...,...
72,Has the test system been transferred to other ...,The assay hasn’t been transferred or applied i...,Low,The answer does not address the majority of th...,Answer not found in documents.,0.189700,0.177304,0.217136,0.200128
73,Are there special legal requirements for runni...,No specific requirements.,Low,The answer is too brief and lacks detail. It d...,Answer not found in documents.,0.194032,0.259338,0.114473,0.186037
74,Are the SDSs for all hazardous reagents used i...,SDS are available in the university of Konstan...,Low,The answer only partially addresses the availa...,Answer not found in documents.,0.178659,-0.166175,0.148642,-0.028020
75,"Are special permits (e.g. genetic work, stem c...",Work requires S1 cell culture laboratories (ge...,Medium,The answer partially addresses the question by...,Answer not found in documents.,0.114859,-0.126483,0.071457,-0.032778


In [None]:
df[:,5:7]

InvalidIndexError: (slice(None, None, None), slice(5, 7, None))

In [None]:
df.iloc[:,5:7]

Unnamed: 0,cos_similarity,bert_precision
0,0.138902,-0.278100
1,0.147642,-1.069993
2,0.020093,-1.069993
3,0.028660,-1.069993
4,0.018484,-1.069993
...,...,...
72,0.189700,0.177304
73,0.194032,0.259338
74,0.178659,-0.166175
75,0.114859,-0.126483


In [None]:
df.iloc[5:7,:]

Unnamed: 0,question,gtruth_answer,gtruth_answer_quality_score,gtruth_answer_quality_justification,llm_answer,cos_similarity,bert_precision,bert_recall,bert_f1
5,To which (human) adverse outcome(s) is your te...,,Low,No answer provided; treated as missing.,The assay models the impairment of neural cres...,0.011842,-1.069993,-1.070517,-1.061937
6,Which hazard(s) do(es) your test method (poten...,,Low,No answer provided; treated as missing.,The assay predicts developmental disorders and...,0.031322,-1.069993,-1.070517,-1.061937


In [None]:
df.iloc[21:23,:]

Unnamed: 0,question,gtruth_answer,gtruth_answer_quality_score,gtruth_answer_quality_justification,llm_answer,cos_similarity,bert_precision,bert_recall,bert_f1
21,Name known causes of variability of the initia...,"- hiPSC can be maintained up to 10 passages, h...",Medium,The answer partially addresses the question by...,Answer not found in documents.,0.094177,-0.367479,-0.059694,-0.23632
22,Describe the principles of the selected differ...,Figure 2: Differentiation scheme from hiPSCs t...,Medium,The answer provides a detailed description of ...,"The assay described, cMINC (UKN2), utilizes hu...",0.815375,0.069368,-0.06919,-0.000335


In [None]:
df.iloc[24:25,:]

Unnamed: 0,question,gtruth_answer,gtruth_answer_quality_score,gtruth_answer_quality_justification,llm_answer,cos_similarity,bert_precision,bert_recall,bert_f1
24,Describe the test system as it is used in the ...,A highly homogeneous pre-differentiated popula...,Medium,The answer provides a detailed description of ...,The assay uses human neural crest cells (NCCs)...,0.660511,0.207217,0.018954,0.109221


In [None]:
df.iloc[24:26,:]

Unnamed: 0,question,gtruth_answer,gtruth_answer_quality_score,gtruth_answer_quality_justification,llm_answer,cos_similarity,bert_precision,bert_recall,bert_f1
24,Describe the test system as it is used in the ...,A highly homogeneous pre-differentiated popula...,Medium,The answer provides a detailed description of ...,The assay uses human neural crest cells (NCCs)...,0.660511,0.207217,0.018954,0.109221
25,What are the endpoint(s) that you use to contr...,Figure 3: Characterization of the cellular sys...,Medium,The answer provides some relevant information ...,Answer not found in documents.,0.139616,-0.496027,-0.030358,-0.325143


In [None]:
df.iloc[27:28,:]

Unnamed: 0,question,gtruth_answer,gtruth_answer_quality_score,gtruth_answer_quality_justification,llm_answer,cos_similarity,bert_precision,bert_recall,bert_f1
27,Give known causes of variability for final tes...,Causes of variability: - High passage number o...,Medium,The answer partially addresses the question by...,The known causes of variability for the final ...,0.653235,0.237422,-0.078659,0.061494


In [None]:
df.iloc[24:36,:]

Unnamed: 0,question,gtruth_answer,gtruth_answer_quality_score,gtruth_answer_quality_justification,llm_answer,cos_similarity,bert_precision,bert_recall,bert_f1
24,Describe the test system as it is used in the ...,A highly homogeneous pre-differentiated popula...,Medium,The answer provides a detailed description of ...,The assay uses human neural crest cells (NCCs)...,0.660511,0.207217,0.018954,0.109221
25,What are the endpoint(s) that you use to contr...,Figure 3: Characterization of the cellular sys...,Medium,The answer provides some relevant information ...,Answer not found in documents.,0.139616,-0.496027,-0.030358,-0.325143
26,Describe the acceptance criteria for your test...,"After compound treatment, the negative control...",Low,The answer does not adequately address the que...,The acceptance criteria for the cMINC (UKN2) a...,0.556152,0.179235,-0.240321,-0.068478
27,Give known causes of variability for final tes...,Causes of variability: - High passage number o...,Medium,The answer partially addresses the question by...,The known causes of variability for the final ...,0.653235,0.237422,-0.078659,0.061494
28,What is known about endogenous metabolic capac...,No specific information available.,Low,The answer does not address the question as it...,Answer not found in documents.,0.318805,0.303209,0.284563,0.296662
29,Are there transcriptomics data or other omics ...,Transcriptomics data (unpublished) will become...,Low,The answer does not provide a list or descript...,Answer not found in documents.,0.130411,-0.002727,0.090126,0.045599
30,Give information on where the test system diff...,- The cells are able to migrate mostly as sing...,Low,The answer does not address the question about...,Answer not found in documents.,0.091883,-0.172818,-0.004129,-0.091351
31,Are there elements of the test system that are...,The cells are not protected by patents or any ...,Medium,The answer addresses the question regarding pa...,Answer not found in documents.,0.187944,0.128503,0.218777,0.175316
32,Fill only if section 3 has not been answered. ...,Brief description is in section 3 of this file...,Medium,The answer provides a link to a detailed SOP a...,Answer not found in documents.,0.165741,-0.45863,0.07922,-0.266855
33,"Provide an exposure scheme (graphically, show ...",Figure 4: Exposure scheme and assay procedure....,High,The answer provides a detailed and clear expos...,The exposure scheme for the cMINC (UKN2) assay...,0.806302,0.427861,0.105016,0.250013


In [None]:
df.iloc[34:36,:]

Unnamed: 0,question,gtruth_answer,gtruth_answer_quality_score,gtruth_answer_quality_justification,llm_answer,cos_similarity,bert_precision,bert_recall,bert_f1
34,Define the specific endpoint(s) of the test sy...,Specific Endpoint: Migration inhibition Refere...,Medium,The answer partially addresses the question by...,The specific endpoint of the assay is migratio...,0.778617,0.170565,0.030134,0.099792
35,Define and describe the principle(s) of the an...,Migration inhibition: NCCs are plated around s...,High,The answer provides a comprehensive overview o...,The assay employs high content imaging to asse...,0.83239,0.305368,0.270602,0.290629


In [None]:
# Source comparison CSV and the PDF it is checked against.
csv_path = Path("/Users/johannehouweling/ToxTempAssistant/myocyte/toxtempass/evaluation/positive_control/output/gpt-4.1-nano/tier1_comparison_cMINC(UKN2).csv")
pdf_path = Path("/Users/johannehouweling/ToxTempAssistant/myocyte/toxtempass/evaluation/positive_control/input_files/cMINC(UKN2).pdf")

# Build a small demo input: positional rows 34 and 35 only (presumably to keep
# the judge run short). index=False keeps the pandas index out of the file;
# note this rebinds the notebook-level `df` to the two-row slice.
demo_path = Path.cwd() / "demo_input.csv"
df = pd.read_csv(csv_path).iloc[34:36, :]
df.to_csv(demo_path, index=False)

# Evaluate the demo CSV; the call's return value (the output path) is what the
# cell displays at the end of the recorded output.
add_groundedness_columns(
            csv_path=demo_path,
            pdf_path=pdf_path,
            output_path=Path.cwd() / "groundedness_demo_output.csv",
            judge_model="gpt-5-nano",
            faithfulness_threshold=0.5,
            geval_threshold=0.5,
            verbose=True
            )

Output()

Output()

Output()


=== Eval Results ===
faithfulness_score: 1.0
faithfulness_reason: The score is 1.00 because there are no contradictions in the contradiction list, indicating the actual output aligns with the retrieval context. Great job!
geval_policy_score: 1.0
geval_policy_reason: All atomic claims extracted from ACTUAL_OUTPUT are supported by the RETRIEVAL_CONTEXT (endpoints, primary vs secondary endpoints, normalization role, and measurement of migrated NCCs). No contradictions or unsupported statements present.
groundedness_policy_score: None
groundedness_herman_score: None
groundedness_counts: None


Output()


=== Eval Results ===
faithfulness_score: 0.9285714285714286
faithfulness_reason: The score is 0.93 because the actual output claimed four tiles per well, but the retrieval context specifies four fields per well.
geval_policy_score: 0.9
geval_policy_reason: Actual Output largely matches Retrieval Context: NCCs, calcein-AM and Hoechst 33342 staining, live-cell imaging, migration into a cell-free zone after stopper removal, 24 h exposure, four tiles per well, ROI with 150–300 cells, double-positive cell counting, same-well measurement for migration and viability, and use of ArrayScan/Cellomics with data normalization to untreated controls. A minor point is the claim about 'not fixed' which isn't explicit in the Retrieval Context.
groundedness_policy_score: None
groundedness_herman_score: None
groundedness_counts: None


PosixPath('/Users/johannehouweling/ToxTempAssistant/myocyte/groundedness_demo_output.csv')

In [None]:
# Re-run of the two-row demo: same inputs, same output file. Judge scores and
# reasons in the recorded output differ slightly from the previous run.
csv_path = Path("/Users/johannehouweling/ToxTempAssistant/myocyte/toxtempass/evaluation/positive_control/output/gpt-4.1-nano/tier1_comparison_cMINC(UKN2).csv")
pdf_path = Path("/Users/johannehouweling/ToxTempAssistant/myocyte/toxtempass/evaluation/positive_control/input_files/cMINC(UKN2).pdf")

# Regenerate the demo input from positional rows 34 and 35 of the source CSV,
# overwriting any existing demo_input.csv in the working directory.
demo_path = Path.cwd() / "demo_input.csv"
df = pd.read_csv(csv_path).iloc[34:36, :]
df.to_csv(demo_path, index=False)

# Evaluate the demo CSV and write the result next to it.
add_groundedness_columns(
            csv_path=demo_path,
            pdf_path=pdf_path,
            output_path=Path.cwd() / "groundedness_demo_output.csv",
            judge_model="gpt-5-nano",
            faithfulness_threshold=0.5,
            geval_threshold=0.5,
            verbose=True
            )

Output()

Output()

Output()


=== Eval Results ===
faithfulness_score: 1.0
faithfulness_reason: The score is 1.00 because there are no contradictions—the contradictions list is empty, indicating the actual output fully aligns with the retrieval context.
geval_policy_score: 1.0
geval_policy_reason: All atomic claims in ACTUAL_OUTPUT (migration inhibition endpoint, NCCs are human NCCs, migration as primary endpoint, viability as secondary/normalization endpoint, readout by migrated cell counts in the cell-free zone) are explicitly supported by the RETRIEVAL_CONTEXT; no unsupported or contradicted details are present.
groundedness_policy_score: None
groundedness_herman_score: None
groundedness_counts: None


Output()


=== Eval Results ===
faithfulness_score: 1.0
faithfulness_reason: The score is 1.00 because there are no contradictions between the actual output and the retrieval context, indicating perfect faithfulness.
geval_policy_score: 0.9
geval_policy_reason: High alignment: ACTUAL_OUTPUT correctly covers NCC migration and viability endpoints, calcein-AM/Hoechst staining, four tiles per well, ROI of 150–300 cells, silicone stopper migration setup, 24 h exposure, parallel measurement of endpoints, and imaging/analysis workflow as described in the retrieval context. The claim 'cells are not fixed' is not explicitly supported by the context and is therefore unsupported.
groundedness_policy_score: None
groundedness_herman_score: None
groundedness_counts: None


PosixPath('/Users/johannehouweling/ToxTempAssistant/myocyte/groundedness_demo_output.csv')

In [None]:
from pathlib import Path
import pandas as pd

from myocyte.toxtempass.evaluation.post_processing.groundedness import add_groundedness_columns, add_groundedness_to_directory

#from myocyte.toxtempass.evaluation.post_processing.groundedness import add_groundedness_columns


csv_path = Path("/Users/johannehouweling/ToxTempAssistant/myocyte/toxtempass/evaluation/positive_control/output/gpt-4.1-nano/tier1_comparison_cMINC(UKN2).csv")
pdf_path = Path("/Users/johannehouweling/ToxTempAssistant/myocyte/toxtempass/evaluation/positive_control/input_files/cMINC(UKN2).pdf")

demo_path = Path("demo_input.csv")



add_groundedness_columns(
            csv_path=demo_path,
            pdf_path=pdf_path,
            output_path=Path.cwd() / "groundedness_demo_output.csv",
            judge_model="gpt-5-nano",
            faithfulness_threshold=0.5,
            geval_threshold=0.5,
            verbose=True
            )

add_groundedness_to_directory(
            output_dir=,
            pdf_dir=args.pdf_dir,
            judge_model=args.judge_model,
            faithfulness_threshold=args.faith_threshold,
            geval_threshold=args.geval_threshold,
            verbose=True,
        )

SyntaxError: expected argument value expression (<ipython-input-23-8a927ed2234a>, line 27)

In [None]:
from pathlib import Path
import pandas as pd

# NOTE(review): this import raised ModuleNotFoundError in the recorded run
# below; presumably `myocyte` is not a top-level package on the notebook's
# import path -- confirm the correct import root.
from myocyte.toxtempass.evaluation.post_processing.groundedness import add_groundedness_columns, add_groundedness_to_directory

#from myocyte.toxtempass.evaluation.post_processing.groundedness import add_groundedness_columns


# csv_path is assigned but not passed to the call in this cell.
csv_path = Path("/Users/johannehouweling/ToxTempAssistant/myocyte/toxtempass/evaluation/positive_control/output/gpt-4.1-nano/tier1_comparison_cMINC(UKN2).csv")
pdf_path = Path("/Users/johannehouweling/ToxTempAssistant/myocyte/toxtempass/evaluation/positive_control/input_files/cMINC(UKN2).pdf")

# Relative path: resolved against the current working directory, and the file
# is used as-is (this cell does not regenerate it).
demo_path = Path("demo_input.csv")



# Evaluate the existing demo CSV against the PDF.
add_groundedness_columns(
            csv_path=demo_path,
            pdf_path=pdf_path,
            output_path=Path.cwd() / "groundedness_demo_output.csv",
            judge_model="gpt-5-nano",
            faithfulness_threshold=0.5,
            geval_threshold=0.5,
            verbose=True
            )

ModuleNotFoundError: No module named 'myocyte.toxtempass'

In [None]:
from pathlib import Path
import pandas as pd

# NOTE(review): same failing import as before -- the recorded run below again
# ends with ModuleNotFoundError for 'myocyte.toxtempass'. Confirm the correct
# import root before reusing.
from myocyte.toxtempass.evaluation.post_processing.groundedness import add_groundedness_columns, add_groundedness_to_directory

#from myocyte.toxtempass.evaluation.post_processing.groundedness import add_groundedness_columns


# csv_path is assigned but not passed to the call in this cell.
csv_path = Path("/Users/johannehouweling/ToxTempAssistant/myocyte/toxtempass/evaluation/positive_control/output/gpt-4.1-nano/tier1_comparison_cMINC(UKN2).csv")
pdf_path = Path("/Users/johannehouweling/ToxTempAssistant/myocyte/toxtempass/evaluation/positive_control/input_files/cMINC(UKN2).pdf")

# Relative path, resolved against the current working directory.
demo_path = Path("demo_input.csv")

# Evaluate the existing demo CSV against the PDF.
add_groundedness_columns(
            csv_path=demo_path,
            pdf_path=pdf_path,
            output_path=Path.cwd() / "groundedness_demo_output.csv",
            judge_model="gpt-5-nano",
            faithfulness_threshold=0.5,
            geval_threshold=0.5,
            verbose=True
            )

ModuleNotFoundError: No module named 'myocyte.toxtempass'

In [None]:
# No import here: the cell relies on add_groundedness_columns, Path and pd
# already being defined in the running session.
# csv_path is assigned but not passed to the call in this cell.
csv_path = Path("/Users/johannehouweling/ToxTempAssistant/myocyte/toxtempass/evaluation/positive_control/output/gpt-4.1-nano/tier1_comparison_cMINC(UKN2).csv")
pdf_path = Path("/Users/johannehouweling/ToxTempAssistant/myocyte/toxtempass/evaluation/positive_control/input_files/cMINC(UKN2).pdf")

# Uses whatever demo_input.csv currently exists in the working directory.
# NOTE(review): the recorded output below contains seven result blocks, so the
# file held more rows than the two-row slice written by the earlier cells --
# presumably it was edited by hand to add test answers; confirm.
demo_path = Path("demo_input.csv")

# Evaluate the demo CSV; the returned output path is displayed by the cell.
add_groundedness_columns(
            csv_path=demo_path,
            pdf_path=pdf_path,
            output_path=Path.cwd() / "groundedness_demo_output.csv",
            judge_model="gpt-5-nano",
            faithfulness_threshold=0.5,
            geval_threshold=0.5,
            verbose=True
            )

Output()

Output()

Output()


=== Eval Results ===
faithfulness_score: 1.0
faithfulness_reason: The score is 1.00 because there are no contradictions between the actual output and the retrieval context; everything aligns as expected. Great job—keep it up!
geval_policy_score: 1.0
geval_policy_reason: ACTUAL_OUTPUT's atomic claims (endpoint = migration inhibition on human NCCs; primary endpoint = cell migration; secondary/normalization endpoint = cell viability; measurement of migrated NCCs into the cell-free zone) are explicitly supported by the Retrieval Context and there are no contradictions or unsupported facts; the output also references the same cMINC UKN2.pdf source.
groundedness_policy_score: None
groundedness_herman_score: None
groundedness_counts: None


Output()

Output()


=== Eval Results ===
faithfulness_score: 0.9333333333333333
faithfulness_reason: The score is 0.93 because the actual output claims semi-quantitative counts, whereas the context states objective and reproducible measurement.
geval_policy_score: 1.0
geval_policy_reason: All atomic factual claims in the ACTUAL_OUTPUT (NCCs derived from hiPSCs; calcein-AM and Hoechst staining; live-cell imaging; migration into a cell-free zone around a silicone stopper; 24 h exposure; four imaging tiles per well for migration; ROI-based counting of double-positive cells; viability measured in four fields outside the migration zone; same dyes and automated analysis; parallel measurement in the same well) are explicitly supported by the RETRIEVAL_CONTEXT; there are no claims requiring outside knowledge or contradictions.
groundedness_policy_score: None
groundedness_herman_score: None
groundedness_counts: None


Output()

Output()


=== Eval Results ===
faithfulness_score: 1.0
faithfulness_reason: The score is 1.00 because there are no contradictions between the actual output and the retrieval context; the output is fully faithful.
geval_policy_score: 0.7
geval_policy_reason: The output correctly notes a 24-hour exposure before migration readout and treats 24 h as the standard per the retrieval context, which mentions 24 h incubation. However, the claim about a 20-hour window is not supported by the context (the context does not mention 20 h, only 24 h and possible longer exposures), leading to partial grounding.
groundedness_policy_score: None
groundedness_herman_score: None
groundedness_counts: None


Output()

Output()


=== Eval Results ===
faithfulness_score: 0.75
faithfulness_reason: The score is 0.75 because the actual output claims imaging was performed with a 10x objective, but the retrieval context states it was performed with a 5x objective.
geval_policy_score: 0.6
geval_policy_reason: The response correctly identifies three atomic facts as entailed by the retrieval context (calcein-AM and H-33342 staining for viability, four imaging fields outside migration, and 10x objective imaging). It marks the live/dead overlay claim as unsupported since the exact phrase ‘live/dead overlay’ is not present in the retrieval context. The claims are precisely mapped to the provided context.
groundedness_policy_score: None
groundedness_herman_score: None
groundedness_counts: None


Output()

Output()


=== Eval Results ===
faithfulness_score: 0.6666666666666666
faithfulness_reason: The score is 0.67 because the actual output claimed coatings of poly-D-lysine and laminin, but the retrieval context specifies Matrigel-coated plates and lists coatings such as Matrigel, poly-L-ornithine (PLO), Laminin, and Fibronectin.
geval_policy_score: 0.3
geval_policy_reason: Output asserts plates are coated with poly-D-lysine and laminin. Retrieval Context describes coatings as poly-L-ornithine (PLO) with laminin/fibronectin and laminin-521; thus laminin is supported but poly-D-lysine is not present in context, making the claim not wholly grounded and partially contradicted.
groundedness_policy_score: None
groundedness_herman_score: None
groundedness_counts: None


Output()

Output()


=== Eval Results ===
faithfulness_score: 0.0
faithfulness_reason: The score is 0.00 because the actual output claims analysis uses ImageJ macros and traces the full well, which contradicts the retrieval context that imaging data are acquired and analyzed with Ringassay software and Array Scan VTI HCS Reader (no ImageJ). It also misdescribes the ROI: it’s the predefined circular migration zone defined by the stopper boundary (not the entire well), with viability measurements outside the ROI, conflicting with the claim of full-well analysis.
geval_policy_score: 0.6
geval_policy_reason: Two atomic claims identified. One claims ImageJ is used; this is not supported by the Retrieval Context which mentions Ringassay software and ArrayScan/VTI HCS workflows (UNSUPPORTED). The ROI claim contradicts the context which describes ROI as the migration zone around the stopper and specifies ROI diameter, not the full well (CONTRADICTED). No abstained claims.
groundedness_policy_score: None
groundedn

Output()


=== Eval Results ===
faithfulness_score: 0.6666666666666666
faithfulness_reason: The score is 0.67 because the output claims migration was measured with a telescope across the Milky Way, but the context specifies measurement via a high-content imaging system after staining, revealing a contradiction in both method and scale.
geval_policy_score: 0.0
geval_policy_reason: Entailed/unsupported/contradicted/abstained counts: all three atomic claims in ACTUAL_OUTPUT are not found in the RETRIEVAL_CONTEXT and do not contradict it, so they are UNSUPPORTED. The total_claims reflect the three claims.
groundedness_policy_score: None
groundedness_herman_score: None
groundedness_counts: None


PosixPath('/Users/johannehouweling/ToxTempAssistant/myocyte/groundedness_demo_output.csv')