# © Artur Czarnecki. All rights reserved.
# Intergrax framework – proprietary and confidential.
# Use, modification, or distribution without written permission is prohibited.


In [1]:
import sys, os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "..", "..")))

In [2]:
from intergrax.llm.messages import ChatMessage
from intergrax.llm_adapters.llm_provider import LLMProvider
from intergrax.llm_adapters.llm_provider_registry import LLMAdapterRegistry
from intergrax.runtime.nexus.config import RuntimeConfig
from intergrax.runtime.nexus.planning.engine_planner import EnginePlanner

# -------------------------------------------------------
# Global test constants
# -------------------------------------------------------

USER_ID = "demo-user-planner"
SESSION_ID = "sess_planner_only_001"

# -------------------------------------------------------
# Build LLM adapter and RuntimeConfig
# IMPORTANT: Replace build_llm_adapter() with your actual builder.
# This must be the same adapter/config used in your runtime.
# -------------------------------------------------------

llm_adapter = LLMAdapterRegistry.create(LLMProvider.OLLAMA)




# Test helpers

In [3]:
from dataclasses import dataclass
from typing import Dict, List, Optional

from intergrax.runtime.nexus.planning.engine_plan_models import (
    EngineNextStep,
    PlanIntent,
)


# ------------------------------------------------------------
# 1) TEST SET: question + expected intent (+ optional flags)
# ------------------------------------------------------------

@dataclass(frozen=True)
class PlannerTestCase:
    id: str
    question: str
    expected_intent: PlanIntent
    expected_flags: Optional[Dict[str, Optional[bool]]] = None
    expected_next_step: Optional[EngineNextStep] = None
    use_project_context: bool = False

@dataclass(frozen=True)
class PromptVariant:
    id: str
    system_prompt: str


In [None]:
from typing import Any

from intergrax.runtime.nexus.engine.runtime_state import RuntimeState
from intergrax.runtime.nexus.planning.engine_plan_models import (
    EnginePlan,
    PlannerPromptConfig,
)
from intergrax.runtime.nexus.responses.response_schema import RuntimeRequest
from intergrax.llm_adapters.llm_usage_track import LLMUsageTracker


def build_planner_request(
    *,
    user_id: str,
    session_id: str,
    message: str,
    instructions: str | None = None,
    attachments: list | None = None,
) -> RuntimeRequest:
    return RuntimeRequest(
        user_id=user_id,
        session_id=session_id,
        message=message,
        attachments=attachments or [],
        instructions=instructions,
    )


def build_planner_state(
    *,
    req: RuntimeRequest,
    run_id: str,
    config: RuntimeConfig,
    base_history: str,
) -> RuntimeState:
    """
    Minimal RuntimeState for planner-only tests.
    Uses only existing RuntimeState fields (no extra structures).
    """
    state = RuntimeState(
        request=req,
        run_id=run_id,
        llm_usage_tracker=LLMUsageTracker(run_id=run_id),
    )

    # Keep it runtime-like: register the same adapter label as core runtime does.
    state.llm_usage_tracker.register_adapter(config.llm_adapter, label="core_adapter")

    # Base history for planner tests (string -> ChatMessage list)
    if base_history and base_history.strip():
        state.base_history = [ChatMessage(role="user", content=base_history.strip())]
    else:
        state.base_history = []

    # Optional: simulate memory-layer instruction fragments (keep None for now)
    state.profile_user_instructions = None
    state.profile_org_instructions = None

    # Capabilities — explicitly set for planner tests
    state.cap_rag_available = bool(config.enable_rag)
    state.cap_user_ltm_available = bool(config.enable_user_longterm_memory)
    state.cap_attachments_available = True
    state.cap_websearch_available = bool(config.enable_websearch)
    state.cap_tools_available = bool(config.tools_mode != "off")

    return state


def _extract_plan_flags(plan: EnginePlan) -> Dict[str, bool | None]:
    """
    Extract only the flags we test. No getattr: rely on EnginePlan contract.
    """
    return {
        "use_websearch": plan.use_websearch,
        "use_tools": plan.use_tools,
        "use_rag": plan.use_rag,
        "use_user_longterm_memory": plan.use_user_longterm_memory,
        "ask_clarifying_question": plan.ask_clarifying_question,
    }



async def run_planner_testcase(
    *,
    tc: PlannerTestCase,
    system_prompt: str,
    base_history: str,
    planner: EnginePlanner,
    config: RuntimeConfig,
    user_id: str = "test-user",
    session_id: str = "test-session",
    run_id: str,
    instructions: str | None = None,
) -> Dict[str, Any]:
    """
    Run a single PlannerTestCase and return a structured result.

    Result separates intent vs flags vs next_step validation to make PASS/FAIL reasons explicit.
    """
    req = build_planner_request(
        user_id=user_id,
        session_id=session_id,
        message=tc.question,
        instructions=instructions,
    )

    state = build_planner_state(
        req=req,
        run_id=run_id,
        config=config,
        base_history=base_history,
    )

    prompt_cfg = PlannerPromptConfig(version="notebook", system_prompt=system_prompt)

    try:
        plan = await planner.plan(
            req=req,
            state=state,
            config=config,
            prompt_config=prompt_cfg,
            run_id=run_id,
        )
    except Exception as e:
        # Treat planner parsing/model-contract failures as a failed testcase
        err = f"planner_error: {type(e).__name__}: {e}"
        return {
            "id": tc.id,
            "pass": False,
            "intent_ok": False,
            "flags_ok": False,
            "next_step_ok": False,
            "intent_errors": [err],
            "flag_errors": [],
            "next_step_errors": [],
            "errors": [err],
            "expected_intent": tc.expected_intent,
            "got_intent": None,
            "expected_next_step": getattr(tc, "expected_next_step", None),
            "got_next_step": None,
            "got_flags": {},
            "debug": {
                "json_len": None,
                "raw_preview": "",
                "prompt_version": "notebook",
                "used_override": True,
            },
        }

    intent_errors: List[str] = []
    flag_errors: List[str] = []
    next_step_errors: List[str] = []

    # 1) Intent validation
    if plan.intent != tc.expected_intent:
        intent_errors.append(f"intent expected={tc.expected_intent} got={plan.intent}")

    # 2) Flags validation (only if expected_flags provided)
    got_flags = _extract_plan_flags(plan)
    if tc.expected_flags:
        for k, expected in tc.expected_flags.items():
            if expected is None:
                continue
            if got_flags[k] != expected:
                flag_errors.append(f"{k} expected={expected} got={got_flags[k]}")

    # 3) next_step validation (only if expected_next_step provided)
    expected_next_step = getattr(tc, "expected_next_step", None)
    if expected_next_step is not None:
        if plan.next_step != expected_next_step:
            next_step_errors.append(
                f"next_step expected={expected_next_step} got={plan.next_step}"
            )

    errors = intent_errors + flag_errors + next_step_errors

    intent_ok = len(intent_errors) == 0
    flags_ok = len(flag_errors) == 0
    next_step_ok = len(next_step_errors) == 0
    passed = intent_ok and flags_ok and next_step_ok

    # Minimal debug summary (avoid dumping full raw content in loops)
    dbg = plan.debug
    raw_preview = ""
    if "raw_preview" in dbg and dbg["raw_preview"]:
        raw_preview = dbg["raw_preview"][:220]

    return {
        "id": tc.id,
        "pass": passed,
        "intent_ok": intent_ok,
        "flags_ok": flags_ok,
        "next_step_ok": next_step_ok,
        "intent_errors": intent_errors,
        "flag_errors": flag_errors,
        "next_step_errors": next_step_errors,
        "errors": errors,
        "expected_intent": tc.expected_intent,
        "got_intent": plan.intent,
        "expected_next_step": expected_next_step,
        "got_next_step": plan.next_step,
        "got_flags": got_flags,
        "debug": {
            "json_len": dbg["json_len"] if "json_len" in dbg else None,
            "raw_preview": raw_preview,
            "prompt_version": dbg["planner_prompt"]["version"] if "planner_prompt" in dbg else None,
            "used_override": dbg["planner_prompt"]["used_override"] if "planner_prompt" in dbg else None,
        },
    }




def print_test_result(test: PlannerTestCase, res: Dict[str, Any]) -> None:
    status = "PASS" if res["pass"] else "FAIL"

    intent_status = "OK" if res["intent_ok"] else "FAIL"
    flags_status = "OK" if res["flags_ok"] else "FAIL"
    next_step_status = "OK" if res.get("next_step_ok", True) else "FAIL"

    print(
        f"[{res['id']}] {status}  "
        f"(INTENT={intent_status}, FLAGS={flags_status}, NEXT_STEP={next_step_status})"
    )
    print(f"  question        : {test.question}")
    print(f"  project_history : {test.use_project_context}")
    print(f"  expected_intent : {res['expected_intent']}")
    print(f"  got_intent      : {res['got_intent']}")
    print(f"  got_flags       : {res['got_flags']}")

    # next_step info (optional)
    if "expected_next_step" in res or "got_next_step" in res:
        print(f"  expected_next_step : {res.get('expected_next_step')}")
        print(f"  got_next_step      : {res.get('got_next_step')}")

    if not res["intent_ok"]:
        for e in res["intent_errors"]:
            print(f"  [intent] {e}")

    if not res["flags_ok"]:
        for e in res["flag_errors"]:
            print(f"  [flags] {e}")

    if not res.get("next_step_ok", True):
        for e in res.get("next_step_errors", []):
            print(f"  [next_step] {e}")

    if res["debug"]["raw_preview"]:
        print(f"  raw_preview: {res['debug']['raw_preview']}")

    print()




In [5]:
TESTS: List[PlannerTestCase] = [
    # ------------------------------------------------------------------
    # GENERIC
    # ------------------------------------------------------------------
    PlannerTestCase(
        id="G01",
        question="Explain how to implement an async retry strategy in Python for API calls.",
        expected_intent=PlanIntent.GENERIC,
        expected_flags={"use_websearch": False, "use_tools": False, "use_user_longterm_memory": False},
    ),
    PlannerTestCase(
        id="G02",
        question="What is the difference between a mutex and a semaphore? Give a concise explanation.",
        expected_intent=PlanIntent.GENERIC,
        expected_flags={"use_websearch": False, "use_tools": False, "use_user_longterm_memory": False},
    ),
    PlannerTestCase(
        id="G03",
        question="Show an example of exponential backoff with jitter in Python (pseudocode is fine).",
        expected_intent=PlanIntent.GENERIC,
        expected_flags={"use_websearch": False, "use_tools": False, "use_user_longterm_memory": False},
    ),
    PlannerTestCase(
        id="G04",
        question="Explain the trade-offs between REST and GraphQL for a backend API.",
        expected_intent=PlanIntent.GENERIC,
        expected_flags={"use_websearch": False, "use_tools": False, "use_user_longterm_memory": False},
    ),
    # Edge: mentions "latest" but still could be generic if framed as concept
    PlannerTestCase(
        id="G05",
        question="In general terms, what does 'tool calling' mean in modern LLM APIs?",
        expected_intent=PlanIntent.GENERIC,
        expected_flags={"use_websearch": False, "use_tools": False, "use_user_longterm_memory": False},
    ),

    # ------------------------------------------------------------------
    # FRESHNESS (must require up-to-date info)
    # ------------------------------------------------------------------
    PlannerTestCase(
        id="F01",
        question="What are the most recent major changes to the OpenAI Responses API and tool calling? Provide dates.",
        expected_intent=PlanIntent.FRESHNESS,
        expected_flags={"use_websearch": True, "use_tools": False, "use_user_longterm_memory": False},
    ),
    PlannerTestCase(
        id="F02",
        question="Summarize the latest release notes for Python 3.13 and mention the release date.",
        expected_intent=PlanIntent.FRESHNESS,
        expected_flags={"use_websearch": True, "use_tools": False, "use_user_longterm_memory": False},
    ),
    PlannerTestCase(
        id="F03",
        question="What changed in the last 30 days in LangChain regarding agents? Provide a short dated summary.",
        expected_intent=PlanIntent.FRESHNESS,
        expected_flags={"use_websearch": True, "use_tools": False, "use_user_longterm_memory": False},
    ),
    # Edge: explicit "as of today" + dates
    PlannerTestCase(
        id="F04",
        question="As of today, what is the current stable version of FastAPI and when was it released?",
        expected_intent=PlanIntent.FRESHNESS,
        expected_flags={"use_websearch": True, "use_tools": False, "use_user_longterm_memory": False},
    ),
    # Edge: asks for "verify" => still websearch, not tools
    PlannerTestCase(
        id="F05",
        question="Verify the current Docker Compose file format version recommendation and cite the source date.",
        expected_intent=PlanIntent.FRESHNESS,
        expected_flags={"use_websearch": True, "use_tools": False, "use_user_longterm_memory": False},
    ),

    # ------------------------------------------------------------------
    # PROJECT_ARCHITECTURE (depends on project context/preferences)
    # Note: these are only meaningful with BASE_HISTORY_PROJECT or other injected context.
    # ------------------------------------------------------------------
    PlannerTestCase(
        id="P01",
        question="Continue earlier: update the Step Planner architecture so it supports dynamic planning.",
        expected_intent=PlanIntent.PROJECT_ARCHITECTURE,
        expected_flags={"use_user_longterm_memory": True, "use_websearch": False, "use_tools": False},
        use_project_context=True,
    ),
    PlannerTestCase(
        id="P02",
        question="Given our Intergrax nexus Runtime, where should we enforce planner canonicalization rules: in EnginePlanner or in Runtime?",
        expected_intent=PlanIntent.PROJECT_ARCHITECTURE,
        expected_flags={"use_user_longterm_memory": True, "use_websearch": False, "use_tools": False},
        use_project_context=True,
    ),
    PlannerTestCase(
        id="P03",
        question="For Intergrax, should we treat websearch as tools or keep it as a separate pipeline? Decide based on our existing architecture.",
        expected_intent=PlanIntent.PROJECT_ARCHITECTURE,
        expected_flags={"use_user_longterm_memory": True, "use_websearch": False, "use_tools": False},
        use_project_context=True,
    ),
    # Edge: project wording without context should not always trigger project_architecture,
    # but with BASE_HISTORY_PROJECT it should.
    PlannerTestCase(
        id="P04",
        question="In our current notebook-based test approach, what is the best way to version planner prompts and track regressions?",
        expected_intent=PlanIntent.PROJECT_ARCHITECTURE,
        expected_flags={"use_user_longterm_memory": True, "use_websearch": False, "use_tools": False},
        use_project_context=True,
    ),

    # ------------------------------------------------------------------
    # CLARIFY (ambiguous/missing required info; must ask one clarifying question)
    # ------------------------------------------------------------------
    PlannerTestCase(
        id="C01",
        question="Which one should I choose for my project?",
        expected_intent=PlanIntent.CLARIFY,
        expected_flags={"ask_clarifying_question": True, "use_websearch": False, "use_tools": False, "use_user_longterm_memory": False},
    ),
    PlannerTestCase(
        id="C02",
        question="Help me optimize this. What should I change?",
        expected_intent=PlanIntent.CLARIFY,
        expected_flags={"ask_clarifying_question": True, "use_websearch": False, "use_tools": False, "use_user_longterm_memory": False},
    ),
    PlannerTestCase(
        id="C03",
        question="How should we do it in Intergrax?",
        expected_intent=PlanIntent.CLARIFY,
        expected_flags={"ask_clarifying_question": True, "use_websearch": False, "use_tools": False, "use_user_longterm_memory": False},
    ),
    # Edge: contains "project" but still missing options/criteria -> clarify
    PlannerTestCase(
        id="C04",
        question="For my project, should I pick the first approach or the second one?",
        expected_intent=PlanIntent.CLARIFY,
        expected_flags={"ask_clarifying_question": True, "use_websearch": False, "use_tools": False, "use_user_longterm_memory": False},
    ),
    # Edge: ambiguous target (library? language? platform?)
    PlannerTestCase(
        id="C05",
        question="Can you compare them and tell me what's better?",
        expected_intent=PlanIntent.CLARIFY,
        expected_flags={"ask_clarifying_question": True, "use_websearch": False, "use_tools": False, "use_user_longterm_memory": False},
    ),

    PlannerTestCase(
        id="DL01",
        question="What are the most recent major changes to the OpenAI Responses API and tool calling? Provide dates.",
        expected_intent=PlanIntent.FRESHNESS,
        expected_flags={"use_websearch": True, "use_tools": False},
        expected_next_step=EngineNextStep.WEBSEARCH,
    ),

    PlannerTestCase(
        id="DL02",
        question="I have an error in my Python code but I didn't paste it. What should I do?",
        expected_intent=PlanIntent.CLARIFY,
        expected_flags={"ask_clarifying_question": True},
        expected_next_step=EngineNextStep.CLARIFY,
    ),

    PlannerTestCase(
        id="DL03",
        question="Based on our Intergrax runtime design, how should the Engine Planner and Step Planner integrate in a dynamic loop?",
        expected_intent=PlanIntent.PROJECT_ARCHITECTURE,
        expected_flags={"use_user_longterm_memory": True},
        expected_next_step=EngineNextStep.RAG,
    ),

    PlannerTestCase(
        id="DL04",
        question="Summarize the pros/cons of dynamic planning vs single-plan execution for chatGPT-like systems.",
        expected_intent=PlanIntent.GENERIC,
        expected_flags={"use_websearch": False, "use_tools": False},
        expected_next_step=EngineNextStep.SYNTHESIZE,
    ),

    PlannerTestCase(
        id="DL05",
        question="Use web search to verify the latest version of Python and summarize what's new.",
        expected_intent=PlanIntent.FRESHNESS,
        expected_flags={"use_websearch": True},
        expected_next_step=EngineNextStep.WEBSEARCH,
    )

]


In [6]:
BASE_HISTORY_EMPTY: str = ""

BASE_HISTORY_PROJECT: str = (
    "User: I am Artur. I build Intergrax and Mooff.\n"
    "User: We are working on Intergrax nexus Runtime.\n"
    "User: We are implementing a Step Planner with modes: none/static/dynamic.\n"
    "User: Observed issues: LTM always-on in generic; websearch vs tools conflation.\n"
    "User: I prefer concise, technical answers. Never use emojis in code/docs.\n"
)

# async def run_all(*, system_prompt_variant: str):
#     from tqdm.auto import tqdm

#     llm_adapter = LLMAdapterRegistry.create(LLMProvider.OLLAMA)

#     config = RuntimeConfig(
#         llm_adapter=llm_adapter,
#         enable_rag=True,
#         enable_websearch=True,
#         tools_mode="auto",
#         enable_user_longterm_memory=True,
#     )

#     planner = EnginePlanner(llm_adapter=llm_adapter)

#     for test in tqdm(TESTS, desc="Planner tests"):
#         base_history = BASE_HISTORY_PROJECT if test.use_project_context else BASE_HISTORY_EMPTY

#         res = await run_planner_testcase(
#             tc=test,
#             system_prompt=system_prompt_variant,
#             base_history=base_history,
#             planner=planner,
#             config=config,
#             run_id=test.id,
#         )

#         print_test_result(test, res)


async def run_all(
    *,
    system_prompt_variant: str,
    verbose: bool = False,
) -> Dict[str, Any]:
    
    from tqdm.auto import tqdm
    llm_adapter = LLMAdapterRegistry.create(LLMProvider.OLLAMA)

    config = RuntimeConfig(
        llm_adapter=llm_adapter,
        enable_rag=True,
        enable_websearch=True,
        tools_mode="auto",
        enable_user_longterm_memory=True,
    )

    planner = EnginePlanner(llm_adapter=llm_adapter)

    per_test: List[Dict[str, Any]] = []

    totals = {
        "total": 0,
        "pass": 0,
        "intent_ok": 0,
        "flags_ok": 0,
    }

    per_group: Dict[str, Dict[str, int]] = {}
    fails_compact: List[Dict[str, Any]] = []

    for test in tqdm(TESTS, desc="Planner tests"):
        base_history = BASE_HISTORY_PROJECT if test.use_project_context else BASE_HISTORY_EMPTY

        res = await run_planner_testcase(
            tc=test,
            system_prompt=system_prompt_variant,
            base_history=base_history,
            planner=planner,
            config=config,
            run_id=test.id,
        )

        per_test.append(res)

        # Totals
        totals["total"] += 1
        if res["pass"]:
            totals["pass"] += 1
        if res["intent_ok"]:
            totals["intent_ok"] += 1
        if res["flags_ok"]:
            totals["flags_ok"] += 1

        # Group breakdown (G/F/P/C)
        group = test.id[0]
        if group not in per_group:
            per_group[group] = {"total": 0, "pass": 0, "intent_ok": 0, "flags_ok": 0}
        per_group[group]["total"] += 1
        if res["pass"]:
            per_group[group]["pass"] += 1
        if res["intent_ok"]:
            per_group[group]["intent_ok"] += 1
        if res["flags_ok"]:
            per_group[group]["flags_ok"] += 1

        # Compact fail diagnostics
        if not res["pass"]:
            fails_compact.append(
                {
                    "test_id": res["id"],
                    "group": group,
                    "intent_ok": res["intent_ok"],
                    "flags_ok": res["flags_ok"],
                    "intent_errors": res["intent_errors"],
                    "flag_errors": res["flag_errors"],
                }
            )

        if verbose:
            print_test_result(test, res)

    def pct(x: int, denom: int) -> float:
        if denom <= 0:
            return 0.0
        return (100.0 * float(x)) / float(denom)

    summary = {
        "totals": totals,
        "accuracy": pct(totals["pass"], totals["total"]),
        "intent_accuracy": pct(totals["intent_ok"], totals["total"]),
        "flags_accuracy": pct(totals["flags_ok"], totals["total"]),
        "per_group": {
            g: {
                **st,
                "accuracy": pct(st["pass"], st["total"]),
                "intent_accuracy": pct(st["intent_ok"], st["total"]),
                "flags_accuracy": pct(st["flags_ok"], st["total"]),
            }
            for g, st in per_group.items()
        },
        "fails": fails_compact,
        # Keep full per_test only if you need deep drill-down later
        "per_test": per_test,
    }

    return summary


async def run_prompt_variants(
    *,
    variants: List[PromptVariant],
    verbose_each: bool = False,
) -> List[Dict[str, Any]]:
    results: List[Dict[str, Any]] = []

    for v in variants:
        stats = await run_all(system_prompt_variant=v.system_prompt, verbose=verbose_each)
        results.append(
            {
                "variant_id": v.id,
                "accuracy": stats["accuracy"],
                "intent_accuracy": stats["intent_accuracy"],
                "flags_accuracy": stats["flags_accuracy"],
                "stats": stats,
            }
        )

    # Sort: overall pass rate, then flags, then intent
    results.sort(
        key=lambda r: (r["accuracy"], r["flags_accuracy"], r["intent_accuracy"]),
        reverse=True,
    )

    return results


In [7]:
# from intergrax.runtime.nexus.planning.engine_plan_models import DEFAULT_PLANNER_SYSTEM_PROMPT


# await run_all(system_prompt_variant=DEFAULT_PLANNER_SYSTEM_PROMPT)


In [8]:
# system_prompt_variant = BASE_PLANNER_SYSTEM_PROMPT + """

# User long-term memory policy (STRICT, HARD RULE):
# - use_user_longterm_memory MUST be true ONLY when intent is exactly 'project_architecture' AND the capability is available.
# - For intents 'generic', 'freshness', and 'clarify': use_user_longterm_memory MUST be false.

# Websearch vs Tools policy (STRICT, HARD RULE):
# - In this runtime, websearch is NOT part of tools. Websearch is a separate pipeline.
# - Therefore, if use_websearch=true, then use_tools MUST be false.
# - Set use_tools=true ONLY for non-websearch external actions handled by the tools pipeline.
# """

# await run_all(system_prompt_variant=system_prompt_variant)

In [9]:
from intergrax.runtime.nexus.planning.engine_plan_models import DEFAULT_PLANNER_SYSTEM_PROMPT


variants = [
    # PromptVariant(id="base", system_prompt=BASE_PLANNER_SYSTEM_PROMPT),

#     PromptVariant(id="ltm_strict",system_prompt=BASE_PLANNER_SYSTEM_PROMPT + """
#         User long-term memory policy (STRICT, HARD RULE):
#         - use_user_longterm_memory MUST be true ONLY when intent is exactly 'project_architecture' AND the capability is available.
#         - For intents 'generic', 'freshness', and 'clarify': use_user_longterm_memory MUST be false.
#         """),

#     PromptVariant(id="ltm_strict_web_not_tools", system_prompt=BASE_PLANNER_SYSTEM_PROMPT + """
#         User long-term memory policy (STRICT, HARD RULE):
#         - use_user_longterm_memory MUST be true ONLY when intent is exactly 'project_architecture' AND the capability is available.
#         - For intents 'generic', 'freshness', and 'clarify': use_user_longterm_memory MUST be false.

#         Websearch vs Tools policy (STRICT, HARD RULE):
#         - In this runtime, websearch is NOT part of tools. Websearch is a separate pipeline.
#         - Therefore, if use_websearch=true, then use_tools MUST be false.
#         - Set use_tools=true ONLY for non-websearch external actions handled by the tools pipeline.
#         """,
#     ),

#     PromptVariant(id="ltm_strict_web_not_tools + clarify_strict + enum_guard", system_prompt = BASE_PLANNER_SYSTEM_PROMPT + """
#         Intent field constraints (STRICT):
#         - intent MUST be EXACTLY one of: "generic", "freshness", "project_architecture", "clarify".
#         - Do NOT output any other intent value (e.g., "compare", "choose", "optimize", "decision").

#         User long-term memory policy (STRICT, HARD RULE):
#         - use_user_longterm_memory MUST be true ONLY when intent is exactly "project_architecture" AND the capability is available.
#         - For intents "generic", "freshness", and "clarify": use_user_longterm_memory MUST be false.

#         Websearch vs Tools policy (STRICT, HARD RULE):
#         - In this runtime, websearch is NOT part of tools. Websearch is a separate pipeline.
#         - Therefore, if use_websearch=true, then use_tools MUST be false.
#         - Set use_tools=true ONLY for non-websearch external actions handled by the tools pipeline.

#         Clarify policy (STRICT, HARD RULE):
#         - Use intent="clarify" when the user's request is ambiguous or missing required details to answer.
#         - Triggers: the question asks to choose/compare/optimize ("which one", "compare", "what should I choose", "what should I change"),
#         but the options and/or decision criteria are not provided.
#         - If intent="clarify": ask_clarifying_question MUST be true and clarifying_question MUST be exactly one question.
#         - If intent!="clarify": ask_clarifying_question MUST be false and clarifying_question MUST be null.
#         """
#         ),

#     PromptVariant(id="ltm_strict_web_not_tools + clarify_strict + enum_guard + project architecture", system_prompt = BASE_PLANNER_SYSTEM_PROMPT + """

# Intent field constraints (STRICT):
# - intent MUST be EXACTLY one of: "generic", "freshness", "project_architecture", "clarify".
# - Do NOT output any other intent value (e.g., "compare", "choose", "optimize", "decision").

# User long-term memory policy (STRICT, HARD RULE):
# - use_user_longterm_memory MUST be true ONLY when intent is exactly "project_architecture" AND the capability is available.
# - For intents "generic", "freshness", and "clarify": use_user_longterm_memory MUST be false.

# Websearch vs Tools policy (STRICT, HARD RULE):
# - In this runtime, websearch is NOT part of tools. Websearch is a separate pipeline.
# - Therefore, if use_websearch=true, then use_tools MUST be false.
# - Set use_tools=true ONLY for non-websearch external actions handled by the tools pipeline.

# Clarify policy (STRICT, HARD RULE):
# - Use intent="clarify" when the user's request is ambiguous or missing required details to answer.
# - Triggers: the question asks to choose/compare/optimize ("which one", "compare", "what should I choose", "what should I change"),
#   but the options and/or decision criteria are not provided.
# - If intent="clarify": ask_clarifying_question MUST be true and clarifying_question MUST be exactly one question.
# - If intent!="clarify": ask_clarifying_question MUST be false and clarifying_question MUST be null.

# Project architecture trigger (STRICT):
# - If the question explicitly references the user's project/system by name (e.g., "Intergrax", "Mooff", "our runtime", "our architecture", "in our codebase"),
#   intent MUST be "project_architecture".

# Clarify priority (STRICT):
# - If the request is underspecified and refers to an unspecified "it/this/that" or missing object,
#   intent MUST be "clarify" even if the project name is mentioned.
# - Example: "How should we do it in Intergrax?" -> intent="clarify" (ask what "it" refers to).
# """
#         ),
    PromptVariant(id="base", system_prompt=DEFAULT_PLANNER_SYSTEM_PROMPT),
]

results = await run_prompt_variants(variants=variants, verbose_each=False)

for r in results:
    print(
        f"{r['variant_id']}: "
        f"acc={r['accuracy']:.1f}% "
        f"(intent={r['intent_accuracy']:.1f}%, flags={r['flags_accuracy']:.1f}%)"
    )


Planner tests:   0%|          | 0/24 [00:00<?, ?it/s]

base: acc=87.5% (intent=87.5%, flags=87.5%)


In [10]:
best = results[0]["stats"]
print("Fails:", len(best["fails"]))
for f in best["fails"]:
    print(f"- {f['test_id']} group={f['group']} intent_ok={f['intent_ok']} flags_ok={f['flags_ok']}")
    for e in f["intent_errors"]:
        print("  intent:", e)
    for e in f["flag_errors"]:
        print("  flags :", e)

Fails: 3
- P02 group=P intent_ok=False flags_ok=False
  intent: intent expected=PlanIntent.PROJECT_ARCHITECTURE got=PlanIntent.GENERIC
  flags : use_user_longterm_memory expected=True got=False
- C03 group=C intent_ok=False flags_ok=False
  intent: intent expected=PlanIntent.CLARIFY got=PlanIntent.GENERIC
  flags : ask_clarifying_question expected=True got=False
- DL03 group=D intent_ok=False flags_ok=False
  intent: intent expected=PlanIntent.PROJECT_ARCHITECTURE got=PlanIntent.GENERIC
  flags : use_user_longterm_memory expected=True got=False


In [11]:
results[0]

{'variant_id': 'base',
 'accuracy': 87.5,
 'intent_accuracy': 87.5,
 'flags_accuracy': 87.5,
 'stats': {'totals': {'total': 24,
   'pass': 21,
   'intent_ok': 21,
   'flags_ok': 21},
  'accuracy': 87.5,
  'intent_accuracy': 87.5,
  'flags_accuracy': 87.5,
  'per_group': {'G': {'total': 5,
    'pass': 5,
    'intent_ok': 5,
    'flags_ok': 5,
    'accuracy': 100.0,
    'intent_accuracy': 100.0,
    'flags_accuracy': 100.0},
   'F': {'total': 5,
    'pass': 5,
    'intent_ok': 5,
    'flags_ok': 5,
    'accuracy': 100.0,
    'intent_accuracy': 100.0,
    'flags_accuracy': 100.0},
   'P': {'total': 4,
    'pass': 3,
    'intent_ok': 3,
    'flags_ok': 3,
    'accuracy': 75.0,
    'intent_accuracy': 75.0,
    'flags_accuracy': 75.0},
   'C': {'total': 5,
    'pass': 4,
    'intent_ok': 4,
    'flags_ok': 4,
    'accuracy': 80.0,
    'intent_accuracy': 80.0,
    'flags_accuracy': 80.0},
   'D': {'total': 5,
    'pass': 4,
    'intent_ok': 4,
    'flags_ok': 4,
    'accuracy': 80.0,
    'int