# © Artur Czarnecki. All rights reserved.
# Intergrax framework – proprietary and confidential.
# Use, modification, or distribution without written permission is prohibited.

# StepPlanner – Dry-Run Test Notebook (STATIC vs DYNAMIC)

## Session Goal
Validate StepPlanner behavior without executing any tools:
- model validations
- correctness of built plans (order, depends_on, mode)
- consistency between STATIC and DYNAMIC planning for the same step types

## Testing Strategy
We will add notebook cells incrementally (one cell per message).
Configuration (imports, sys.path, planner instances, shared constants) will be collected into a single dedicated configuration cell.

In [1]:
import sys, os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "..", "..")))

## LLM-driven StepPlanner tests (EnginePlan → ExecutionPlan)

We will test **real plans generated by the LLM** via `EnginePlanner.plan()`.

Pipeline:
1) Build `EnginePlan` using `build_plan(...)` (LLM-based)
2) Convert to `ExecutionPlan` using `StepPlanner.build_from_engine_plan(...)`

We will test both:
- **STATIC** (full plan built upfront)
- **DYNAMIC** (one-step plan based on `engine_plan.next_step`)

Key assertions:
- mode correctness (EXECUTE vs ITERATE)
- step order and `depends_on` correctness (for STATIC)
- single-step plans (for DYNAMIC)
- plan contains required capabilities inferred from LLM decision (websearch/ltm/rag/tools)


In [2]:
from __future__ import annotations

from dataclasses import dataclass
from typing import List, Optional

from intergrax.llm.messages import ChatMessage
from intergrax.runtime.drop_in_knowledge_mode.config import RuntimeConfig
from intergrax.runtime.drop_in_knowledge_mode.planning.engine_plan_models import EnginePlan
from intergrax.runtime.drop_in_knowledge_mode.planning.stepplan_models import (
    ExecutionPlan,
    PlanBuildMode,
)
from intergrax.runtime.drop_in_knowledge_mode.planning.plan_builder_helper import build_plan
from intergrax.runtime.drop_in_knowledge_mode.planning.step_planner import StepPlanner, StepPlannerConfig


@dataclass(frozen=True)
class PlannerContext:
    """
    All inputs required to run LLM-based planning and then build an ExecutionPlan.
    Keep this explicit and strongly typed.

    The dataclass is frozen, so the two fields cannot be rebound after creation.
    """
    # Runtime configuration; `llm_engine_plan` forwards it to `build_plan(...)`.
    config: RuntimeConfig
    # Planner whose `build_from_engine_plan(...)` turns an EnginePlan into an ExecutionPlan.
    step_planner: StepPlanner


def make_planner_context(*, config: RuntimeConfig, step_cfg: Optional[StepPlannerConfig] = None) -> PlannerContext:
    """
    Bundle a runtime config with a newly constructed StepPlanner.

    Args:
        config: Runtime configuration stored on the context unchanged.
        step_cfg: Optional StepPlanner configuration, passed to `StepPlanner` as
            `cfg` (a None value is forwarded as-is).

    Returns:
        A new PlannerContext holding `config` and the new StepPlanner.
    """
    planner = StepPlanner(cfg=step_cfg)
    return PlannerContext(config=config, step_planner=planner)


async def llm_engine_plan(
    *,
    ctx: PlannerContext,
    message: str,
    user_id: str,
    session_id: Optional[str] = None,
    run_id: Optional[str] = None,
    base_history: Optional[List[ChatMessage]] = None,
) -> EnginePlan:
    """
    LLM-based stage of the pipeline: obtain an EnginePlan for `message`.

    This is a thin keyword-only wrapper around the `build_plan(...)` helper.
    The runtime config comes from `ctx`; every other argument is forwarded
    unchanged.

    Args:
        ctx: Planner context; only `ctx.config` is read here.
        message: User message to plan for.
        user_id: User identifier passed through to `build_plan`.
        session_id: Optional session identifier passed through.
        run_id: Optional run identifier passed through.
        base_history: Optional prior chat messages passed through.

    Returns:
        The EnginePlan awaited from `build_plan(...)`.
    """
    forwarded = {
        "config": ctx.config,
        "message": message,
        "user_id": user_id,
        "session_id": session_id,
        "run_id": run_id,
        "base_history": base_history,
    }
    engine_plan = await build_plan(**forwarded)
    return engine_plan


def execution_plan_from_engine_plan(
    *,
    ctx: PlannerContext,
    user_message: str,
    engine_plan: EnginePlan,
    build_mode: PlanBuildMode,
    plan_id: Optional[str] = None,
) -> ExecutionPlan:
    """
    Deterministic stage of the pipeline: EnginePlan -> ExecutionPlan.

    Delegates to `ctx.step_planner.build_from_engine_plan(...)`; no LLM call
    is made in this function itself.

    Args:
        ctx: Planner context; only `ctx.step_planner` is used.
        user_message: Original user message the plan was produced for.
        engine_plan: Plan previously obtained from the engine planner.
        build_mode: Plan build mode (the notebook uses STATIC and DYNAMIC).
        plan_id: Optional identifier forwarded to the step planner.

    Returns:
        The ExecutionPlan returned by the step planner.
    """
    planner = ctx.step_planner
    execution_plan = planner.build_from_engine_plan(
        user_message=user_message,
        engine_plan=engine_plan,
        plan_id=plan_id,
        build_mode=build_mode,
    )
    return execution_plan


  from tqdm.autonotebook import tqdm, trange





## Configuration

This cell builds:
- the LLM adapter
- `RuntimeConfig` used by the LLM-driven EnginePlanner helper (`build_plan`)
- `StepPlannerConfig` (deterministic, no LLM inside)
- `PlannerContext` (typed bundle passed into helper functions)

All configuration is explicit and centralized in this single cell.


In [3]:
# --- LLM adapter (same pattern as engine planner notebook) ---
from intergrax.llm_adapters.llm_provider import LLMProvider
from intergrax.llm_adapters.llm_provider_registry import LLMAdapterRegistry
from intergrax.runtime.drop_in_knowledge_mode.planning.stepplan_models import OutputFormat, WebSearchStrategy


# Adapter obtained from the registry for the OLLAMA provider.
llm_adapter = LLMAdapterRegistry.create(LLMProvider.OLLAMA)

# --- Runtime config used by build_plan(...) / EnginePlanner planning ---
# RAG, websearch and user long-term memory are all enabled and tools_mode is "auto";
# presumably this leaves the capability choice to the LLM planner (confirm against RuntimeConfig).
config = RuntimeConfig(
    llm_adapter=llm_adapter,
    enable_rag=True,
    enable_websearch=True,
    tools_mode="auto",
    enable_user_longterm_memory=True,
)

# --- StepPlanner deterministic config ---
# Fixed formatting, websearch and budget settings handed to StepPlanner via `cfg`.
step_cfg = StepPlannerConfig(
    final_answer_style="concise_technical",
    final_format=OutputFormat.MARKDOWN,
    step_max_chars=2000,
    web_top_k=5,
    web_max_results=5,
    web_recency_days=30,
    web_strategy=WebSearchStrategy.HYBRID,
    max_total_steps=6,
    max_total_tool_calls=3,
    max_total_web_queries=5,
    max_total_chars_context=12000,
    max_total_tokens_output=None,
)

# --- Typed planner context ---
# Module-level `ctx` is the single context object the later test cells pass to the helpers.
ctx = make_planner_context(config=config, step_cfg=step_cfg)

## E2E STATIC test 1: Freshness query

We test the full planning pipeline:

LLM (EnginePlanner) → EnginePlan → StepPlanner → ExecutionPlan (STATIC)

Expectations:
- LLM sets `use_websearch=True` and `intent=FRESHNESS`
- StepPlanner builds a full EXECUTE plan
- Plan contains WEBSEARCH as a pre-step
- Plan ends with FINAL (`StepId.FINAL`)


In [4]:
from intergrax.runtime.drop_in_knowledge_mode.planning.stepplan_models import (
    PlanBuildMode,
    PlanMode,
    StepId,
)
from intergrax.runtime.drop_in_knowledge_mode.planning.engine_plan_models import PlanIntent

# Freshness-style question used for the STATIC end-to-end run.
TEST_MSG = "What are the most recent major changes to the OpenAI Responses API and tool calling? Provide dates."

# Step 1: LLM-based planning. Top-level `await` works in a notebook cell, not in a plain script.
engine_plan = await llm_engine_plan(
    ctx=ctx,
    message=TEST_MSG,
    user_id="e2e-user",
    session_id="e2e-session-static-freshness",
    run_id="e2e-static-freshness-001",
)

# Dump the LLM decision so a failing assertion below can be diagnosed from the cell output.
print("ENGINE PLAN:")
print("  intent    :", engine_plan.intent)
print("  next_step :", engine_plan.next_step)
print("  use_web   :", engine_plan.use_websearch)
print("  use_ltm   :", engine_plan.use_user_longterm_memory)
print("  use_rag   :", engine_plan.use_rag)
print("  use_tools :", engine_plan.use_tools)

# Step 2: deterministic conversion of the EnginePlan into a full (STATIC) ExecutionPlan.
plan = execution_plan_from_engine_plan(
    ctx=ctx,
    user_message=TEST_MSG,
    engine_plan=engine_plan,
    build_mode=PlanBuildMode.STATIC,
    plan_id="e2e-static-freshness-001",
)

print("\nEXECUTION PLAN:")
print("  mode  :", plan.mode)
print("  intent:", plan.intent)
print("  steps :", [s.step_id for s in plan.steps])

# --- Assertions ---

# Structural invariants of a STATIC plan: EXECUTE mode and FINAL as the last step.
assert plan.mode == PlanMode.EXECUTE, f"Expected EXECUTE plan, got {plan.mode}"
assert plan.steps[-1].step_id == StepId.FINAL, "STATIC plan must end with FINAL"

step_ids = [s.step_id for s in plan.steps]

# Capability alignment with LLM decision
# Each check only fires when the LLM set the corresponding flag, so this cell does not
# itself require use_websearch=True or intent=FRESHNESS.
if engine_plan.use_websearch:
    assert StepId.WEBSEARCH in step_ids, "LLM requested websearch but plan has no WEBSEARCH step"
if engine_plan.use_rag:
    assert StepId.RAG in step_ids, "LLM requested rag but plan has no RAG step"
if engine_plan.use_tools:
    assert StepId.TOOLS in step_ids, "LLM requested tools but plan has no TOOLS step"
if engine_plan.use_user_longterm_memory:
    assert StepId.LTM_SEARCH in step_ids, "LLM requested LTM but plan has no LTM_SEARCH step"

print("OK: E2E STATIC Freshness test passed.")


ENGINE PLAN:
  intent    : PlanIntent.FRESHNESS
  next_step : EngineNextStep.WEBSEARCH
  use_web   : True
  use_ltm   : False
  use_rag   : False
  use_tools : False

EXECUTION PLAN:
  mode  : PlanMode.EXECUTE
  intent: PlanIntent.FRESHNESS
  steps : [<StepId.WEBSEARCH: 'websearch'>, <StepId.DRAFT: 'draft'>, <StepId.VERIFY: 'verify'>, <StepId.FINAL: 'final'>]
OK: E2E STATIC Freshness test passed.


## E2E STATIC test 2: Project architecture question (LTM expected)

Goal: verify the planner can distinguish project-internal questions from freshness questions.

Pipeline:
LLM (EnginePlanner) → EnginePlan → StepPlanner → ExecutionPlan (STATIC)

Expectations:
- LLM selects `intent=PROJECT_ARCHITECTURE`
- LLM sets `use_user_longterm_memory=True` (project context retrieval)
- LLM should NOT require websearch for an internal architecture question
- StepPlanner builds EXECUTE plan with LTM_SEARCH as a pre-step (if LTM is requested)


In [5]:
from intergrax.llm.messages import ChatMessage

# Project background text; sent below as a single user ChatMessage in `base_history`.
BASE_HISTORY_PROJECT: str = (
    "User: I am Artur. I build Intergrax and Mooff.\n"
    "User: We are working on Intergrax Drop-In Knowledge Runtime.\n"
    "User: We are implementing a Step Planner with modes: none/static/dynamic.\n"
    "User: Observed issues: LTM always-on in generic; websearch vs tools conflation.\n"
    "User: I prefer concise, technical answers. Never use emojis in code/docs.\n"
)

# Project-internal architecture question used by this cell.
TEST_MSG = "In Intergrax Drop-In Knowledge Runtime, explain how StepPlanner integrates with EnginePlanner in DYNAMIC mode."

from typing import Tuple

def assert_project_arch_usefulness(engine_plan: EnginePlan) -> Tuple[bool, str]:
    """
    Judge whether an EnginePlan is a sensible answer path for an internal
    architecture question.

    Despite the name, nothing is asserted here: the verdict is returned as
    (ok, reason) and the caller applies the hard assertion.

    Rules, checked in this order:
    - websearch selected        -> (False, "unexpected_websearch")
    - tools selected            -> (False, "unexpected_tools")
    - neither LTM nor RAG used  -> (False, "no_internal_retrieval_selected")
    - otherwise                 -> (True, "ok")
    """
    # External capabilities that must stay off for a project-internal question.
    disallowed = (
        ("use_websearch", "unexpected_websearch"),
        ("use_tools", "unexpected_tools"),
    )
    for flag_name, reason in disallowed:
        if getattr(engine_plan, flag_name):
            return False, reason

    # At least one internal knowledge path (LTM or RAG) has to be selected.
    uses_internal = engine_plan.use_user_longterm_memory or engine_plan.use_rag
    if uses_internal:
        return True, "ok"
    return False, "no_internal_retrieval_selected"

# Re-declares TEST_MSG with the same text already assigned earlier in this cell.
TEST_MSG = "In Intergrax Drop-In Knowledge Runtime, explain how StepPlanner integrates with EnginePlanner in DYNAMIC mode."

# Step 1: LLM-based planning, this time with the project background supplied as base history.
# Top-level `await` works in a notebook cell, not in a plain script.
engine_plan = await llm_engine_plan(
    ctx=ctx,
    message=TEST_MSG,
    user_id="e2e-user",
    session_id="e2e-session-static-project-arch",
    run_id="e2e-static-project-arch-003",
    base_history=[ChatMessage(role="user", content=BASE_HISTORY_PROJECT)],
)

# Dump the LLM decision so a failing assertion below can be diagnosed from the cell output.
print("ENGINE PLAN:")
print("  intent    :", engine_plan.intent)
print("  next_step :", engine_plan.next_step)
print("  use_web   :", engine_plan.use_websearch)
print("  use_ltm   :", engine_plan.use_user_longterm_memory)
print("  use_rag   :", engine_plan.use_rag)
print("  use_tools :", engine_plan.use_tools)

# Step 2: deterministic conversion into a full (STATIC) ExecutionPlan.
plan = execution_plan_from_engine_plan(
    ctx=ctx,
    user_message=TEST_MSG,
    engine_plan=engine_plan,
    build_mode=PlanBuildMode.STATIC,
    plan_id="e2e-static-project-arch-003",
)

print("\nEXECUTION PLAN:")
print("  mode  :", plan.mode)
print("  intent:", plan.intent)
print("  steps :", [s.step_id for s in plan.steps])

# Hard invariants
assert plan.mode == PlanMode.EXECUTE
assert plan.steps[-1].step_id == StepId.FINAL

# Hard usefulness expectations
# Fails when the LLM selected websearch or tools, or selected neither LTM nor RAG.
ok, reason = assert_project_arch_usefulness(engine_plan)
assert ok, f"Project-arch usefulness check failed: {reason}"

# Soft metric: intent label
# Only printed, never asserted, so a different intent label does not fail the cell.
intent_ok = (engine_plan.intent == PlanIntent.PROJECT_ARCHITECTURE)
print("\nMETRIC intent_ok =", intent_ok, "| expected:", PlanIntent.PROJECT_ARCHITECTURE, "| got:", engine_plan.intent)

print("OK: E2E STATIC Project architecture usefulness passed (intent label tracked separately).")



ENGINE PLAN:
  intent    : PlanIntent.GENERIC
  next_step : EngineNextStep.SYNTHESIZE
  use_web   : True
  use_ltm   : False
  use_rag   : True
  use_tools : False

EXECUTION PLAN:
  mode  : PlanMode.EXECUTE
  intent: PlanIntent.GENERIC
  steps : [<StepId.RAG: 'rag'>, <StepId.DRAFT: 'draft'>, <StepId.VERIFY: 'verify'>, <StepId.FINAL: 'final'>]


AssertionError: Project-arch usefulness check failed: unexpected_websearch

## E2E DYNAMIC test 1: Freshness query (one-step ITERATE plan)

Pipeline:
LLM (EnginePlanner) → EnginePlan(next_step=WEBSEARCH) → StepPlanner → ExecutionPlan (DYNAMIC)

Expectations:
- `ExecutionPlan.mode == ITERATE`
- `len(steps) == 1`
- the single step matches `engine_plan.next_step`
  - for WEBSEARCH: step_id == WEBSEARCH


In [6]:
from intergrax.runtime.drop_in_knowledge_mode.planning.stepplan_models import (
    PlanBuildMode,
    PlanMode,
    StepId,
)
from intergrax.runtime.drop_in_knowledge_mode.planning.engine_plan_models import EngineNextStep

# Freshness-style question used for the DYNAMIC end-to-end run.
TEST_MSG = "What are the most recent major changes to the OpenAI Responses API and tool calling? Provide dates."

# Step 1: LLM-based planning. Top-level `await` works in a notebook cell, not in a plain script.
engine_plan = await llm_engine_plan(
    ctx=ctx,
    message=TEST_MSG,
    user_id="e2e-user",
    session_id="e2e-session-dynamic-freshness",
    run_id="e2e-dynamic-freshness-001",
)

# Dump the LLM decision so a failing assertion below can be diagnosed from the cell output.
print("ENGINE PLAN:")
print("  message   :", TEST_MSG)
print("  intent    :", engine_plan.intent)
print("  next_step :", engine_plan.next_step)
print("  use_web   :", engine_plan.use_websearch)
print("  use_ltm   :", engine_plan.use_user_longterm_memory)
print("  use_rag   :", engine_plan.use_rag)
print("  use_tools :", engine_plan.use_tools)

# Step 2: deterministic conversion into a DYNAMIC ExecutionPlan.
plan = execution_plan_from_engine_plan(
    ctx=ctx,
    user_message=TEST_MSG,
    engine_plan=engine_plan,
    build_mode=PlanBuildMode.DYNAMIC,
    plan_id="e2e-dynamic-freshness-001",
)

print("\nEXECUTION PLAN:")
print("  mode  :", plan.mode)
print("  intent:", plan.intent)
print("  steps :", [s.step_id for s in plan.steps])

# --- Assertions ---
# A DYNAMIC plan is expected to be in ITERATE mode and to carry exactly one step.
assert plan.mode == PlanMode.ITERATE, f"Expected ITERATE, got {plan.mode}"
assert len(plan.steps) == 1, f"DYNAMIC plan must have exactly 1 step, got {len(plan.steps)}"

single = plan.steps[0].step_id

# Ensure StepPlanner respected the next_step mapping
# Whatever next_step the LLM chose, the single step must be its StepId counterpart
# (SYNTHESIZE maps to DRAFT, FINALIZE maps to FINAL).
if engine_plan.next_step == EngineNextStep.WEBSEARCH:
    assert single == StepId.WEBSEARCH, f"Expected WEBSEARCH step, got {single}"
elif engine_plan.next_step == EngineNextStep.RAG:
    assert single == StepId.RAG, f"Expected RAG step, got {single}"
elif engine_plan.next_step == EngineNextStep.TOOLS:
    assert single == StepId.TOOLS, f"Expected TOOLS step, got {single}"
elif engine_plan.next_step == EngineNextStep.SYNTHESIZE:
    assert single == StepId.DRAFT, f"Expected DRAFT step, got {single}"
elif engine_plan.next_step == EngineNextStep.VERIFY:
    assert single == StepId.VERIFY, f"Expected VERIFY step, got {single}"
elif engine_plan.next_step == EngineNextStep.FINALIZE:
    assert single == StepId.FINAL, f"Expected FINAL step, got {single}"
elif engine_plan.next_step == EngineNextStep.CLARIFY:
    assert single == StepId.CLARIFY, f"Expected CLARIFY step, got {single}"
else:
    # A next_step value not listed above fails the cell instead of passing silently.
    raise AssertionError(f"Unhandled EngineNextStep: {engine_plan.next_step}")

print("OK: E2E DYNAMIC Freshness one-step plan passed.")


ENGINE PLAN:
  message   : What are the most recent major changes to the OpenAI Responses API and tool calling? Provide dates.
  intent    : PlanIntent.FRESHNESS
  next_step : EngineNextStep.WEBSEARCH
  use_web   : True
  use_ltm   : False
  use_rag   : False
  use_tools : False

EXECUTION PLAN:
  mode  : PlanMode.ITERATE
  intent: PlanIntent.FRESHNESS
  steps : [<StepId.WEBSEARCH: 'websearch'>]
OK: E2E DYNAMIC Freshness one-step plan passed.


## Dataset-driven tests, extended with StepPlanner validation

For each test case:
1) Run LLM EnginePlanner to produce `EnginePlan`
2) Build `ExecutionPlan` via StepPlanner in two modes:
   - STATIC  (full plan)
   - DYNAMIC (single-step plan)

We validate:
- EnginePlan: intent / flags / next_step vs expected
- ExecutionPlan STATIC: correct mode + required steps + tail
- ExecutionPlan DYNAMIC: mode ITERATE + exactly 1 step matching expected next_step mapping

In [7]:
from __future__ import annotations

from dataclasses import dataclass
from typing import List, Optional, Set, Dict, Tuple

from intergrax.runtime.drop_in_knowledge_mode.planning.engine_plan_models import (
    PlanIntent,
    EngineNextStep,
)
from intergrax.runtime.drop_in_knowledge_mode.planning.stepplan_models import StepId


@dataclass(frozen=True)
class EngineExpect:
    """
    Expected EnginePlan values for one test case.

    Every field defaults to None, and the validators skip any field left as None.
    """
    # Expected plan intent / next step; None -> do not assert
    intent: Optional[PlanIntent] = None
    next_step: Optional[EngineNextStep] = None

    # Flags: if None -> do not assert
    use_websearch: Optional[bool] = None
    # Compared against EnginePlan.use_user_longterm_memory by the validators.
    use_ltm: Optional[bool] = None
    use_rag: Optional[bool] = None
    use_tools: Optional[bool] = None


@dataclass(frozen=True)
class StepExpect:
    """
    Expected ExecutionPlan shape for one test case, for both build modes.
    """
    # STATIC expectations (subset-based, not exact chain unless you want)
    # Steps that must appear somewhere in the STATIC plan.
    static_must_include: Set[StepId]
    # Steps that must not appear anywhere in the STATIC plan.
    static_forbid: Set[StepId]

    # DYNAMIC expectations: single step expected after mapping next_step
    dynamic_single_step: StepId


@dataclass(frozen=True)
class PlannerTestCase:
    """
    One dataset entry: a question plus the expectations for the EnginePlan and
    for the STATIC / DYNAMIC ExecutionPlans built from it.
    """
    # e.g. "F01", "P01"; the runners use the first character as the group key
    id: str                 # e.g. "F01", "P01" like 12b groups
    # User message sent to the LLM planner.
    question: str
    # True -> the runners send BASE_HISTORY_PROJECT as base history; False -> no history.
    use_project_context: bool
    # Expectations for the EnginePlan.
    engine: EngineExpect
    # Expectations for the ExecutionPlans.
    step: StepExpect


# Minimal starter set; you can expand to your full suite like 12b
# Dataset consumed by the suite runners; group letter = first character of `id`.
TESTS: List[PlannerTestCase] = [
    # F01: freshness question; every engine field is pinned.
    PlannerTestCase(
        id="F01",
        question="What are the most recent major changes to the OpenAI Responses API and tool calling? Provide dates.",
        use_project_context=False,
        engine=EngineExpect(
            intent=PlanIntent.FRESHNESS,
            next_step=EngineNextStep.WEBSEARCH,
            use_websearch=True,
            use_rag=False,
            use_tools=False,
            use_ltm=False,
        ),
        step=StepExpect(
            static_must_include={StepId.WEBSEARCH, StepId.DRAFT, StepId.VERIFY, StepId.FINAL},
            static_forbid={StepId.LTM_SEARCH, StepId.TOOLS, StepId.RAG},
            dynamic_single_step=StepId.WEBSEARCH,
        ),
    ),
    # G01: general-knowledge question; only intent and use_websearch are pinned,
    # next_step and the rag/tools/ltm flags are left as None (not asserted).
    PlannerTestCase(
        id="G01",
        question="Explain the main differences between REST and GraphQL.",
        use_project_context=False,
        engine=EngineExpect(
            intent=PlanIntent.GENERIC,
            use_websearch=False,  # prefer no web for this general question
        ),
        step=StepExpect(
            static_must_include={StepId.DRAFT, StepId.VERIFY, StepId.FINAL},
            static_forbid={StepId.WEBSEARCH},  # allow RAG/tools depending on your policies; keep it light
            dynamic_single_step=StepId.DRAFT,  # likely SYNTHESIZE; if engine picks differently, this will flag it
        ),
    ),
]

print(f"Loaded tests: {len(TESTS)}")


Loaded tests: 2


In [8]:
from __future__ import annotations

from typing import Any

from intergrax.runtime.drop_in_knowledge_mode.planning.stepplan_models import (
    PlanBuildMode,
    PlanMode,
    ExecutionPlan,
)
from intergrax.runtime.drop_in_knowledge_mode.planning.engine_plan_models import EnginePlan


def _engine_errors(got: EnginePlan, exp: EngineExpect) -> List[str]:
    """
    Compare an EnginePlan against its expectations.

    A field whose expected value is None is skipped. Every other field is
    compared with `!=`, and each mismatch yields one message of the form
    "<name> expected=<expected> got=<actual>".

    Returns:
        Mismatch messages in the fixed order intent, next_step, use_websearch,
        use_ltm, use_rag, use_tools; an empty list when everything matches.
    """
    # (label in the message, attribute on the plan, attribute on the expectation)
    comparisons = (
        ("intent", "intent", "intent"),
        ("next_step", "next_step", "next_step"),
        ("use_websearch", "use_websearch", "use_websearch"),
        ("use_ltm", "use_user_longterm_memory", "use_ltm"),
        ("use_rag", "use_rag", "use_rag"),
        ("use_tools", "use_tools", "use_tools"),
    )

    problems: List[str] = []
    for label, plan_attr, expect_attr in comparisons:
        wanted = getattr(exp, expect_attr)
        if wanted is None:
            continue
        actual = getattr(got, plan_attr)
        if actual != wanted:
            problems.append(f"{label} expected={wanted} got={actual}")
    return problems


def _step_errors_static(plan: ExecutionPlan, exp: StepExpect) -> List[str]:
    """
    Validate a STATIC ExecutionPlan and return every problem found.

    Checks, in reporting order:
    1. the plan mode is EXECUTE,
    2. every step in `exp.static_must_include` is present,
    3. no step in `exp.static_forbid` is present,
    4. the plan is non-empty and its last step is FINAL,
    5. each step only depends on steps that appear earlier in the plan.

    Returns:
        Error messages; an empty list means the plan passed.
    """
    problems: List[str] = []

    if plan.mode != PlanMode.EXECUTE:
        problems.append(f"static mode expected=EXECUTE got={plan.mode}")

    present = [step.step_id for step in plan.steps]

    absent = []
    for required in exp.static_must_include:
        if required not in present:
            absent.append(required)
    if absent:
        problems.append(f"static missing steps: {absent}")

    unwanted = []
    for banned in exp.static_forbid:
        if banned in present:
            unwanted.append(banned)
    if unwanted:
        problems.append(f"static forbidden steps present: {unwanted}")

    ends_with_final = bool(plan.steps) and plan.steps[-1].step_id == StepId.FINAL
    if not ends_with_final:
        problems.append("static plan must end with FINAL")

    # A dependency is valid only if it names a step already emitted earlier in the plan.
    earlier = set()
    for step in plan.steps:
        dependencies = step.depends_on or []
        problems.extend(
            f"forward dependency: {step.step_id} depends_on {dep} (not seen yet)"
            for dep in dependencies
            if dep not in earlier
        )
        earlier.add(step.step_id)

    return problems


def _map_next_step_to_step_id(next_step: EngineNextStep) -> StepId:
    """
    Translate an engine-level next step into the StepId a one-step DYNAMIC plan
    is expected to contain.

    Mapping: WEBSEARCH -> WEBSEARCH, RAG -> RAG, TOOLS -> TOOLS,
    SYNTHESIZE -> DRAFT, VERIFY -> VERIFY, FINALIZE -> FINAL, CLARIFY -> CLARIFY.

    Args:
        next_step: The `next_step` value of an EnginePlan.

    Returns:
        The corresponding StepId.

    Raises:
        ValueError: If `next_step` is not one of the values listed above.
    """
    if next_step == EngineNextStep.WEBSEARCH:
        return StepId.WEBSEARCH
    if next_step == EngineNextStep.RAG:
        return StepId.RAG
    if next_step == EngineNextStep.TOOLS:
        return StepId.TOOLS
    if next_step == EngineNextStep.SYNTHESIZE:
        return StepId.DRAFT
    # VERIFY was missing here, although the single-case DYNAMIC test handles it;
    # without this branch a VERIFY next step raised ValueError.
    if next_step == EngineNextStep.VERIFY:
        return StepId.VERIFY
    if next_step == EngineNextStep.FINALIZE:
        return StepId.FINAL
    if next_step == EngineNextStep.CLARIFY:
        return StepId.CLARIFY
    raise ValueError(f"Unhandled EngineNextStep: {next_step}")


def _step_errors_dynamic(plan: ExecutionPlan, exp: StepExpect, engine_plan: EnginePlan) -> List[str]:
    """
    Validate a DYNAMIC ExecutionPlan and return every problem found.

    Checks:
    1. the plan mode is ITERATE,
    2. the plan has exactly one step (validation stops here if it does not),
    3. that step equals `exp.dynamic_single_step`,
    4. that step equals the StepId mapped from `engine_plan.next_step`.

    A `next_step` the mapping helper does not know is reported as an error
    entry instead of propagating ValueError, so one unexpected LLM answer
    cannot abort a whole suite run.

    Returns:
        Error messages; an empty list means the plan passed.
    """
    errs: List[str] = []
    if plan.mode != PlanMode.ITERATE:
        errs.append(f"dynamic mode expected=ITERATE got={plan.mode}")

    if len(plan.steps) != 1:
        errs.append(f"dynamic len(steps) expected=1 got={len(plan.steps)}")
        return errs  # can't validate further reliably

    expected_single = exp.dynamic_single_step
    got_single = plan.steps[0].step_id
    if got_single != expected_single:
        errs.append(f"dynamic single step expected={expected_single} got={got_single}")

    # Also ensure it matches engine_plan.next_step mapping (independent safety net)
    try:
        mapped = _map_next_step_to_step_id(engine_plan.next_step)
    except ValueError as exc:
        errs.append(f"dynamic next_step has no StepId mapping: {exc}")
        return errs

    if got_single != mapped:
        errs.append(f"dynamic mismatch vs engine next_step mapping: mapped={mapped} got={got_single}")

    return errs


In [10]:
from __future__ import annotations

from typing import Any, Dict

from tqdm.auto import tqdm
from intergrax.llm.messages import ChatMessage


# Empty history text: `run_case` passes an empty base_history list for it.
BASE_HISTORY_EMPTY: str = ""

# Project background text: `run_case` sends it as a single user ChatMessage
# for test cases with `use_project_context=True`.
BASE_HISTORY_PROJECT: str = (
    "User: I am Artur. I build Intergrax and Mooff.\n"
    "User: We are working on Intergrax Drop-In Knowledge Runtime.\n"
    "User: We are implementing a Step Planner with modes: none/static/dynamic.\n"
    "User: Observed issues: LTM always-on in generic; websearch vs tools conflation.\n"
    "User: I prefer concise, technical answers. Never use emojis in code/docs.\n"
)


async def run_case(tc: PlannerTestCase) -> Dict[str, Any]:
    """
    Run one dataset case end to end and collect its validation results.

    Steps:
    1. obtain an EnginePlan from the LLM planner (with project history when
       `tc.use_project_context` is set, otherwise with an empty history),
    2. build a STATIC and then a DYNAMIC ExecutionPlan from that same EnginePlan,
    3. validate the EnginePlan and both ExecutionPlans against `tc`.

    Returns:
        A dict with the case id, the overall "pass" flag (True only when all
        three error lists are empty), the three error lists, the EnginePlan and
        the step ids of both ExecutionPlans.
    """
    history_text = BASE_HISTORY_PROJECT if tc.use_project_context else BASE_HISTORY_EMPTY
    history = []
    if history_text:
        history = [ChatMessage(role="user", content=history_text)]

    engine_plan = await llm_engine_plan(
        ctx=ctx,
        message=tc.question,
        user_id="suite-user",
        session_id=f"suite-session-{tc.id}",
        run_id=f"suite-{tc.id}",
        base_history=history,
    )

    def build(mode: PlanBuildMode, label: str) -> ExecutionPlan:
        # Both plans come from the same EnginePlan; only mode and plan id differ.
        return execution_plan_from_engine_plan(
            ctx=ctx,
            user_message=tc.question,
            engine_plan=engine_plan,
            build_mode=mode,
            plan_id=f"suite-{label}-{tc.id}",
        )

    static_plan = build(PlanBuildMode.STATIC, "static")
    dynamic_plan = build(PlanBuildMode.DYNAMIC, "dynamic")

    engine_problems = _engine_errors(engine_plan, tc.engine)
    static_problems = _step_errors_static(static_plan, tc.step)
    dynamic_problems = _step_errors_dynamic(dynamic_plan, tc.step, engine_plan)

    # The case passes only when none of the validators reported anything.
    all_clear = not any((engine_problems, static_problems, dynamic_problems))

    return {
        "id": tc.id,
        "pass": all_clear,
        "engine_errors": engine_problems,
        "static_errors": static_problems,
        "dynamic_errors": dynamic_problems,
        "engine_plan": engine_plan,
        "static_steps": [step.step_id for step in static_plan.steps],
        "dynamic_steps": [step.step_id for step in dynamic_plan.steps],
    }


async def run_all(verbose: bool = False) -> Dict[str, Any]:
    """
    Run every case in the module-level TESTS list, one after another, and
    aggregate the results.

    Args:
        verbose: When True, print the step lists and any error lists of each
            case as soon as it finishes.

    Returns:
        A summary dict with:
        - "totals": overall {"total", "pass"} counters,
        - "accuracy": overall pass percentage (0.0 when there are no cases),
        - "per_group": the same counters plus "accuracy" per group, where the
          group is the first character of the case id,
        - "fails": a compact entry for every failed case,
        - "per_test": the full result dict of every case.
    """

    def pct(x: int, denom: int) -> float:
        if denom <= 0:
            return 0.0
        return (100.0 * float(x)) / float(denom)

    overall = {"total": 0, "pass": 0}
    groups: Dict[str, Dict[str, int]] = {}
    failures: List[Dict[str, Any]] = []
    results: List[Dict[str, Any]] = []

    for tc in tqdm(TESTS, desc="Engine+Step planner tests"):
        outcome = await run_case(tc)
        results.append(outcome)

        group_key = tc.id[0]
        bucket = groups.setdefault(group_key, {"total": 0, "pass": 0})

        overall["total"] += 1
        bucket["total"] += 1
        if outcome["pass"]:
            overall["pass"] += 1
            bucket["pass"] += 1
        else:
            failures.append(
                {
                    "id": outcome["id"],
                    "group": outcome["id"][0],
                    "engine_errors": outcome["engine_errors"],
                    "static_errors": outcome["static_errors"],
                    "dynamic_errors": outcome["dynamic_errors"],
                }
            )

        if not verbose:
            continue

        print("\n====================")
        print("TEST", outcome["id"], "PASS" if outcome["pass"] else "FAIL")
        print("STATIC :", outcome["static_steps"])
        print("DYNAMIC:", outcome["dynamic_steps"])
        # Error lists are printed only when non-empty.
        for label, key in (
            ("ENGINE ERR:", "engine_errors"),
            ("STATIC ERR:", "static_errors"),
            ("DYNAMIC ERR:", "dynamic_errors"),
        ):
            if outcome[key]:
                print(label, outcome[key])

    group_summary = {}
    for group_key, counters in groups.items():
        group_summary[group_key] = dict(counters, accuracy=pct(counters["pass"], counters["total"]))

    return {
        "totals": overall,
        "accuracy": pct(overall["pass"], overall["total"]),
        "per_group": group_summary,
        "fails": failures,
        "per_test": results,
    }


await run_all(verbose=True)

Engine+Step planner tests:   0%|          | 0/2 [00:00<?, ?it/s]


TEST F01 PASS
STATIC : [<StepId.WEBSEARCH: 'websearch'>, <StepId.DRAFT: 'draft'>, <StepId.VERIFY: 'verify'>, <StepId.FINAL: 'final'>]
DYNAMIC: [<StepId.WEBSEARCH: 'websearch'>]

TEST G01 PASS
STATIC : [<StepId.RAG: 'rag'>, <StepId.DRAFT: 'draft'>, <StepId.VERIFY: 'verify'>, <StepId.FINAL: 'final'>]
DYNAMIC: [<StepId.DRAFT: 'draft'>]


{'totals': {'total': 2, 'pass': 2},
 'accuracy': 100.0,
 'per_group': {'F': {'total': 1, 'pass': 1, 'accuracy': 100.0},
  'G': {'total': 1, 'pass': 1, 'accuracy': 100.0}},
 'fails': [],
 'per_test': [{'id': 'F01',
   'pass': True,
   'engine_errors': [],
   'static_errors': [],
   'dynamic_errors': [],
   'engine_plan': EnginePlan(version='1.0', intent=<PlanIntent.FRESHNESS: 'freshness'>, reasoning_summary='User wants recent changes to OpenAI Responses API and tool calling; use websearch for freshness.', ask_clarifying_question=False, clarifying_question=None, next_step=<EngineNextStep.WEBSEARCH: 'websearch'>, use_websearch=True, use_user_longterm_memory=False, use_rag=False, use_tools=False, debug={'raw_json': {'version': '1.0', 'intent': 'freshness', 'next_step': 'websearch', 'reasoning_summary': 'User wants recent changes to OpenAI Responses API and tool calling; use websearch for freshness.', 'ask_clarifying_question': False, 'clarifying_question': None, 'use_websearch': True, 'use

## Metrics summary (12b-style) + StepPlanner breakdown

We compute:
- `intent_ok`  (EnginePlan.intent vs expected when specified)
- `flags_ok`   (EnginePlan flags vs expected when specified)
- `next_step_ok` (EnginePlan.next_step vs expected when specified)
- `static_ok`  (STATIC ExecutionPlan validation)
- `dynamic_ok` (DYNAMIC ExecutionPlan validation)
- `pass`       (all of the above true)

We also produce:
- per-group breakdown (F/G/P/C)
- compact failure list with the failing component(s)


In [None]:
from __future__ import annotations

from typing import Any, Dict, List, Optional, Tuple

from tqdm.auto import tqdm
from intergrax.llm.messages import ChatMessage

from intergrax.runtime.drop_in_knowledge_mode.planning.engine_plan_models import EnginePlan


def _intent_ok(got: EnginePlan, exp: EngineExpect) -> Tuple[bool, List[str]]:
    """
    Check the plan intent against the expected intent.

    Returns:
        (True, []) when no intent is expected or the intents match; otherwise
        (False, [message]) with a single mismatch message.
    """
    if exp.intent is None or got.intent == exp.intent:
        return (True, [])
    return (False, [f"intent expected={exp.intent} got={got.intent}"])


def _flags_ok(got: EnginePlan, exp: EngineExpect) -> Tuple[bool, List[str]]:
    """
    Check the four capability flags of the plan against the expectations.

    A flag whose expected value is None is skipped. Mismatches are reported in
    the order use_websearch, use_ltm, use_rag, use_tools.

    Returns:
        (ok, messages), where ok is True exactly when `messages` is empty.
    """
    # (label in the message, actual value on the plan, expected value)
    pairs = (
        ("use_websearch", got.use_websearch, exp.use_websearch),
        ("use_ltm", got.use_user_longterm_memory, exp.use_ltm),
        ("use_rag", got.use_rag, exp.use_rag),
        ("use_tools", got.use_tools, exp.use_tools),
    )
    problems: List[str] = [
        f"{label} expected={wanted} got={actual}"
        for label, actual, wanted in pairs
        if wanted is not None and actual != wanted
    ]
    return (not problems, problems)


def _next_step_ok(got: EnginePlan, exp: EngineExpect) -> Tuple[bool, List[str]]:
    """
    Check the plan's next step against the expected next step.

    Returns:
        (True, []) when no next step is expected or the values match; otherwise
        (False, [message]) with a single mismatch message.
    """
    if exp.next_step is None or got.next_step == exp.next_step:
        return (True, [])
    return (False, [f"next_step expected={exp.next_step} got={got.next_step}"])


async def run_case_metrics(tc: PlannerTestCase) -> Dict[str, Any]:
    """
    Run one dataset case end to end and report each metric separately.

    Steps:
    1. obtain an EnginePlan from the LLM planner (with project history when
       `tc.use_project_context` is set, otherwise with an empty history),
    2. score the EnginePlan on intent, flags and next_step,
    3. build a STATIC and then a DYNAMIC ExecutionPlan from that same
       EnginePlan and validate both.

    Returns:
        A dict with the case id, one boolean and one error list per metric
        (intent, flags, next_step, static, dynamic), the overall "pass" flag
        (True only when all five metrics are True), the EnginePlan and the
        step ids of both ExecutionPlans.
    """
    history_text = BASE_HISTORY_PROJECT if tc.use_project_context else BASE_HISTORY_EMPTY
    history = []
    if history_text:
        history = [ChatMessage(role="user", content=history_text)]

    engine_plan = await llm_engine_plan(
        ctx=ctx,
        message=tc.question,
        user_id="suite-user",
        session_id=f"suite-session-{tc.id}",
        run_id=f"suite-{tc.id}",
        base_history=history,
    )

    # Engine-level metrics are scored before any ExecutionPlan is built.
    intent_good, intent_problems = _intent_ok(engine_plan, tc.engine)
    flags_good, flag_problems = _flags_ok(engine_plan, tc.engine)
    next_step_good, next_step_problems = _next_step_ok(engine_plan, tc.engine)

    def build(mode: PlanBuildMode, label: str) -> ExecutionPlan:
        # Both plans come from the same EnginePlan; only mode and plan id differ.
        return execution_plan_from_engine_plan(
            ctx=ctx,
            user_message=tc.question,
            engine_plan=engine_plan,
            build_mode=mode,
            plan_id=f"suite-{label}-{tc.id}",
        )

    static_plan = build(PlanBuildMode.STATIC, "static")
    dynamic_plan = build(PlanBuildMode.DYNAMIC, "dynamic")

    static_problems = _step_errors_static(static_plan, tc.step)
    dynamic_problems = _step_errors_dynamic(dynamic_plan, tc.step, engine_plan)
    static_good = not static_problems
    dynamic_good = not dynamic_problems

    all_good = all((intent_good, flags_good, next_step_good, static_good, dynamic_good))

    return {
        "id": tc.id,
        "pass": all_good,
        "intent_ok": intent_good,
        "flags_ok": flags_good,
        "next_step_ok": next_step_good,
        "static_ok": static_good,
        "dynamic_ok": dynamic_good,
        "intent_errors": intent_problems,
        "flag_errors": flag_problems,
        "next_step_errors": next_step_problems,
        "static_errors": static_problems,
        "dynamic_errors": dynamic_problems,
        "engine_plan": engine_plan,
        "static_steps": [step.step_id for step in static_plan.steps],
        "dynamic_steps": [step.step_id for step in dynamic_plan.steps],
    }


async def run_all_metrics(tests: List[PlannerTestCase], verbose: bool = False) -> Dict[str, Any]:
    """Run every planner test case and aggregate the per-check results.

    Cases are awaited sequentially through ``run_case_metrics``. Each boolean
    check in the result is counted globally and per group, where a case's
    group is the first character of its id (e.g. ``"F01"`` -> ``"F"``).

    Args:
        tests: Test cases to run, in the given order.
        verbose: When True, print a per-test report (engine decision, step
            ids of both plans, and any recorded errors).

    Returns:
        A summary dict with these keys, in this order:
        - ``"totals"``: counters for ``total``, ``pass`` and every ``*_ok`` check
        - ``"accuracy"`` and one ``"<check>_accuracy"`` per check, as percentages
        - ``"per_group"``: the same counters and percentages for each group
        - ``"fails"``: compact records (check flags + error lists) of failed cases
        - ``"per_test"``: the raw ``run_case_metrics`` result of every case
    """
    # Single source of truth for the metric names, so the totals, per-group
    # counters, failure records and accuracy keys cannot drift apart.
    check_keys = ("intent_ok", "flags_ok", "next_step_ok", "static_ok", "dynamic_ok")
    metric_keys = ("pass",) + check_keys
    # Error-list keys paired with the fixed-width labels used in verbose output.
    error_labels = (
        ("intent_errors", "INTENT ERR:"),
        ("flag_errors", "FLAGS ERR :"),
        ("next_step_errors", "STEP ERR  :"),
        ("static_errors", "STATIC ERR:"),
        ("dynamic_errors", "DYN ERR   :"),
    )

    def new_counters() -> Dict[str, int]:
        """Return a zeroed counter dict: ``total`` first, then every metric."""
        counters: Dict[str, int] = {"total": 0}
        counters.update((key, 0) for key in metric_keys)
        return counters

    def pct(x: int, denom: int) -> float:
        """Return ``x / denom`` as a percentage, or 0.0 for a non-positive denominator."""
        return 0.0 if denom <= 0 else (100.0 * float(x)) / float(denom)

    def accuracies(counters: Dict[str, int]) -> Dict[str, float]:
        """Derive the ``accuracy`` / ``<check>_accuracy`` percentages from counters."""
        rates: Dict[str, float] = {"accuracy": pct(counters["pass"], counters["total"])}
        for key in check_keys:
            # "intent_ok" -> "intent_accuracy", "flags_ok" -> "flags_accuracy", ...
            rates[key[: -len("_ok")] + "_accuracy"] = pct(counters[key], counters["total"])
        return rates

    totals = new_counters()
    per_group: Dict[str, Dict[str, int]] = {}
    fails_compact: List[Dict[str, Any]] = []
    per_test: List[Dict[str, Any]] = []

    for tc in tqdm(tests, desc="Engine+Step planner tests (metrics)"):
        res = await run_case_metrics(tc)
        per_test.append(res)

        # Group by the first character of the id. Slicing (instead of indexing)
        # keeps an empty id from raising IndexError after the case has already
        # been run; such cases are collected under the placeholder group "?".
        group = tc.id[:1] or "?"
        if group not in per_group:
            per_group[group] = new_counters()

        for counters in (totals, per_group[group]):
            counters["total"] += 1
            for key in metric_keys:
                if res[key]:
                    counters[key] += 1

        if not res["pass"]:
            failure: Dict[str, Any] = {"test_id": res["id"], "group": group}
            failure.update((key, res[key]) for key in check_keys)
            failure.update((key, res[key]) for key, _ in error_labels)
            fails_compact.append(failure)

        if verbose:
            plan = res["engine_plan"]
            print("\n====================")
            print("TEST", res["id"], "PASS" if res["pass"] else "FAIL")
            print("ENGINE intent:", plan.intent, "next_step:", plan.next_step)
            print("ENGINE flags :", "web", plan.use_websearch,
                  "ltm", plan.use_user_longterm_memory,
                  "rag", plan.use_rag,
                  "tools", plan.use_tools)
            print("STATIC :", res["static_steps"])
            print("DYNAMIC:", res["dynamic_steps"])
            # Only non-empty error lists are printed.
            for key, label in error_labels:
                if res[key]:
                    print(label, res[key])

    summary = {
        "totals": totals,
        **accuracies(totals),
        "per_group": {g: {**st, **accuracies(st)} for g, st in per_group.items()},
        "fails": fails_compact,
        "per_test": per_test,
    }

    return summary


# Run the metrics suite over TESTS with the default verbose=False.
# Top-level `await` relies on the notebook running the cell inside an event
# loop; as the cell's last expression, the returned summary dict is displayed.
await run_all_metrics(tests=TESTS)

Engine+Step planner tests (metrics):   0%|          | 0/2 [00:00<?, ?it/s]


TEST F01 PASS
ENGINE intent: PlanIntent.FRESHNESS next_step: EngineNextStep.WEBSEARCH
ENGINE flags : web True ltm False rag False tools False
STATIC : [<StepId.WEBSEARCH: 'websearch'>, <StepId.DRAFT: 'draft'>, <StepId.VERIFY: 'verify'>, <StepId.FINAL: 'final'>]
DYNAMIC: [<StepId.WEBSEARCH: 'websearch'>]

TEST G01 FAIL
ENGINE intent: PlanIntent.GENERIC next_step: EngineNextStep.SYNTHESIZE
ENGINE flags : web True ltm False rag False tools False
STATIC : [<StepId.DRAFT: 'draft'>, <StepId.VERIFY: 'verify'>, <StepId.FINAL: 'final'>]
DYNAMIC: [<StepId.DRAFT: 'draft'>]
FLAGS ERR : ['use_websearch expected=False got=True']


{'totals': {'total': 2,
  'pass': 1,
  'intent_ok': 2,
  'flags_ok': 1,
  'next_step_ok': 2,
  'static_ok': 2,
  'dynamic_ok': 2},
 'accuracy': 50.0,
 'intent_accuracy': 100.0,
 'flags_accuracy': 50.0,
 'next_step_accuracy': 100.0,
 'static_accuracy': 100.0,
 'dynamic_accuracy': 100.0,
 'per_group': {'F': {'total': 1,
   'pass': 1,
   'intent_ok': 1,
   'flags_ok': 1,
   'next_step_ok': 1,
   'static_ok': 1,
   'dynamic_ok': 1,
   'accuracy': 100.0,
   'intent_accuracy': 100.0,
   'flags_accuracy': 100.0,
   'next_step_accuracy': 100.0,
   'static_accuracy': 100.0,
   'dynamic_accuracy': 100.0},
  'G': {'total': 1,
   'pass': 0,
   'intent_ok': 1,
   'flags_ok': 0,
   'next_step_ok': 1,
   'static_ok': 1,
   'dynamic_ok': 1,
   'accuracy': 0.0,
   'intent_accuracy': 100.0,
   'flags_accuracy': 0.0,
   'next_step_accuracy': 100.0,
   'static_accuracy': 100.0,
   'dynamic_accuracy': 100.0}},
 'fails': [{'test_id': 'G01',
   'group': 'G',
   'intent_ok': True,
   'flags_ok': False,
   'ne

## Expand the test suite (starter pack)

We add a minimal but representative set of cases to cover:
- CLARIFY routing
- TOOLS routing
- RAG routing
- FRESHNESS / WEBSEARCH routing
- PROJECT_ARCHITECTURE with project context (soft intent, hard capability expectations)

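Each case includes:
- expected EnginePlan intent/flags/next_step (as strict as we can be today; a field set to `None` is a soft expectation that we do not hard-fail on yet)
- expected StepPlanner STATIC plan (steps it must include and steps it must not contain)
- expected StepPlanner DYNAMIC plan (a single step, or `None` to let the planner choose)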


In [None]:
from intergrax.runtime.drop_in_knowledge_mode.planning.engine_plan_models import PlanIntent, EngineNextStep
from intergrax.runtime.drop_in_knowledge_mode.planning.stepplan_models import StepId

MORE_TESTS: List[PlannerTestCase] = [
    PlannerTestCase(
        id="F01",
        question="What are the most recent major changes to the OpenAI Responses API and tool calling? Provide dates.",
        use_project_context=False,
        engine=EngineExpect(
            intent=PlanIntent.FRESHNESS,
            next_step=EngineNextStep.WEBSEARCH,
            use_websearch=True,
            use_rag=False,
            use_tools=False,
            use_ltm=False,
        ),
        step=StepExpect(
            static_must_include={StepId.WEBSEARCH, StepId.DRAFT, StepId.VERIFY, StepId.FINAL},
            static_forbid={StepId.LTM_SEARCH, StepId.TOOLS, StepId.RAG},
            dynamic_single_step=StepId.WEBSEARCH,
        ),
    ),
    PlannerTestCase(
        id="G01",
        question="Explain the main differences between REST and GraphQL.",
        use_project_context=False,
        engine=EngineExpect(
            intent=PlanIntent.GENERIC,
            use_websearch=False,  # prefer no web for this general question
        ),
        step=StepExpect(
            static_must_include={StepId.DRAFT, StepId.VERIFY, StepId.FINAL},
            static_forbid={StepId.WEBSEARCH},  # allow RAG/tools depending on your policies; keep it light
            dynamic_single_step=StepId.DRAFT,  # likely SYNTHESIZE; if engine picks differently, this will flag it
        ),
    ),
    
    # --- CLARIFY ---    
    PlannerTestCase(
        id="C01",
        question="I have an exception in my Python code but I didn't include the traceback. What should I do?",
        use_project_context=False,
        engine=EngineExpect(
            intent=None,
            next_step=EngineNextStep.CLARIFY,
            use_websearch=False,
        ),
        step=StepExpect(
            static_must_include={StepId.CLARIFY, StepId.FINAL},
            static_forbid={StepId.WEBSEARCH, StepId.TOOLS, StepId.RAG, StepId.LTM_SEARCH},
            dynamic_single_step=StepId.CLARIFY,
        ),
    ),

    # --- TOOLS (structured action) ---
    PlannerTestCase(
        id="T01",
        question=(
            "Return the exact list of unique 'pkd' values from this JSON sorted ascending. "
            "Only output the JSON array of strings.\n\n"
            f"JSON:\n{r'''
{
  "items": [
    {"pkd": "62.01.Z"},
    {"pkd": "62.02.Z"},
    {"pkd": "62.01.Z"},
    {"pkd": "70.22.Z"}
  ]
}
'''.strip()}"
        ),
        use_project_context=False,
        engine=EngineExpect(
            intent=PlanIntent.GENERIC,
            next_step=EngineNextStep.TOOLS,
            use_tools=True,
            use_websearch=False,
        ),
        step=StepExpect(
            static_must_include={StepId.TOOLS, StepId.DRAFT, StepId.VERIFY, StepId.FINAL},
            static_forbid={StepId.WEBSEARCH},
            dynamic_single_step=StepId.TOOLS,
        ),
    ),

    # --- RAG (doc/KB lookup) ---
    PlannerTestCase(
        id="R01",
        question="In our runtime, what does StepPlannerConfig.max_total_steps control? Answer based on our codebase.",
        use_project_context=True,
        engine=EngineExpect(
            # This is where you *want* project intent, but currently may be GENERIC.
            intent=None,
            next_step=EngineNextStep.RAG,
            use_rag=True,
            use_websearch=False,
        ),
        step=StepExpect(
            static_must_include={StepId.RAG, StepId.DRAFT, StepId.VERIFY, StepId.FINAL},
            static_forbid={StepId.WEBSEARCH},
            dynamic_single_step=StepId.RAG,
        ),
    ),

    # --- FRESHNESS / WEBSEARCH (another example) ---
    PlannerTestCase(
        id="F02",
        question="What changed in Python 3.13 compared to 3.12? Provide a concise summary with dates.",
        use_project_context=False,
        engine=EngineExpect(
            intent=PlanIntent.FRESHNESS,
            next_step=EngineNextStep.WEBSEARCH,
            use_websearch=True,
            use_tools=False,
            use_rag=False,
            use_ltm=False,
        ),
        step=StepExpect(
            static_must_include={StepId.WEBSEARCH, StepId.DRAFT, StepId.VERIFY, StepId.FINAL},
            static_forbid={StepId.TOOLS, StepId.RAG, StepId.LTM_SEARCH},
            dynamic_single_step=StepId.WEBSEARCH,
        ),
    ),

    # --- PROJECT_ARCHITECTURE (soft intent, hard "no web", prefer internal retrieval) ---
    PlannerTestCase(
        id="P01",
        question="In Intergrax, describe the difference between STATIC and DYNAMIC plan build modes in StepPlanner.",
        use_project_context=True,
        engine=EngineExpect(
            # soft intent until EnginePlanner prompt is improved
            intent=None,
            # planner may choose RAG or SYNTHESIZE; we do not hard-fail on next_step yet
            next_step=None,
            use_websearch=False,   # hard: no web for internal question
            use_tools=False,       # hard: no tools needed
            # allow rag/ltm: at least one of them should be true, but EngineExpect can't express OR;
            # we’ll handle this as a separate “policy check” later if needed.
        ),
        step=StepExpect(
            static_must_include={StepId.DRAFT, StepId.VERIFY, StepId.FINAL},
            static_forbid={StepId.WEBSEARCH, StepId.TOOLS},
            dynamic_single_step=None,  # allow planner to choose
        )
    ),
]

print("Total tests:", len(MORE_TESTS))
print("Added:", [t.id for t in MORE_TESTS])

await run_all_metrics(tests=MORE_TESTS)

Total tests: 7
Added: ['F01', 'G01', 'C01', 'T01', 'R01', 'F02', 'P01']


Engine+Step planner tests (metrics):   0%|          | 0/7 [00:00<?, ?it/s]


TEST F01 PASS
ENGINE intent: PlanIntent.FRESHNESS next_step: EngineNextStep.WEBSEARCH
ENGINE flags : web True ltm False rag False tools False
STATIC : [<StepId.WEBSEARCH: 'websearch'>, <StepId.DRAFT: 'draft'>, <StepId.VERIFY: 'verify'>, <StepId.FINAL: 'final'>]
DYNAMIC: [<StepId.WEBSEARCH: 'websearch'>]

TEST G01 PASS
ENGINE intent: PlanIntent.GENERIC next_step: EngineNextStep.SYNTHESIZE
ENGINE flags : web False ltm False rag False tools False
STATIC : [<StepId.DRAFT: 'draft'>, <StepId.VERIFY: 'verify'>, <StepId.FINAL: 'final'>]
DYNAMIC: [<StepId.DRAFT: 'draft'>]

TEST C01 PASS
ENGINE intent: PlanIntent.CLARIFY next_step: EngineNextStep.CLARIFY
ENGINE flags : web False ltm False rag False tools False
STATIC : [<StepId.CLARIFY: 'clarify'>, <StepId.FINAL: 'final'>]
DYNAMIC: [<StepId.CLARIFY: 'clarify'>]

TEST T01 PASS
ENGINE intent: PlanIntent.GENERIC next_step: EngineNextStep.TOOLS
ENGINE flags : web False ltm True rag False tools True
STATIC : [<StepId.TOOLS: 'tools'>, <StepId.DRAFT: 

{'totals': {'total': 7,
  'pass': 6,
  'intent_ok': 7,
  'flags_ok': 7,
  'next_step_ok': 7,
  'static_ok': 7,
  'dynamic_ok': 6},
 'accuracy': 85.71428571428571,
 'intent_accuracy': 100.0,
 'flags_accuracy': 100.0,
 'next_step_accuracy': 100.0,
 'static_accuracy': 100.0,
 'dynamic_accuracy': 85.71428571428571,
 'per_group': {'F': {'total': 2,
   'pass': 2,
   'intent_ok': 2,
   'flags_ok': 2,
   'next_step_ok': 2,
   'static_ok': 2,
   'dynamic_ok': 2,
   'accuracy': 100.0,
   'intent_accuracy': 100.0,
   'flags_accuracy': 100.0,
   'next_step_accuracy': 100.0,
   'static_accuracy': 100.0,
   'dynamic_accuracy': 100.0},
  'G': {'total': 1,
   'pass': 1,
   'intent_ok': 1,
   'flags_ok': 1,
   'next_step_ok': 1,
   'static_ok': 1,
   'dynamic_ok': 1,
   'accuracy': 100.0,
   'intent_accuracy': 100.0,
   'flags_accuracy': 100.0,
   'next_step_accuracy': 100.0,
   'static_accuracy': 100.0,
   'dynamic_accuracy': 100.0},
  'C': {'total': 1,
   'pass': 1,
   'intent_ok': 1,
   'flags_ok': 