In [1]:
!pip install --quiet -U langchain langchain_openai langgraph langchainhub langchain_experimental

In [2]:
%pip install -qU langchain-unstructured langchain-community langchain-deepseek langchain-google-genai langchain_xai

Note: you may need to restart the kernel to use updated packages.


In [3]:
!pip install python-dotenv



In [6]:
from dotenv import load_dotenv
import os

# Load variables from .env file
load_dotenv()

# Mirror the values loaded from .env into the variable names used further down.
# os.environ only accepts str values, so assigning os.getenv(...) directly raises
# TypeError whenever a key is absent. Unset keys are skipped and reported instead.
def _mirror_env(target: str, source: str) -> bool:
    """Copy env var `source` into `target`.

    Returns False and leaves `target` untouched when `source` is not set.
    """
    value = os.getenv(source)
    if value is None:
        return False
    os.environ[target] = value
    return True


_missing_env_keys = [
    source
    for target, source in (
        ("OPENAI_API_KEY", "OPENAI_API_KEY"),
        ("DEEPSEEK_API_KEY", "DEEPSEEK_API_KEY"),
        ("GOOGLE_API_KEY", "GEMINI_API_KEY"),  # stored under GEMINI_API_KEY, exported as GOOGLE_API_KEY
        ("XAI_API_KEY", "XAI_API_KEY"),
        ("LANGCHAIN_API_KEY", "LANGCHAIN_API_KEY"),
        ("LANGCHAIN_PROJECT", "LANGCHAIN_PROJECT"),
    )
    if not _mirror_env(target, source)
]

# Tracing is opt-in: fall back to "false" when the flag is not configured.
os.environ["LANGCHAIN_TRACING_V2"] = os.getenv("LANGCHAIN_TRACING_V2", "false")

if _missing_env_keys:
    print(f"Environment variables not set (skipped): {', '.join(_missing_env_keys)}")


# SAD Agents v4


In [8]:
import re
from typing import List, Tuple, Dict
from typing_extensions import Annotated, Literal, TypedDict

# langchain imports (schema, messages, prompts, tools)
from langchain.schema import HumanMessage, BaseMessage
from langchain_core.messages import (
    RemoveMessage,
    AIMessage,
    HumanMessage,  # Already imported above, could remove one
    trim_messages,
    SystemMessage,
    ToolMessage
)
from langchain_core.tools import tool
from langchain_core.tools.base import InjectedToolCallId

# langgraph imports
from langgraph.graph.message import add_messages
from langgraph.prebuilt import InjectedState, chat_agent_executor
from langgraph.types import Command

# model providers
from langchain_openai import ChatOpenAI
from langchain_deepseek import ChatDeepSeek

# prompts
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

import functools


## State

In [9]:
from typing import TypedDict, Annotated, Sequence, Dict, List
from collections import deque
from langchain_core.messages import BaseMessage

# A standard structure for the outputs of each level/diagram
class LevelOutput(TypedDict, total=False):
    """Holds the artifacts for a single C4 diagram.

    Declared with ``total=False``, so each key may be absent until it has
    been produced.
    """
    analysis: str  # The textual analysis from the agent
    yaml_definition: str  # The structured YAML output
    diagram: str  # The generated PlantUML diagram code

# The main TypedDict to hold our entire C4 model as it's built
class C4Model(TypedDict):
    """A nested dictionary to store all generated C4 model artifacts."""
    # Level 1 (System Context) artifacts
    context: LevelOutput
    # Level 2 (Container) artifacts
    containers: LevelOutput
    # Components are stored in a dict, keyed by the container's name
    components: Dict[str, LevelOutput]

# The final state for our LangGraph application
class State(TypedDict):
    """Top-level graph state shared by the analysis, YAML and diagram nodes."""
    # Conversation history. No reducer is attached here, unlike the subgraph state
    # which annotates its messages with add_messages.
    messages: Sequence[BaseMessage]
    # The system design brief that every analysis is derived from
    system_brief: str
    # All artifacts produced so far, grouped by C4 level
    c4_model: C4Model
    # Containers still awaiting component-level work. The nodes in this notebook
    # only peek at index 0. NOTE(review): confirm where entries are removed.
    component_queue: deque[str]

In [10]:
from langchain_openai import ChatOpenAI
from langchain_deepseek import ChatDeepSeek
from langchain_google_genai import ChatGoogleGenerativeAI

# --- 1. Setup the Language Model ---
# API keys are taken from the process environment (the environment cell exports
# GOOGLE_API_KEY from GEMINI_API_KEY); never hard-code a key in this cell.
# NOTE(review): presumably ChatGoogleGenerativeAI reads GOOGLE_API_KEY — confirm.
temperature = 0  # sampling temperature passed to the active model
# Alternative models, kept for quick switching:
# llm = ChatOpenAI(model="gpt-4o", temperature=0)
# llm = ChatDeepSeek(model="deepseek-chat", temperature=temperature)
# llm = ChatGoogleGenerativeAI(model="gemini-2.5-pro-preview-06-05",temperature=temperature)
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash-preview-05-20",temperature=temperature)

## Analysis agent team subgraph

#### Agent personas

In [11]:
from dataclasses import dataclass

@dataclass
class Agent:
    """A simple dataclass to hold agent information."""
    # Identifier of the agent; the collaboration graph derives node names and
    # message author names from it.
    name: str
    # Role description that is embedded into the agent's system prompt
    persona: str

# ==============================================================================
# --- Level 1 (System Context) Team ---
# Goal: High-level, business-focused view of users and external systems.
# ==============================================================================

# Roster for the Level 1 (System Context) discussion.
# create_collaboration_graph chains agents in list order, so the order here would
# be the speaking order — TODO confirm this list is what gets passed to it.
context_level_team = [
    Agent(
        name="Product_Owner",
        persona="""You are a visionary Product Owner. Your primary goal is to represent the user's voice and ensure the system delivers clear business value.

Your focus is on:
- **Defining the 'Why':** The core problem this system solves and for whom.
- **User Personas:** Clearly identifying the different types of users and their motivations.
- **Business Goals:** Ensuring the system's scope aligns with strategic objectives.

Your contribution is to **champion the user's perspective.** You must define the 'People' who will use the system and relentlessly question how the proposed design serves their needs."""
    ),
    Agent(
        name="Business_Analyst",
        persona="""You are a pragmatic Business Analyst. Your primary goal is to map business requirements to system capabilities and define the system's boundaries.

Your focus is on:
- **System Scope:** The system's core purpose and functional boundaries.
- **External Interactions:** Identifying all key external systems the system must interact with.
- **High-Level Data Flow:** Mapping the essential data exchange with external systems.

Your contribution is to **define the 'External Systems' and the system's boundary.** You must clearly state the external dependencies required and how they fit into the overall business process."""
    ),
    Agent(
        name="Lead_Software_Architect",
        persona="""You are a high-level Lead Software Architect. Your goal is to provide an early technical feasibility assessment, treating the entire system as a single black box.

Your focus is on:
- **Technical Sanity Check:** Ensuring the high-level business goals are technically plausible.
- **System Responsibilities:** Summarizing the system's core function from a technical standpoint.
- **Identifying Major Constraints:** Pointing out significant technical or resource constraints early.

Your contribution is to **ground the business vision in technical reality.** You translate the discussion into a concise, high-level technical summary of the system's purpose and interactions."""
    ),
]


# ==============================================================================
# --- Level 2 (Containers) Team ---
# Goal: Decompose the system into major technical building blocks.
# ==============================================================================

# Roster for the Level 2 (Containers) discussion.
# create_collaboration_graph chains agents in list order, so the order here would
# be the speaking order — TODO confirm this list is what gets passed to it.
container_level_team = [
    Agent(
        name="Software_Architect",
        persona="""You are a hands-on Software Architect. Having understood the high-level context, your goal is now to design the system's macro-architecture.

Your focus is on:
- **Container Decomposition:** Breaking the system into logical, deployable units (e.g., Web App, API, Database, Message Queue).
- **Technology Choices:** Proposing the primary technology stack for each container.
- **High-Level Relationships:** Defining how containers communicate and the protocols they use.

Your contribution is to **propose a clear set of 'Containers' and the 'Relationships' between them,** justifying your architectural patterns and technology choices."""
    ),
    Agent(
        name="Lead_Developer",
        persona="""You are a practical Lead Developer. Your goal is to ensure the proposed architecture is buildable and maintainable for the development team.

Your focus is on:
- **Implementation Feasibility:** Assessing the complexity of the proposed containers.
- **Technology Trade-offs:** Debating the pros and cons of technology choices (e.g., specific frameworks, databases).
- **Developer Experience:** Considering how easy it will be for a team to build and work with this design.

Your contribution is to **validate or challenge the architectural design from a hands-on implementation perspective,** suggesting concrete alternatives where appropriate."""
    ),
    Agent(
        name="DevOps_Specialist",
        persona="""You are a production-focused DevOps Specialist. Your goal is to ensure the system is designed for operational excellence.

Your focus is on:
- **Deployability:** How the proposed containers will be packaged and deployed.
- **Observability:** How the system will be monitored for health and performance.
- **Scalability & Reliability:** Identifying potential operational bottlenecks and failure modes.

Your contribution is to **critique the proposed architecture from an operational standpoint,** pointing out potential issues with deployment, scaling, or monitoring."""
    ),
    Agent(
        name="Security_Specialist",
        persona="""You are a vigilant Security Specialist. Your goal is to establish a secure foundation for the system at the container level.

Your focus is on:
- **Trust Boundaries:** Defining the security posture of communications between containers.
- **Data Protection in Transit:** Specifying security requirements for APIs and data flows (e.g., TLS, mTLS).
- **Authentication Gateways:** Assessing where and how initial authentication should be handled (e.g., API Gateway).

Your contribution is to **analyze the proposed container relationships for security flaws** and recommend high-level security controls."""
    ),
]


# ==============================================================================
# --- Level 3 (Components) Team ---
# Goal: Detail the internal design of a single container.
# ==============================================================================

# Roster for the Level 3 (Components) discussion of a single container.
# create_collaboration_graph chains agents in list order, so the order here would
# be the speaking order — TODO confirm this list is what gets passed to it.
# NOTE(review): the Database_Administrator persona says it is only active when the
# container has a database; nothing visible in this notebook enforces that.
component_level_team = [
    Agent(
        name="Lead_Developer",
        persona="""You are the Lead Developer for this specific container. Your goal is to drive the detailed internal design of your service.

Your focus is on:
- **Component Decomposition:** Breaking your container into key logical components (e.g., API Controller, Service, Repository, Domain Model).
- **Internal APIs & Interfaces:** Defining the contracts and responsibilities for each component.
- **Design Patterns:** Selecting and applying appropriate design patterns for a clean and maintainable internal structure.

Your contribution is to **propose a clear set of 'Components' and their 'Relationships' inside this container,** leading the detailed design discussion."""
    ),
    Agent(
        name="Senior_Developer",
        persona="""You are a Senior Developer on the team. Your goal is to contribute to a high-quality component design by focusing on implementation details and best practices.

Your focus is on:
- **Code-Level Design:** Considering class and function responsibilities.
- **Adherence to Patterns:** Ensuring the proposed design patterns are used correctly.
- **Testability:** Thinking about how the proposed components can be effectively unit-tested.

Your contribution is to **refine the detailed component design,** ask clarifying questions, and suggest improvements based on your implementation experience."""
    ),
    Agent(
        name="Database_Administrator",
        persona="""You are a Database Administrator. Your goal is to ensure the data model for this container is efficient, secure, and reliable.

Your focus is on:
- **Schema Design:** The specific tables, columns, and data types for components that interact with the database.
- **Query Performance:** How components will query data and whether indexes are needed.
- **Data Integrity:** Ensuring relationships and constraints are correctly defined.

Your contribution is to **define and critique the database-related aspects of the component design.** You are only active if the container has a database."""
    ),
    Agent(
        name="Security_Specialist",
        persona="""You are a vigilant Security Specialist, now focusing on the internals of this container. Your goal is to find and mitigate vulnerabilities at the code level.

Your focus is on:
- **Input Validation & Sanitization:** Ensuring data entering each component is safe.
- **Fine-Grained Authorization:** How permissions are checked within the component's logic.
- **Secure Coding Practices:** Identifying potential risks like injection flaws or improper error handling.

Your contribution is to **analyze the internal component design for security vulnerabilities** and recommend specific, code-level security controls."""
    ),
]

#### subgraph

In [None]:
import functools
import os
import re # <<< ADDED: Missing import for sanitization
from dataclasses import dataclass
from typing import List, Annotated, TypedDict

from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, BaseMessage
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langgraph.graph import StateGraph, END
from langgraph.graph.message import add_messages
# from langchain_deepseek import ChatDeepSeek # Your LLM import

# --- Agent dataclass for context ---
# NOTE(review): this re-declares the Agent class already defined in the
# "Agent personas" cell with identical fields. Running this cell rebinds the name,
# so team lists built earlier hold instances of the earlier class. The code below
# only reads .name and .persona, so it still works, but a single definition
# would be cleaner.
@dataclass
class Agent:
    """A simple dataclass to hold agent information."""
    name: str
    persona: str

# --- State is now more generic ---
class CollaborativeAnalysisState(TypedDict):
    """Internal state for the collaborative analysis subgraph."""
    # Discussion transcript; add_messages is attached as the reducer for this key
    messages: Annotated[List[BaseMessage], add_messages]
    # Original system brief, injected into the final report prompt
    system_brief: str
    # Extra text inserted verbatim into every agent's system prompt
    context: str
    # C4 level name shown in the agent prompt and in the turn log line
    level: str
    # Number of full team rounds after which the router ends the discussion
    max_rounds: int
    # Consolidated report written by the report generator node
    final_analysis: str
    team: List[Agent] # <<< NEW: The team of agents for the current collaboration

# --- agent_node and report_generator_node require NO CHANGES ---
def agent_node(state: CollaborativeAnalysisState, agent: Agent, llm: BaseChatModel) -> dict:
    """Run one agent's turn in the collaborative discussion.

    Builds a system prompt from the current C4 level, the shared context and the
    agent's persona, sends it to the model with the conversation so far, and
    returns the reply as a named AI message.

    Args:
        state: Subgraph state; reads ``level``, ``context`` and ``messages``.
        agent: The team member whose persona drives this turn.
        llm: Chat model used to generate the reply.

    Returns:
        A state patch ``{"messages": [AIMessage]}`` whose message ``name`` is the
        agent name with every character outside ``[a-zA-Z0-9_-]`` replaced by ``_``.
    """
    print(f"--- 🗣️  Turn: {agent.name} on C4 Level: '{state['level']}' ---")
    system_prompt = f"""You are a member of an expert team collaboratively creating the analysis for a C4 model diagram.
    Your current task is to analyze the provided system brief for the **C4 {state['level']} level**.
    {state['context']}
    Your specific role is as follows:
    ---
    {agent.persona}
    ---
    Read the conversation history and add your next insight based on your specific role. Provide your analysis directly and concisely."""
    # The prompt text is already fully rendered by the f-string above. Passing it as
    # a concrete SystemMessage (instead of a ("system", text) template) prevents any
    # literal braces in the context or persona from being re-parsed as template
    # variables, which would fail at invoke time.
    prompt_template = ChatPromptTemplate.from_messages([
        SystemMessage(content=system_prompt),
        MessagesPlaceholder(variable_name="messages"),
    ])
    chain = prompt_template | llm
    response = chain.invoke({"messages": state["messages"]})
    # Restrict the message author name to a conservative character set.
    sanitized_name = re.sub(r"[^a-zA-Z0-9_-]", "_", agent.name)
    named_message = AIMessage(content=response.content, name=sanitized_name)
    return {"messages": [named_message]}


def report_generator_node(state: CollaborativeAnalysisState, llm: BaseChatModel) -> dict:
    """Compile the whole discussion into one consolidated report.

    Args:
        state: Subgraph state; reads ``system_brief`` and ``messages``.
        llm: Chat model that writes the report.

    Returns:
        A state patch ``{"final_analysis": <report text>}``.
    """
    print("--- 🔬 Generating Final Analysis Report ---")
    # `{system_brief}` is a template variable filled in at invoke time.
    scribe_instructions = """You are a meticulous Scribe-Agent for a C4 model design session. Your sole mission is to create a single, comprehensive, and exhaustive transcript of the architectural decisions made.

**This output is critical as it will be used as a direct input for an automated process, so it must be a complete and unfiltered record of the facts.**

Your role is to **compile and integrate** every insight from the collaborative discussion into a final, consolidated report.

**CRITICAL RULES:**
1.  **DO NOT SUMMARIZE:** Your task is to collate and transcribe, not to condense or interpret. Every point, proposal, critique, and decision must be captured.
2.  **PRESERVE EVERY FACT:** No detail is too small. If a technology, version number, security concern, or component name was mentioned as part of a final decision, it must be included in the report.
3.  **CONSOLIDATE WITHOUT OMISSION:** You must logically structure the final output, but you must ensure that this consolidation process does not lead to any information loss. Integrate all points into one coherent document.

---
**Reference Context: Original System Brief**
{system_brief}
---

Now, review the ENTIRE conversation history and generate the final, all-inclusive, consolidated analysis report based on these strict rules.
"""
    report_prompt = ChatPromptTemplate.from_messages([
        ("system", scribe_instructions),
        MessagesPlaceholder(variable_name="messages"),
    ])
    report_chain = report_prompt | llm
    chain_inputs = {
        "system_brief": state["system_brief"],
        "messages": state["messages"],
    }
    report_message = report_chain.invoke(chain_inputs)
    return {"final_analysis": report_message.content}


# --- Router now gets the team from the STATE ---
def collaboration_router(state: CollaborativeAnalysisState) -> str:
    """Decide whether the team starts another round or the report is written.

    Args:
        state: Subgraph state; reads ``team``, ``messages`` and ``max_rounds``.

    Returns:
        ``"generate_report"`` once ``max_rounds`` full rounds are complete,
        otherwise the node name of the first agent in the team.

    Note:
        The round count assumes the history starts with exactly one non-agent
        message (the initial human message) and that every later message is one
        agent turn.
    """
    active_team = state['team']

    # The number of AI turns is the total messages minus the initial human message
    num_ai_turns = len(state['messages']) - 1

    # The number of full rounds is the AI turns divided by the number of agents in the current team
    rounds_completed = num_ai_turns // len(active_team)

    if rounds_completed >= state["max_rounds"]:
        print(f"--- ✅ Collaboration Complete: Max rounds ({state['max_rounds']}) reached. ---")
        return "generate_report"

    # Otherwise loop back to the first agent. Graph nodes are registered under the
    # sanitized agent name, so the route key must be sanitized the same way;
    # returning the raw name fails for any name that contains other characters.
    return re.sub(r"[^a-zA-Z0-9_-]", "_", active_team[0].name)


# --- Graph Factory now accepts a `team` parameter ---
def _sanitize_node_name(name: str) -> str:
    """Replace every character outside ``[a-zA-Z0-9_-]`` with ``_``."""
    return re.sub(r"[^a-zA-Z0-9_-]", "_", name)


def create_collaboration_graph(llm: BaseChatModel, team: List[Agent], max_rounds: int = 2):
    """
    Builds and returns a compiled collaborative analysis subgraph for a GIVEN TEAM.

    Agents speak in list order; after the last agent the router either loops back
    to the first agent or hands over to the report generator.

    Args:
        llm: Chat model shared by all agent nodes and the report generator.
        team: Ordered, non-empty list of agents taking part in the discussion.
        max_rounds: Number of rounds used when the invoking state does not
            provide ``max_rounds`` itself; a value in the state takes precedence.

    Returns:
        The compiled graph.

    Raises:
        ValueError: If ``team`` is empty.
    """
    if not team:
        raise ValueError("create_collaboration_graph requires at least one agent in `team`.")

    # Compute each node name once so nodes, edges and route keys always agree.
    node_names = [_sanitize_node_name(agent.name) for agent in team]
    first_node, last_node = node_names[0], node_names[-1]

    def route_after_round(state: CollaborativeAnalysisState) -> str:
        # Honour the factory's `max_rounds` when the caller did not put one into
        # the state; previously the parameter was accepted but never used.
        if state.get("max_rounds") is None:
            state = {**state, "max_rounds": max_rounds}
        # Sanitizing the result keeps the route valid even if the router hands
        # back a raw agent name; "generate_report" is unaffected.
        return _sanitize_node_name(collaboration_router(state))

    builder = StateGraph(CollaborativeAnalysisState)

    for node_name, agent in zip(node_names, team):
        builder.add_node(node_name, functools.partial(agent_node, agent=agent, llm=llm))

    builder.add_node("generate_report", functools.partial(report_generator_node, llm=llm))

    # The first agent of the given team opens every round.
    builder.set_entry_point(first_node)

    # Create the chain: Agent 1 -> Agent 2 -> etc. for the given team
    for source_node, target_node in zip(node_names, node_names[1:]):
        builder.add_edge(source_node, target_node)

    # The last agent in the team calls the router
    builder.add_conditional_edges(
        last_node,
        route_after_round,
        {
            "generate_report": "generate_report",
            first_node: first_node,
        },
    )

    builder.add_edge("generate_report", END)

    return builder.compile()

## Analysis node

In [15]:
import os
from typing import Dict

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
import copy # Import the copy module for deepcopy




# --- 2. Create a Reusable Analysis Chain ---
def create_analysis_chain(level: str, persona_prompt: str, llm):
    """Build the prompt -> model -> string chain that writes a C4 textual analysis.

    Args:
        level: Not used while building the chain; the level is supplied through
            the ``level`` input variable when the chain is invoked.
        persona_prompt: Template text for the system message.
        llm: Chat model that produces the analysis.

    Returns:
        A runnable expecting the inputs ``level``, ``brief``, ``context`` and
        ``component_target`` and producing the analysis as a string.
    """
    human_instructions = """Analyze the following System Design Brief to produce the textual analysis for the C4 **{level}** level.

         **System Design Brief:**
         ```yaml
         {brief}
         ```

         {context}

         **Your Task:**
         Generate a clear, well-structured textual analysis ONLY. Do NOT generate YAML or any diagram code.
         - For 'context' level: Identify the system, the key user roles (People), and all external system dependencies.
         - For 'container' level: Identify the key deployable containers within the system, their technology choices, and their relationships.
         - For 'component' level: Identify the main components inside the '{component_target}' container.

         Focus on clearly defining elements and the reasoning for their relationships based on the brief."""

    message_specs = [
        ("system", persona_prompt),
        ("human", human_instructions),
    ]
    analysis_prompt = ChatPromptTemplate.from_messages(message_specs)
    to_text = StrOutputParser()

    return analysis_prompt | llm | to_text

# --- 3. Define the Agent Node Function ---
def analysis_agent_node(state: State, llm) -> Dict:
    """
    Agent node that generates the textual analysis for the next required C4 level.

    Levels are handled in order: context, then containers, then the component
    level for the container at the head of ``component_queue``.

    Args:
        state: Graph state; reads ``system_brief``, ``c4_model`` and
            ``component_queue``.
        llm: Chat model used to write the analysis.

    Returns:
        ``{"c4_model": <updated copy>}`` with the new analysis stored, or ``{}``
        when there is nothing to do: the queue is empty, or the head of the queue
        already has an analysis.
    """
    system_brief = state["system_brief"]
    c4_model = state["c4_model"]
    component_target = None

    # State-driven logic: determine which analysis to perform
    if not c4_model.get("context"):
        level = "context"
        print("--- ✍️ Generating Context Level Analysis ---")
        context = ""
    elif not c4_model.get("containers"):
        level = "container"
        print("--- ✍️ Generating Container Level Analysis ---")
        # Provide the context analysis as additional context for the LLM
        context = f"**Context Level Analysis (for context):**\n{c4_model['context']['analysis']}"
    else:
        component_queue = state.get("component_queue", [])
        if not component_queue:
            # Nothing left to do
            return {}

        component_target = component_queue[0]  # Peek at the next item
        existing_entry = (c4_model.get("components") or {}).get(component_target) or {}
        if existing_entry.get("analysis"):
            # Already analysed. Like the YAML and diagram nodes, do not redo
            # finished work; regenerating here would waste a model call and used
            # to discard the component's other artifacts.
            return {}

        level = "component"
        print(f"--- ✍️ Generating Component Level Analysis for '{component_target}' ---")
        context = f"""**Container Level Analysis (for context):**
                         \n{c4_model['containers']['analysis']}"""

    # Create the appropriate chain and invoke it
    persona = "You are an expert software architect specializing in the C4 model."
    analysis_chain = create_analysis_chain(level, persona, llm=llm)
    analysis = analysis_chain.invoke({
        "level": level,
        "brief": system_brief,
        "context": context,
        "component_target": component_target or ""
    })

    # Work on a deep copy so the incoming state is never mutated.
    updated_model = copy.deepcopy(c4_model)

    if level == "context":
        updated_model["context"] = {"analysis": analysis}
    elif level == "container":
        updated_model["containers"] = {"analysis": analysis}
    elif level == "component" and component_target:
        # Create the containers as needed, and merge into an existing entry
        # rather than replacing it so other keys of that entry survive.
        components = updated_model.setdefault("components", {})
        components.setdefault(component_target, {})["analysis"] = analysis

    # Return the entire updated model.
    return {"c4_model": updated_model}

## YAML structure node

In [16]:
import os
from typing import Dict
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
import copy # Import the copy module for deepcopy


# --- 2. Store YAML Templates ---
# These templates are provided to the LLM as examples of the desired output format.

# Level 1 template: the system itself plus the people and external systems around
# it. Bracketed values are placeholders. The lines starting with '#' inside the
# string are part of the template text that is sent to the model.
CONTEXT_YAML_TEMPLATE = """
# C4 Model: Level 1 - System Context
level: context
scope: "System Context diagram for [System Name]"
system:
  name: "[System Name]"
  description: "[High-level description of the system's purpose and value.]"
elements:
  - type: "person"
    name: "[User Role A]"
    description: "[Description of this user and their goals.]"
  - type: "externalSystem"
    name: "[External System A]"
    description: "[Description of the external system and its function.]"
relationships:
  - source: "[User Role A]"
    destination: "[System Name]"
    description: "[Description of the interaction]"
    technology: "[e.g., HTTPS]"
"""

# Level 2 template: people and external systems carried over from Level 1 plus the
# containers inside the system boundary. Bracketed values are placeholders. The
# lines starting with '#' inside the string are part of the template text.
CONTAINER_YAML_TEMPLATE = """
# C4 Model: Level 2 - Container
level: container
scope: "Container diagram for [System Name]"
system:
  name: "[System Name]"
elements:
  # People and External Systems from Level 1 that interact with the containers
  - type: "person"
    name: "[User Role A]"
    description: "[Description of this user.]"
  - type: "externalSystem"
    name: "[External System A]"
    description: "[Description of this external system.]"
  # Containers within the system boundary
  - type: "container"
    name: "[Container A, e.g., Web Application]"
    technology: "[e.g., React, Angular]"
    description: "[Responsibility of this container.]"
relationships:
  - source: "[User Role A]"
    destination: "[Container A, e.g., Web Application]"
    description: "[e.g., Uses]"
    technology: "[e.g., HTTPS]"
"""

# Level 3 template: the components inside one parent container. Bracketed values
# are placeholders. Unlike the Level 1 and Level 2 templates, its relationship
# entries carry no `technology` field.
COMPONENT_YAML_TEMPLATE = """
# C4 Model: Level 3 - Component
level: component
scope: "Component diagram for the [Parent Container Name] container"
parentContainer:
  name: "[Parent Container Name]"
elements:
  # Components within the parent container's boundary
  - type: "component"
    name: "[Component A, e.g., Order Controller]"
    technology: "[e.g., Spring MVC Controller]"
    description: "[Responsibility of this component.]"
relationships:
  - source: "[Component A, e.g., Order Controller]"
    destination: "[Component B, e.g., Order Service]"
    description: "[e.g., Invokes]"
"""

# --- 3. Define the Agent Node Function ---

def yaml_structure_node(state: State, llm) -> Dict:
    """
    Convert the next pending textual analysis into a structured YAML string.

    The first level that has an analysis but no YAML yet is processed, checked in
    the order context, containers, then the component at the head of
    ``component_queue``.

    Args:
        state: Graph state; reads ``c4_model`` and ``component_queue``.
        llm: Chat model that writes the YAML.

    Returns:
        ``{"c4_model": <updated copy>}`` with ``yaml_definition`` filled in for
        the processed level, or ``{}`` when no level is ready for conversion.
    """
    model = state["c4_model"]
    target = None

    # --- Pick the level to work on ---
    context_entry = model.get("context", {})
    if context_entry.get("analysis") and not context_entry.get("yaml_definition"):
        level = "context"
        print("--- 📝 Generating Context Level YAML ---")
        analysis = model["context"]["analysis"]
        template = CONTEXT_YAML_TEMPLATE
        context = ""
    else:
        container_entry = model.get("containers", {})
        if container_entry.get("analysis") and not container_entry.get("yaml_definition"):
            level = "container"
            print("--- 📝 Generating Container Level YAML ---")
            analysis = model["containers"]["analysis"]
            template = CONTAINER_YAML_TEMPLATE
            context = f"Context Level YAML (for reference):\n{model['context']['yaml_definition']}"
        else:
            queue = state.get("component_queue", [])
            if not queue:
                return {}  # Nothing left to do
            target = queue[0]  # Peek only; the queue is not modified here
            component_entry = model.get("components", {}).get(target, {})
            if not component_entry.get("analysis") or component_entry.get("yaml_definition"):
                return {}  # Analysis not available yet, or YAML already written
            level = "component"
            print(f"--- 📝 Generating Component Level YAML for '{target}' ---")
            analysis = model["components"][target]["analysis"]
            template = COMPONENT_YAML_TEMPLATE
            context = f"Container Level YAML (for reference):\n{model['containers']['yaml_definition']}"

    # --- Build the conversion chain ---
    persona = "You are a meticulous software architect. Your task is to convert a textual analysis into a structured YAML file. You must adhere strictly to the provided template."
    human_instructions = """Based on the provided textual analysis, generate a YAML file that strictly follows the structure of the YAML template.

        **Textual Analysis:**
        ```
        {analysis}
        ```

        {context}

        **YAML Template (Your output MUST follow this format):**
        ```yaml
        {template}
        ```

        **Instructions:**
        - Populate all fields in the template based on the analysis.
        - Do NOT deviate from the template's structure.
        - Your output must be ONLY the raw YAML string, starting with `level: ...`. Do not add any commentary, explanations, or markdown fences like ```yaml.
        """
    conversion_prompt = ChatPromptTemplate.from_messages([
        ("system", persona),
        ("human", human_instructions),
    ])
    conversion_chain = conversion_prompt | llm | StrOutputParser()

    yaml_output = conversion_chain.invoke({
        "analysis": analysis,
        "template": template,
        "context": context,
    })

    # --- Store the result on a deep copy so the incoming state is not mutated ---
    updated_model = copy.deepcopy(model)
    single_level_keys = {"context": "context", "container": "containers"}
    if level in single_level_keys:
        updated_model[single_level_keys[level]]["yaml_definition"] = yaml_output
    elif level == "component" and target:
        updated_model["components"].setdefault(target, {})["yaml_definition"] = yaml_output

    return {"c4_model": updated_model}


## PlantUML Diagram Node

In [17]:
import os
from typing import Dict
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser


# --- 2. Store PlantUML Syntax Guide ---
# This guide is crucial for ensuring the LLM generates valid C4 PlantUML.
# Syntax cheat-sheet handed to the model. The alias rule matters: YAML names contain
# spaces and punctuation, so the guide asks for an identifier-style alias derived
# from the name and keeps the readable name as the label.
PLANTUML_SYNTAX_GUIDE = """
You must use the C4-PlantUML library syntax. Here are the key elements:

1.  **Header:** Always start with `@startuml` and include the C4_Context.puml, C4_Container.puml, or C4_Component.puml file.
    ```plantuml
    @startuml
    !include https://raw.githubusercontent.com/plantuml-stdlib/C4-PlantUML/master/C4_Context.puml
    LAYOUT_WITH_LEGEND()
    ```

2.  **Elements:** Define elements using these functions. Use the 'name' from YAML as the label. Derive the alias from that name by replacing every character that is not a letter, digit, or underscore with `_` (aliases must not contain spaces), and reuse exactly the same alias in every `Rel`.
    - `Person(alias, label, description)`
    - `System(alias, label, description)`
    - `System_Ext(alias, label, description)`
    - `Container(alias, label, technology, description)`
    - `ContainerDb(alias, label, technology, description)`
    - `Component(alias, label, technology, description)`

3.  **Boundaries:** Use boundaries for container and component diagrams.
    - `System_Boundary(alias, label) { ... elements ... }`
    - `Container_Boundary(alias, label) { ... elements ... }`

4.  **Relationships:** Connect elements with `Rel`.
    - `Rel(source_alias, destination_alias, label, technology)`
"""


# --- 3. Define the Agent Node Function ---

def plantuml_diagram_node(state: State, llm) -> Dict:
    """
    Agent node that generates a PlantUML diagram string from the analysis and YAML.

    The node is state-driven: it picks the first C4 level (context, then
    containers, then the component at the head of ``component_queue``) that
    already has a ``yaml_definition`` but no ``diagram`` yet, asks the LLM to
    convert that YAML into C4-PlantUML, and stores the result under the
    level's ``diagram`` key.

    Args:
        state: Graph state; reads ``c4_model`` and (for the component level)
            ``component_queue``.
        llm: Chat model (any runnable usable in ``prompt | llm``) that
            produces the diagram text.

    Returns:
        ``{"c4_model": <deep-copied model with the new diagram>}``, or ``{}``
        when no level is currently waiting for a diagram.
    """
    # Imported locally so this node does not depend on a later notebook cell
    # having already executed `import copy`.
    import copy

    target = _select_diagram_target(state)
    if target is None:
        return {}  # Nothing is ready for a diagram, or everything is already drawn
    level, component_target, section = target

    if level == "component":
        print(f"--- 🎨 Generating {level.capitalize()} Level Diagram for '{component_target}' ---")
    else:
        print(f"--- 🎨 Generating {level.capitalize()} Level Diagram ---")

    yaml_def = section["yaml_definition"]
    # The analysis only adds context for the LLM; a level that somehow has YAML
    # but no stored analysis should still get a diagram instead of a KeyError.
    analysis = section.get("analysis", "")

    # --- Create the LLM Chain for this task ---
    persona = "You are an expert software architect and a specialist in generating C4 diagrams using PlantUML. Your task is to convert a YAML definition into a valid PlantUML diagram, using the accompanying analysis for context."
    prompt = ChatPromptTemplate.from_messages([
        ("system", persona),
        ("human", """Generate a C4 PlantUML diagram based on the provided YAML definition and textual analysis.

        **Reference Syntax Guide:**
        ```plantuml
        {syntax_guide}
        ```

        **Source YAML Definition:**
        ```yaml
        {yaml_def}
        ```

        **Source Textual Analysis (for context on relationships and descriptions):**
        ```
        {analysis}
        ```

        **Instructions:**
        - Convert every element and relationship from the YAML file into the correct PlantUML syntax.
        - Use the `analysis` text to write better descriptions for relationships if needed.
        - Your output must be ONLY the raw PlantUML code, starting with `@startuml`. Do not add any commentary, explanations, or markdown fences like ```plantuml.
        """),
    ])

    chain = prompt | llm | StrOutputParser()

    # --- Invoke the chain and prepare the state patch ---
    diagram_code = chain.invoke({
        "syntax_guide": PLANTUML_SYNTAX_GUIDE,
        "yaml_def": yaml_def,
        "analysis": analysis,
    })

    # Work on a deep copy so the incoming state object is never modified in place.
    updated_model = copy.deepcopy(state["c4_model"])

    if level == "context":
        updated_model["context"]["diagram"] = diagram_code
    elif level == "container":
        updated_model["containers"]["diagram"] = diagram_code
    else:
        updated_model["components"].setdefault(component_target, {})["diagram"] = diagram_code

    # Return the whole updated model as the patch for the `c4_model` key.
    return {"c4_model": updated_model}


def _select_diagram_target(state):
    """Return (level, component_target, section) for the next diagram to draw, or None.

    A level qualifies when its section has a truthy ``yaml_definition`` and no
    truthy ``diagram``. Context is checked first, then containers, then the
    component at the head of ``component_queue`` (peeked, never popped).
    ``component_target`` is None for the context and container levels.
    """
    c4_model = state["c4_model"]

    for level, key in (("context", "context"), ("container", "containers")):
        section = c4_model.get(key, {})
        if section.get("yaml_definition") and not section.get("diagram"):
            return level, None, section

    component_queue = state.get("component_queue", [])
    if not component_queue:
        return None  # Queue missing, None, or empty: nothing left to do

    component_target = component_queue[0]  # Peek
    section = c4_model.get("components", {}).get(component_target, {})
    if section.get("yaml_definition") and not section.get("diagram"):
        return "component", component_target, section
    return None  # This component isn't ready or is already done

## Utility nodes and routers

In [18]:
import yaml


def populate_component_queue_node(state: State) -> Dict:
    """
    Parses the container YAML to find container names and adds them to the queue.

    Reads ``state["c4_model"]["containers"]["yaml_definition"]``, collects the
    ``name`` of every entry in the top-level ``elements`` list whose ``type``
    is ``"container"``, and returns them as a new ``deque``.

    Returns:
        ``{"component_queue": deque([...])}`` on success, or ``{}`` (queue left
        untouched) when the YAML cannot be parsed or is not a mapping.
    """
    print("--- ⚙️ Populating Component Queue ---")
    container_yaml = state["c4_model"]["containers"]["yaml_definition"]
    try:
        data = yaml.safe_load(container_yaml)
    except yaml.YAMLError as e:
        print(f"Error parsing YAML: {e}")
        return {} # No change to the queue on error

    # safe_load returns None for empty input and a scalar/list for non-mapping
    # documents; neither has `.get`, so treat them like a parse failure.
    if not isinstance(data, dict):
        print(f"Error parsing YAML: expected a mapping at the top level, got {type(data).__name__}")
        return {}

    # `elements: null` yields None, hence the `or []`.
    elements = data.get("elements") or []
    # Keep only well-formed container entries; entries without a name cannot
    # be queued and are skipped instead of raising KeyError.
    container_names = [
        elem["name"]
        for elem in elements
        if isinstance(elem, dict) and elem.get("type") == "container" and elem.get("name")
    ]
    print(f"Found containers to process: {container_names}")
    return {"component_queue": deque(container_names)}

def complete_component_node(state: State) -> Dict:
    """
    Pops the completed component from the front of the queue.

    The queue stored in ``state`` is never modified: the remaining items are
    copied into a new ``deque`` before the head is removed, matching the
    copy-before-write approach the other nodes use for ``c4_model``. Building
    a fresh ``deque`` also lets the node accept a queue that arrives as a
    plain list.

    Returns:
        ``{"component_queue": <queue without its first item>}``. When the
        queue is empty or ``None`` it is returned unchanged.
    """
    print("--- ✅ Completing Component Task ---")
    current_queue = state["component_queue"]
    if not current_queue:
        # Nothing to complete; hand the (empty/None) value back as-is.
        return {"component_queue": current_queue}

    remaining = deque(current_queue)
    completed_item = remaining.popleft()
    print(f"Finished processing: {completed_item}")
    return {"component_queue": remaining}

def should_process_components(state: State) -> str:
    """
    Routing function for the component loop.

    Returns "process_component" while ``state["component_queue"]`` is truthy
    (at least one item queued) and "end_workflow" otherwise, i.e. when the
    queue is empty or ``None``.
    """
    print("--- 🤔 Checking Component Queue ---")
    has_pending_work = bool(state["component_queue"])
    if not has_pending_work:
        print("Queue is empty. Finishing workflow.")
        return "end_workflow"
    print("Queue is not empty. Processing next component.")
    return "process_component"

# 2. After a diagram is created, decide what to do next
def post_diagram_router(state: State) -> str:
    """Pick the node that follows the diagram step.

    - "analysis": the context diagram exists and the containers section is
      still empty, so container analysis comes next.
    - "populate_queue": the container diagram exists and the component queue
      has never been set (is ``None``).
    - "complete_component": every other case, i.e. a component pass finished.
    """
    model = state["c4_model"]

    context_drawn = model["context"].get("diagram")
    if context_drawn and not model.get("containers"):
        return "analysis"

    containers_drawn = model.get("containers", {}).get("diagram")
    if containers_drawn and state["component_queue"] is None:
        return "populate_queue"

    return "complete_component"

## Build graph

### Choose LLM model

In [19]:
from typing import Literal
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import ChatOpenAI
from langchain_deepseek import ChatDeepSeek
from langchain_xai import ChatXAI


# Maps every supported model identifier to the LangChain chat-model class that
# get_llm instantiates for it.
MODEL_PROVIDER_MAP = {
    "gemini-1.5-flash-latest": ChatGoogleGenerativeAI,
    "gemini-1.5-pro-latest": ChatGoogleGenerativeAI,
    "gemini-2.5-flash-preview-05-20": ChatGoogleGenerativeAI,
    "gemini-2.5-pro-preview-05-20": ChatGoogleGenerativeAI,
    "gemini-2.5-pro-preview-06-05": ChatGoogleGenerativeAI,
    "gpt-4o": ChatOpenAI,
    "gpt-4o-mini": ChatOpenAI,
    "deepseek-chat": ChatDeepSeek,
    "grok-beta": ChatXAI,
    "grok-3-latest": ChatXAI,
}

# Static type hint for the supported model names. `Literal` only accepts
# literal arguments, so this list is maintained by hand and must list exactly
# the keys of MODEL_PROVIDER_MAP; update both together.
ModelName = Literal[
    "gemini-1.5-flash-latest", "gemini-1.5-pro-latest",
    "gemini-2.5-flash-preview-05-20", "gemini-2.5-pro-preview-05-20",
    "gemini-2.5-pro-preview-06-05",
    "gpt-4o", "gpt-4o-mini",
    "deepseek-chat",
    "grok-beta", "grok-3-latest"
]

def get_llm(model_name: ModelName, temperature: float = 0.0) -> BaseChatModel:
    """
    Instantiates and returns a language model based on a direct mapping.

    Args:
        model_name: Key of ``MODEL_PROVIDER_MAP`` identifying the model.
        temperature: Sampling temperature forwarded to the model constructor.

    Returns:
        An instance of the provider class mapped to ``model_name``, built as
        ``model_class(model=model_name, temperature=temperature)``.

    Raises:
        ImportError: If ``model_name`` is not a key of ``MODEL_PROVIDER_MAP``.
            The exception type is kept for backward compatibility; the
            message lists the supported names because a mistyped model name
            is the usual cause.
    """
    print(f"--- ⚙️  Instantiating model: {model_name} ---")

    model_class = MODEL_PROVIDER_MAP.get(model_name)

    if model_class is None:
        supported_models = ", ".join(sorted(MODEL_PROVIDER_MAP))
        raise ImportError(
            f"Model '{model_name}' is not available. "
            f"Supported models: {supported_models}. "
            f"If the name is correct, check if its provider library (e.g., langchain_xai) is installed."
        )

    return model_class(model=model_name, temperature=temperature)

### Build C4 modeler graph

In [20]:
from typing import Literal, Type, List
from langgraph.graph import StateGraph, END
import copy
import functools


def create_c4_modeler_graph(
    checkpointer,
    model_name: str = "gemini-1.5-flash-latest",
    analysis_method: Literal["simple", "collaborative"] = "collaborative",
    collab_rounds: int = 2,
) -> Type[StateGraph]:
    """
    Factory function to build the C4 Modeler workflow.

    Wires the nodes "analysis" -> "yaml" -> "diagram" and then routes with
    post_diagram_router back to "analysis", to "populate_queue", or to
    "complete_component"; the latter two use should_process_components to
    either loop back to "analysis" or finish.

    Args:
        checkpointer: Passed unchanged to ``workflow.compile``.
        model_name: Model identifier handed to ``get_llm``.
        analysis_method: "simple" binds ``analysis_agent_node`` to the LLM;
            any other value installs the collaborative node defined below.
        collab_rounds: Forwarded as ``max_rounds`` to the collaboration
            subgraph (only used by the collaborative node).

    Returns:
        The object returned by ``workflow.compile(checkpointer=checkpointer)``.
        NOTE(review): the ``Type[StateGraph]`` annotation describes the
        StateGraph class itself, not the compiled graph instance that is
        actually returned — confirm and correct the annotation.
    """
    print(f"--- 🏗️ Building graph with model: '{model_name}' and analysis: '{analysis_method}' ---")

    # One LLM instance is shared by the analysis, YAML and diagram nodes.
    llm = get_llm(model_name=model_name)

    workflow = StateGraph(State)

    if analysis_method == "simple":
        bound_analysis_agent_node = functools.partial(analysis_agent_node, llm=llm)
        workflow.add_node("analysis", bound_analysis_agent_node)
    else:
        # Collaborative analysis: a closure over `llm` and `collab_rounds`.
        def collaborative_analysis_node(state: State) -> dict:
            """
            Orchestrates one collaborative analysis pass.

            Determines the current C4 level from which sections of
            ``state["c4_model"]`` are still empty, selects the matching team,
            builds and invokes a collaboration subgraph for it, and returns a
            deep-copied ``c4_model`` with the subgraph's ``final_analysis``
            stored for that level.
            """
            print("--- 🚀 Orchestrating Collaborative Analysis ---")

            # 1. Determine the current C4 level and select the appropriate team
            level = ""
            component_target = None
            active_team: List[Agent] = [] # The team we will use for the subgraph

            # An empty/missing section means that level has not been analysed yet.
            if not state["c4_model"].get("context"):
                level = "context"
                active_team = context_level_team
                print("--- Selecting Team: Context Level ---")
            elif not state["c4_model"].get("containers"):
                level = "container"
                active_team = container_level_team
                print("--- Selecting Team: Container Level ---")
            else:
                level = "component"
                active_team = component_level_team
                # Peek at the head of the queue; the item is removed later by
                # complete_component_node, not here.
                if state.get("component_queue"):
                    component_target = state["component_queue"][0]
                print(f"--- Selecting Team: Component Level for '{component_target}' ---")

            # 2. Create the specialized subgraph using the selected team.
            # A new subgraph is built on every invocation of this node.
            analysis_subgraph = create_collaboration_graph(
                llm=llm,
                team=active_team,
                max_rounds=collab_rounds
            )

            # 3. Prepare the input for the subgraph
            subgraph_level_description = f"component '{component_target}'" if component_target else level
            subgraph_input = {
                "messages": [HumanMessage(content=f"Let's begin the C4 analysis for the {subgraph_level_description}:\n\n{state['system_brief']}")],
                "system_brief": state['system_brief'],
                # Only the context-level analysis is forwarded as prior context
                # ('' until the context level has been analysed).
                "context": state['c4_model'].get('context', {}).get('analysis', ''),
                "level": subgraph_level_description,
                "max_rounds": collab_rounds,
                "team": active_team, # The team is also passed through the subgraph's state
            }

            # 4. Invoke the subgraph
            print(f"--- Invoking subgraph for: {subgraph_level_description} ---")
            subgraph_output = analysis_subgraph.invoke(subgraph_input)
            final_analysis = subgraph_output['final_analysis']

            # 5. Update the main graph's state with the result
            print(f"--- ✅ Subgraph complete. Updating main C4 model for: {subgraph_level_description} ---")
            # Deep copy so the incoming state is not modified in place.
            updated_model = copy.deepcopy(state['c4_model'])

            # Context/container sections are replaced wholesale with a fresh
            # dict holding only the analysis.
            if level == "context":
                updated_model["context"] = {"analysis": final_analysis}
            elif level == "container":
                updated_model["containers"] = {"analysis": final_analysis}
            # NOTE(review): at component level with no queued component
            # (component_target is None) the analysis above is computed but
            # not stored anywhere — confirm this case cannot occur.
            elif level == "component" and component_target:
                if "components" not in updated_model:
                    updated_model["components"] = {}
                if component_target not in updated_model["components"]:
                    updated_model["components"][component_target] = {}
                updated_model["components"][component_target]["analysis"] = final_analysis
                print(f"--- Successfully updated analysis for component: '{component_target}' ---")

            return {"c4_model": updated_model}

        workflow.add_node("analysis", collaborative_analysis_node)

    # --- Nodes shared by both analysis methods ---
    bound_yaml_structure_node = functools.partial(yaml_structure_node, llm=llm)
    bound_plantuml_diagram_node = functools.partial(plantuml_diagram_node, llm=llm)

    workflow.add_node("yaml", bound_yaml_structure_node)
    workflow.add_node("diagram", bound_plantuml_diagram_node)
    workflow.add_node("populate_queue", populate_component_queue_node)
    workflow.add_node("complete_component", complete_component_node)

    # Linear core of every pass: analysis -> yaml -> diagram.
    workflow.set_entry_point("analysis")
    workflow.add_edge("analysis", "yaml")
    workflow.add_edge("yaml", "diagram")

    # After each diagram, post_diagram_router picks the next stage.
    workflow.add_conditional_edges("diagram", post_diagram_router, {
        "analysis": "analysis",
        "populate_queue": "populate_queue",
        "complete_component": "complete_component"
    })
    # Both queue-handling nodes loop back to analysis while the queue is
    # non-empty and end the workflow otherwise.
    workflow.add_conditional_edges(
        "populate_queue", should_process_components,
        {"process_component": "analysis", "end_workflow": END}
    )
    workflow.add_conditional_edges(
        "complete_component", should_process_components,
        {"process_component": "analysis", "end_workflow": END}
    )

    app = workflow.compile(checkpointer=checkpointer)
    print("✅ LangGraph C4 Modeler compiled successfully with checkpointer!")
    return app

In [21]:
# # Optionally, visualize the graph.
# from IPython.display import Image, display

# display(Image(app.get_graph(xray=True).draw_mermaid_png()))

## Run all experiments

In [22]:
from datetime import datetime
from typing import Dict, List, Any
import re
import uuid

def run_all_experiments(
    app_instance,
    system_briefs_data: Dict[str, str]
) -> List[Dict[str, Any]]:
    """
    Execute the compiled C4 modeler once per system brief and gather the outcomes.

    Each brief runs on its own thread id of the form
    ``<YYYYmmdd-HHMMSS>-<slugified brief name>-<8 hex chars>`` with a
    recursion limit of 200. The graph is streamed to completion (the name of
    each emitted node is printed) and the final ``c4_model`` is then read back
    through ``app_instance.get_state``.

    Args:
        app_instance: The compiled LangGraph application; must provide
            ``stream(state, config)`` and ``get_state(config)``.
        system_briefs_data (Dict[str, str]): Maps a brief name to the text of
            that system brief.

    Returns:
        List[Dict[str, Any]]: One dictionary per brief, in input order, with
        the keys 'brief_name', 'thread_id', 'system_brief_content' and
        'final_c4_model'.
    """
    print("\n--- 🚀 Starting C4 Model Generation Experiments ---")

    wide_rule = "=" * 80
    narrow_rule = "=" * 40
    experiment_results = []

    for brief_name, system_brief_content in system_briefs_data.items():
        print(f"\n\n{wide_rule}")
        print(f"--- Processing: {brief_name} ---")
        print(f"{wide_rule}\n")

        # Unique, human-readable thread id: timestamp + slug + random suffix.
        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        dashed_lower_name = brief_name.replace(" ", "-").lower()
        brief_name_slug = re.sub(r'[^a-zA-Z0-9-]', '', dashed_lower_name)
        current_thread_id = "-".join((timestamp, brief_name_slug, uuid.uuid4().hex[:8]))

        config = {
            "configurable": {"thread_id": current_thread_id},
            "recursion_limit": 200,
        }
        # Fresh, empty model for every brief.
        initial_state = {
            "messages": [],
            "system_brief": system_brief_content,
            "c4_model": {"context": {}, "containers": {}, "components": {}},
            "component_queue": None,
        }

        print(f"\n--- LangGraph Thread ID: {current_thread_id} ---")

        # Drain the stream; only the first key of each event is reported.
        for event in app_instance.stream(initial_state, config):
            print("\n" + narrow_rule)
            first_node = list(event.keys())[0]
            print(f"Node: {first_node}")
            print(narrow_rule)

        final_c4_model = app_instance.get_state(config).values['c4_model']

        run_record = {
            'brief_name': brief_name,
            'thread_id': current_thread_id,
            'system_brief_content': system_brief_content,
            'final_c4_model': final_c4_model
        }
        experiment_results.append(run_record)

        print(f"\n--- 🎉 C4 Model Generation Complete for {brief_name}! ---")
        print(f"Final state for thread '{current_thread_id}' retrieved and stored.")

    print("\n\n--- ✅ All C4 Model Generation Experiments Complete! ---")
    return experiment_results

In [23]:
# Run all the experiments to generate C4 models
# all_experiment_raw_results = run_all_experiments(
#     app_instance=app,
#     system_briefs_data=system_briefs
# )

### Saving to zip

In [24]:
# --- RETRIEVE FINAL STATE AND SAVE ARTIFACTS TO FILES ---

import os
import re

def save_c4_artifacts(output_dir, final_c4_model):
    """
    Saves C4 model artifacts (analysis, definitions, diagrams) to the specified output directory.

    For every non-empty level three files are written: ``*_analysis.md``,
    ``*_definition.yaml`` and ``*_diagram.puml``. Context and container files
    go directly into ``output_dir`` (prefixed ``1_context`` / ``2_container``);
    component files go into ``output_dir/3_components`` and are prefixed with
    the sanitized container name. Missing artifacts are written as empty files.

    Args:
        output_dir (str): The base directory where artifacts will be saved.
            It is created (including parents) if it does not exist yet.
        final_c4_model (dict): A dictionary containing the C4 model data,
                               including 'context', 'containers', and 'components' sections.

    Raises:
        OSError: If ``output_dir`` or the components sub-directory cannot be
            created. Failures while writing an individual file are reported
            on stdout and do not abort the remaining saves.
    """
    # (model key, file-name suffix) for the three artifacts stored per level.
    artifact_kinds = (
        ("analysis", "analysis.md"),
        ("yaml_definition", "definition.yaml"),
        ("diagram", "diagram.puml"),
    )

    def save_artifact(filepath, content):
        """
        Safely saves content to a file.
        """
        try:
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(content)
            print(f"  - Saved {os.path.basename(filepath)}")
        except Exception as e:
            # Best effort: one unwritable file must not stop the other saves.
            print(f"  - Failed to save {os.path.basename(filepath)}: {e}")

    def sanitize_filename(name):
        """
        Replaces every character other than ASCII letters, digits, '_' and '-'
        with an underscore and lower-cases the result.
        """
        return re.sub(r'[^a-zA-Z0-9_-]', '_', name).lower()

    def save_level(path_prefix, level_data):
        """Writes the analysis/definition/diagram triple as '<path_prefix>_<suffix>'."""
        for key, suffix in artifact_kinds:
            save_artifact(f"{path_prefix}_{suffix}", level_data.get(key, ""))

    # Without this, every context/container save failed (and was only printed)
    # whenever the caller had not created the directory beforehand.
    os.makedirs(output_dir, exist_ok=True)

    # Save Context level artifacts
    if final_c4_model.get("context"):
        save_level(f"{output_dir}/1_context", final_c4_model["context"])

    # Save Container level artifacts
    if final_c4_model.get("containers"):
        save_level(f"{output_dir}/2_container", final_c4_model["containers"])

    # Save Component level artifacts
    if final_c4_model.get("components"):
        component_dir = f"{output_dir}/3_components"
        os.makedirs(component_dir, exist_ok=True)
        for container_name, component_data in final_c4_model["components"].items():
            safe_container_name = sanitize_filename(container_name)
            save_level(f"{component_dir}/{safe_container_name}", component_data)

In [25]:
# import shutil
# from google.colab import files
# import os

# folder_to_zip = 'c4_artifacts'
# zip_filename = 'c4_model_artifacts.zip'

# print(f"Zipping the folder: '{folder_to_zip}'...")

# try:
#     shutil.make_archive(zip_filename.replace('.zip', ''), 'zip', folder_to_zip)
#     print(f"Successfully created zip file: '{zip_filename}'")
#     print("Starting download... Please wait.")
#     print("Note: If the download doesn't start, check if your browser is blocking pop-ups for this site.")

#     files.download(zip_filename)

# except FileNotFoundError:
#     print(f"Error: The directory '{folder_to_zip}' was not found. Please ensure the previous script ran successfully.")
# except Exception as e:
#     print(f"An error occurred: {e}")

# Evaluation

## Evaluation agents

#### Helper functions

In [26]:
import os
import re
import json
import uuid
import requests
import subprocess
from langchain_openai import ChatOpenAI

# Judge model used by the LLM-based evaluation metrics.
# The dedicated judge below is disabled; the judge is currently whatever the
# name `llm` refers to when this cell runs.
# NOTE(review): `llm` is not assigned at module level anywhere in the visible
# part of the notebook — confirm an earlier cell defines it, otherwise this
# line fails on a fresh kernel. Reusing the generating model as its own judge
# is also not "unbiased"; re-enable the dedicated judge if independence matters.
# judge_llm = ChatOpenAI(model="gpt-4o", temperature=0)
judge_llm = llm

# Download location and local file name of the PlantUML runner that
# setup_plantuml() fetches for the automated compilation check.
PLANTUML_JAR_URL = "https://github.com/plantuml/plantuml/releases/download/v1.2024.5/plantuml-1.2024.5.jar"
PLANTUML_JAR_PATH = "plantuml.jar"

def setup_plantuml():
    """Downloads plantuml.jar if it doesn't exist.

    The jar is streamed to ``<PLANTUML_JAR_PATH>.part`` and only renamed to
    ``PLANTUML_JAR_PATH`` after the download completed, so an interrupted
    transfer can never leave a truncated jar that later calls would mistake
    for a valid one.

    Returns:
        bool: True if the jar is available (already present or downloaded
        successfully), False if the download failed.
    """
    if os.path.exists(PLANTUML_JAR_PATH):
        return True

    print(f"Downloading PlantUML runner from {PLANTUML_JAR_URL}...")
    partial_path = f"{PLANTUML_JAR_PATH}.part"
    try:
        # The timeout bounds connect/read stalls; without it a hung server
        # would block the notebook indefinitely.
        with requests.get(PLANTUML_JAR_URL, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_path, PLANTUML_JAR_PATH)
        print("PlantUML runner downloaded successfully.")
    except Exception as e:
        print(f"Error downloading PlantUML: {e}")
        # Remove whatever was written so the next call retries from scratch.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        return False
    return True

#### Compilation success

In [27]:
import os
import uuid
import subprocess
from typing import Dict, Any

# Assume setup_plantuml() and PLANTUML_JAR_PATH are defined elsewhere

def evaluate_compilation_success(c4_model: dict) -> dict:
    """
    Calculates the percentage of diagrams that compile and captures detailed
    diagnostics for each failure.

    Every diagram stored in the model (context, containers, each component)
    is written to a scratch file and rendered to SVG with the PlantUML jar.
    Empty diagrams count towards the total but are recorded as failures
    without invoking PlantUML.

    Args:
        c4_model: Model dict with optional 'context', 'containers' and
            'components' sections, each holding a 'diagram' string.

    Returns:
        dict: ``{"metric", "score", "successful", "total", "details"}`` where
        ``details`` has one ``{"source", "status", "error"}`` entry per
        diagram; ``{"score": 0, "successful": 0, "total": 0, "details": []}``
        when the model holds no diagrams; or ``{"error": ...}`` when the
        PlantUML runner or Java is unavailable.
    """
    # Local import: `tempfile` is not imported by the notebook's import cells.
    import tempfile

    print("🤖 Evaluating Metric: PlantUML Compilation Success...")
    if not setup_plantuml():
        return {"error": "PlantUML runner not available."}

    # Collect diagrams as a list of dicts so each result can name its source.
    diagram_sources = []
    if c4_model.get("context", {}).get("diagram") is not None:
        diagram_sources.append({"source": "1_Context", "code": c4_model["context"]["diagram"]})
    if c4_model.get("containers", {}).get("diagram") is not None:
        diagram_sources.append({"source": "2_Containers", "code": c4_model["containers"]["diagram"]})
    for name, comp in c4_model.get("components", {}).items():
        if comp.get("diagram") is not None:
            diagram_sources.append({"source": f"3_Component_{name}", "code": comp["diagram"]})

    total_files = len(diagram_sources)
    if total_files == 0:
        return {"score": 0, "successful": 0, "total": 0, "details": []}

    compilation_details = []
    successful_compilations = 0

    # All scratch files live in a throw-away directory. PlantUML writes its
    # output next to the source file and may name it after the diagram rather
    # than the source file, so removing only "<temp>.svg" could leave stray
    # files behind; deleting the whole directory cleans up in every case.
    # NOTE(review): a relative `!include` inside a diagram now resolves
    # against this scratch directory instead of the working directory.
    with tempfile.TemporaryDirectory(prefix="plantuml_check_") as work_dir:
        for diagram in diagram_sources:
            source_name = diagram["source"]
            puml_code = diagram["code"]

            if not puml_code or not puml_code.strip():
                print(f"  - ❌ FAILED: '{source_name}' content is empty.")
                compilation_details.append({"source": source_name, "status": "Failed - Empty", "error": "Diagram content was empty or whitespace."})
                continue

            temp_filename = os.path.join(work_dir, f"temp_diagram_{uuid.uuid4()}.puml")
            with open(temp_filename, "w", encoding="utf-8") as f:
                f.write(puml_code)

            try:
                subprocess.run(
                    ["java", "-jar", PLANTUML_JAR_PATH, "-failfast2", "-tsvg", temp_filename],
                    check=True, capture_output=True, text=True
                )
                successful_compilations += 1
                compilation_details.append({"source": source_name, "status": "Compiled", "error": None})
                print(f"  - ✅ SUCCESS: '{source_name}' compiled.")
            except subprocess.CalledProcessError as e:
                # Capture the actual error message from PlantUML; fall back to
                # stdout when nothing was written to stderr.
                error_message = (e.stderr or e.stdout or "").strip()
                compilation_details.append({"source": source_name, "status": "Failed - Syntax Error", "error": error_message})
                print(f"  - ❌ FAILED: '{source_name}' syntax error.")
            except FileNotFoundError:
                # Raised when the `java` executable itself cannot be started.
                return {"error": "Java or PlantUML JAR not found. Cannot perform compilation check."}

    score = (successful_compilations / total_files) * 100 if total_files > 0 else 0
    return {
        "metric": "Compilation Success Rate",
        "score": round(score, 2),
        "successful": successful_compilations,
        "total": total_files,
        "details": compilation_details  # Per-diagram diagnostics
    }

In [28]:
# evaluate_compilation_success(all_experiment_raw_results[0]['final_c4_model'])

#### Abstraction adherence

In [29]:
import re
from typing import Dict, Any, Tuple

# ==============================================================================
# 1. DEDICATED & DETAILED RULE CHECKERS
# These now return a tuple: (bool, str) -> (is_valid, reason)
# ==============================================================================

def _check_context_rules(code: str) -> Tuple[bool, str]:
    """Checks rules for a C4 Context Diagram."""
    if re.search(r'Container\s*\(', code, re.IGNORECASE):
        return (False, "Illegal 'Container' element found in a Context diagram.")
    if re.search(r'Component\s*\(', code, re.IGNORECASE):
        return (False, "Illegal 'Component' element found in a Context diagram.")
    # A context diagram should generally define the main system
    if not (re.search(r'System\s*\(', code, re.IGNORECASE) or re.search(r'SystemDb\s*\(', code, re.IGNORECASE) or re.search(r'System_Ext\s*\(', code, re.IGNORECASE)):
        return (False, "Required 'System', 'SystemDb', or 'System_Ext' element appears to be missing.")
    return (True, "Adheres to abstraction level.")

def _check_container_rules(code: str) -> Tuple[bool, str]:
    """Checks rules for a C4 Container Diagram."""
    if re.search(r'Component\s*\(', code, re.IGNORECASE):
        return (False, "Illegal 'Component' element found in a Container diagram.")
    if not re.search(r'Container\s*\(', code, re.IGNORECASE):
        return (False, "Required 'Container' element appears to be missing.")
    if not re.search(r'System_Boundary\s*\(', code, re.IGNORECASE):
        return (False, "Required 'System_Boundary' element appears to be missing.")
    return (True, "Adheres to abstraction level.")

def _check_component_rules(code: str) -> Tuple[bool, str]:
    """Checks rules for a C4 Component Diagram."""
    if not re.search(r'Component\s*\(', code, re.IGNORECASE):
        return (False, "Required 'Component' element appears to be missing.")
    if not re.search(r'Container_Boundary\s*\(', code, re.IGNORECASE):
        return (False, "Required 'Container_Boundary' element appears to be missing.")
    return (True, "Adheres to abstraction level.")


def evaluate_abstraction_adherence(c4_model: dict) -> dict:
    """Score how many diagrams only use elements suited to their C4 level.

    Each non-empty diagram in the model is passed to the rule checker for its
    level (context, containers, component). The score is the percentage of
    diagrams whose checker reports success.

    Returns:
        ``{"metric", "score", "details"}`` where ``details`` maps a diagram
        name to ``{"status", "reason"}``; ``{"score": 0, "details": {}}`` when
        the model contains no diagrams.
    """
    print("🤖 Evaluating Metric: C4 Abstraction Adherence...")

    # Dispatcher: diagram type -> rule checker returning (is_valid, reason).
    checkers_by_type = {
        'Context': _check_context_rules,
        'Containers': _check_container_rules,
        'Component': _check_component_rules,
    }

    # Gather (type, display name, code) for every diagram that is present.
    pending = []
    context_code = c4_model.get("context", {}).get("diagram")
    if context_code:
        pending.append(("Context", "Context", context_code))
    containers_code = c4_model.get("containers", {}).get("diagram")
    if containers_code:
        pending.append(("Containers", "Containers", containers_code))
    for name, comp_data in c4_model.get("components", {}).items():
        component_code = comp_data.get("diagram")
        if component_code:
            pending.append(("Component", f"Component: {name}", component_code))

    total_diagrams = len(pending)
    if total_diagrams == 0:
        return {"score": 0, "details": {}}

    passes = 0
    detailed_results = {}

    for diagram_type, diagram_name, diagram_code in pending:
        checker_func = checkers_by_type.get(diagram_type)

        if not checker_func:
            # Defensive: every type collected above has a checker.
            detailed_results[diagram_name] = {"status": "Unknown", "reason": "No rule checker found for this diagram type."}
            continue

        is_valid, reason = checker_func(diagram_code)
        if is_valid:
            passes += 1
            status = "Pass"
        else:
            status = "Fail"
        detailed_results[diagram_name] = {"status": status, "reason": reason}

    score = (passes / total_diagrams) * 100 if total_diagrams > 0 else 0
    return {
        "metric": "Abstraction Adherence",
        "score": round(score, 2),
        "details": detailed_results
    }

#### Semantic consistency

In [30]:
def evaluate_semantic_consistency(system_brief: str, c4_model: dict, judge_llm) -> dict:
    """Evaluates how well the diagrams capture entities from the input brief.

    Two LLM calls are made with ``judge_llm``: the first extracts the key
    entities (people, external systems, the main system) from the brief, one
    per non-empty output line; the second answers YES/NO for each entity
    against the context diagram. The score is the percentage of entities
    answered YES.

    Args:
        system_brief: Free-text system description the model was built from.
        c4_model: Model dict; only ``c4_model["context"]["diagram"]`` is read.
        judge_llm: Chat model (runnable) used for both calls.

    Returns:
        ``{"metric", "score", "verified_items", "total_items"}``, or
        ``{"error": "Context diagram not found."}`` when the model has no
        context diagram.
    """
    print("⚖️ Evaluating Metric 3: Semantic Consistency...")
    context_diag = c4_model.get("context", {}).get("diagram")
    if not context_diag:
        return {"error": "Context diagram not found."}

    # Step 1: Extract "ground truth" entities from the brief
    extraction_prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a requirements analyst. Your task is to extract key entities from a system description. List all people (user roles), external systems, and the main system itself."),
        ("human", "Please extract the entities from the following brief:\n\n{brief}")
    ])
    extraction_chain = extraction_prompt | judge_llm | StrOutputParser()
    extracted_items_str = extraction_chain.invoke({"brief": system_brief})

    # Simple parsing of the extracted items: every non-blank line is one item.
    # NOTE(review): heading lines the model may emit (e.g. "People:") are
    # counted as items too, which deflates the score.
    extracted_items = [item.strip() for item in extracted_items_str.split('\n') if item.strip()]

    # Step 2: Verify the extracted items against the diagram
    verification_prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a meticulous verifier. For each item in the checklist, check if it is clearly represented in the provided PlantUML diagram. Respond with only 'YES' or 'NO' for each item."),
        ("human", """**Checklist:**
                   {checklist}

                   **PlantUML Diagram:**
                   ```puml
                   {diagram}
                   ```""")
    ])
    verification_chain = verification_prompt | judge_llm | StrOutputParser()
    verification_results_str = verification_chain.invoke({
        "checklist": "\n".join(f"- {item}" for item in extracted_items),
        "diagram": context_diag
    })

    # Count YES only as a whole word so substrings such as "EYES" or
    # "YESTERDAY" in an echoed item name are not mistaken for a verdict.
    affirmative_count = len(re.findall(r"\bYES\b", verification_results_str.upper()))
    total_items = len(extracted_items)
    # A chatty judge can emit more YES tokens than there are items; cap the
    # count so the score can never exceed 100.
    verified_count = min(affirmative_count, total_items)
    score = (verified_count / total_items) * 100 if total_items > 0 else 0

    return {
        "metric": "Semantic Consistency",
        "score": round(score, 2),
        "verified_items": verified_count,
        "total_items": total_items
    }

In [31]:
import yaml
import re
from typing import Dict, Any, List

def evaluate_definitional_consistency(
    yaml_definition_str: str,
    diagram_code_str: str,
    element_type: str # The YAML key holding the element list, e.g. 'containers' or 'components'
) -> Dict[str, Any]:
    """
    Checks if all elements defined in a YAML spec are present in a PlantUML diagram.
    This is used to ensure consistency between a definition and its visualization.

    Args:
        yaml_definition_str: The string content of the YAML definition file.
        diagram_code_str: The string content of the PlantUML diagram file.
        element_type: The key to look for in the YAML file (e.g., 'containers' or 'components').
            It is looked up at the top level first, then under a top-level 'system' mapping.

    Returns:
        On success, a dict with 'metric', 'score' (0-100, rounded to 2 decimals),
        'found_count', 'total_defined' and per-element 'details'.
        When an input is missing or the YAML has no usable element list, a dict with
        'score' 0 and an error message under 'details'.
        When no element carries a 'name', a dict with 'score' 100 and a message.
        On any unexpected failure, a dict with 'error', 'score' 0 and 'details'.
    """
    if not yaml_definition_str or not diagram_code_str:
        return {"score": 0, "details": {"error": f"Missing YAML definition or diagram for {element_type}."}}

    try:
        # 1. Parse the YAML to get the "ground truth" list of names
        definition_data = yaml.safe_load(yaml_definition_str)
        # A YAML document may legally be a list, a scalar or empty; only a mapping can hold the list.
        if not isinstance(definition_data, dict):
            return {"score": 0, "details": {"error": f"Could not find a list of '{element_type}' in the YAML."}}

        # The structure might be {'system': {'containers': [...]}} or just {'containers': [...]}
        elements = definition_data.get(element_type)
        if not elements:
            system_section = definition_data.get('system')
            # 'system:' with no value parses to None, so guard before descending into it.
            elements = system_section.get(element_type, []) if isinstance(system_section, dict) else []

        if not isinstance(elements, list):
            return {"score": 0, "details": {"error": f"Could not find a list of '{element_type}' in the YAML."}}

        # Ignore malformed entries (non-mappings, null names); coerce names to text for the regex.
        defined_names = [
            str(item['name'])
            for item in elements
            if isinstance(item, dict) and item.get('name') is not None
        ]

        if not defined_names:
            return {"score": 100, "details": {"message": f"No {element_type}s with a 'name' key found in the YAML definition."}}

        # 2. Verify each name is present in the diagram
        found_count = 0
        verification_details = []
        for name in defined_names:
            # The name must be the first argument of a macro call, quoted or bare,
            # e.g. Container(name, ...) or Container("name", ...).
            # NOTE(review): a name used only as the display label (second argument) is not matched.
            escaped = re.escape(name)
            if re.search(fr'\(\s*(?:"{escaped}"|{escaped})\s*,', diagram_code_str):
                found_count += 1
                verification_details.append({"element_name": name, "status": "Found"})
            else:
                verification_details.append({"element_name": name, "status": "Missing"})

        total_defined = len(defined_names)
        score = (found_count / total_defined) * 100

        return {
            "metric": f"{element_type.capitalize()} Definitional Consistency",
            "score": round(score, 2),
            "found_count": found_count,
            "total_defined": total_defined,
            "details": verification_details
        }

    except Exception as e:
        # Best-effort metric: report the failure rather than aborting the whole evaluation run.
        # 'error' is kept for existing callers; 'score'/'details' match the other error returns.
        message = f"Failed to process definitional consistency check: {e}"
        return {"error": message, "score": 0, "details": {"error": message}}

#### Qualitative rubric

In [32]:
from langchain_core.prompts import ChatPromptTemplate

def evaluate_qualitative_rubric(
    diagram_code: str,
    diagram_name: str,
    system_brief: str, # The original system requirements
    judge_llm
) -> dict:
    """
    Scores a single diagram based on a qualitative rubric using an LLM-as-a-Judge,
    providing the judge with the necessary system brief for context.

    Args:
        diagram_code: PlantUML source of the diagram to evaluate.
        diagram_name: Human-readable label used in the log line and the result.
        system_brief: The original system description the diagram is judged against.
        judge_llm: Chat model exposing `with_structured_output(schema)`.

    Returns:
        {'diagram_name', 'average_score', 'details'} on success, where 'details' is the
        judge's structured response; {'diagram_name', 'error'} if the judge call fails.
    """
    print(f"⚖️ Evaluating Metric: Qualitative Rubric for {diagram_name}...")

    # --- 1. JSON Schema for the structured response ---
    # NOTE(review): the previous revision left only a placeholder here ("schema remains identical").
    # This schema is reconstructed from the rubric table in the prompt and from the
    # `v['score']` consumer below; confirm the property names against the earlier version.
    def criterion(description: str) -> dict:
        return {
            "type": "object",
            "description": description,
            "properties": {
                "score": {"type": "integer", "description": "Score from 1 (poor) to 5 (excellent)."},
                "justification": {"type": "string", "description": "Brief justification for the score."}
            },
            "required": ["score", "justification"]
        }

    rubric_schema = {
        "title": "QualitativeRubricEvaluation",
        "description": "Rubric-based evaluation of a C4 diagram against the system brief.",
        "type": "object",
        "properties": {
            "completeness": criterion("Are all key entities and relationships from the brief present?"),
            "correctness": criterion("Are relationships logical, well-defined and correctly directed?"),
            "plausibility": criterion("Is the architecture realistic for the system's goals?"),
            "clarityAndNaming": criterion("Are element names clear, concise and consistent?")
        },
        "required": ["completeness", "correctness", "plausibility", "clarityAndNaming"]
    }

    # --- 2. Prompt: system brief, rubric table, then the diagram ---
    rubric_prompt = ChatPromptTemplate.from_messages([
        ("system", "You are an expert software architect acting as a judge. Your task is to evaluate the provided C4 diagram against the requirements of the original system brief. Provide a score from 1 (poor) to 5 (excellent) for each criterion, along with a brief justification. You must format your response as a JSON object that adheres to the provided schema."),
        ("human", """
        **System Brief to Reference:**
        ---
        {system_brief}
        ---

        **Rubric:**
        | Criterion         | 1 (Poor)                                                    | 3 (Average)                                                    | 5 (Excellent)                                                                      |
        |-------------------|-------------------------------------------------------------|----------------------------------------------------------------|------------------------------------------------------------------------------------|
        | **Completeness** | The diagram is missing core elements from the system brief. | The diagram includes most major elements but misses minor details. | All key entities and relationships required by the system brief are present.       |
        | **Correctness** | Relationships are illogical or incorrectly depicted.        | Relationships are mostly correct but have minor inaccuracies.    | All relationships are logical, well-defined, and correctly directed.               |
        | **Plausibility** | The architecture is unrealistic for the system's goals.     | The architecture is plausible but might be suboptimal.           | The architecture is highly plausible and follows common industry patterns.         |
        | **Clarity & Naming**| Element names are vague, inconsistent, or use jargon poorly.| Names are understandable but could be clearer.                 | Names are clear, concise, and follow standard naming conventions.                  |

        **Diagram to Evaluate:**
        ```puml
        {diagram}
        ```

        Please provide your JSON response now.
        """)
    ])

    # --- 3. Bind the schema to the LLM ---
    structured_judge_llm = judge_llm.with_structured_output(rubric_schema)
    rubric_chain = rubric_prompt | structured_judge_llm

    # --- 4. Invoke the chain and average the per-criterion scores ---
    try:
        results = rubric_chain.invoke({
            "diagram": diagram_code,
            "system_brief": system_brief
        })
        # Only well-formed criterion entries contribute, so one malformed entry
        # does not discard the remaining scores.
        scores = [
            entry["score"]
            for entry in results.values()
            if isinstance(entry, dict) and isinstance(entry.get("score"), (int, float))
        ]
        average_score = sum(scores) / len(scores) if scores else 0
        return {
            "diagram_name": diagram_name,
            "average_score": round(average_score, 2),
            "details": results
        }
    except Exception as e:
        # Best-effort metric: surface the failure in the report instead of aborting the run.
        return {"diagram_name": diagram_name, "error": f"Failed to get rubric score: {e}"}

#### Cross level consistency

In [33]:
import yaml # Requires PyYAML: pip install pyyaml
from typing import Dict, List, Set, Any

# --- Helper Functions ---

def parse_yaml_safe(yaml_string: str) -> Dict:
    """
    Safely parses a YAML string into a mapping.

    Returns an empty dict when the input is empty, when it is not valid YAML, or when
    the document root is not a mapping (e.g. a list or a bare scalar). Callers use
    `.get(...)` on the result, so a non-dict root must never be passed through.
    """
    if not yaml_string:
        return {}
    try:
        parsed = yaml.safe_load(yaml_string)
    except yaml.YAMLError:
        return {}
    return parsed if isinstance(parsed, dict) else {}

def extract_element_names(parsed_yaml: Dict, element_types: List[str]) -> Set[str]:
    """
    Extracts a set of names for given element types from parsed YAML data.

    Args:
        parsed_yaml: Parsed definition; elements are read from its "elements" list.
        element_types: Values of an element's "type" field to include.

    Returns:
        Names of the matching elements. Entries that are not mappings, or that have
        no (or an empty) "name", are skipped so that `None` never appears in the set.
    """
    names = set()
    # `elements:` with no value parses to None, hence the `or []`.
    for element in parsed_yaml.get("elements") or []:
        if not isinstance(element, dict):
            continue
        name = element.get("name")
        if name and element.get("type") in element_types:
            names.add(name)
    return names

# --- Level-Specific Consistency Checks (helpers for the main evaluation function) ---

def _check_context_to_container(context_data: Dict, container_data: Dict) -> Tuple[bool, str]:
    """
    Performs a two-way consistency check between Context and Container levels.

    Compares the people and external systems named at each level.

    Returns:
        (True, message) when both levels name the same external elements, otherwise
        (False, reason) listing the elements added to and/or missing from the container level.
    """
    external_types = ["person", "externalSystem"]
    context_externals = extract_element_names(context_data, external_types)
    container_externals = extract_element_names(container_data, external_types)

    # Sorted so the reported reason is identical across runs (set order is not stable);
    # key=str keeps the sort safe if a name is not a string.
    # Check 1: Elements illegally added to the container diagram
    added_elements = sorted(container_externals - context_externals, key=str)
    # Check 2: Elements incorrectly missing from the container diagram
    missing_elements = sorted(context_externals - container_externals, key=str)

    errors = []
    if added_elements:
        errors.append(f"Illegally added elements not found in Context: {added_elements}")
    if missing_elements:
        errors.append(f"Elements from Context missing from diagram: {missing_elements}")

    if errors:
        return (False, " ".join(errors))
    return (True, "External elements are consistent with the Context level.")

def _check_container_to_components(container_data: Dict, component_data_map: Dict) -> Dict:
    """
    Checks each component diagram for consistency with the container level.

    Args:
        container_data: Parsed container-level definition.
        component_data_map: Mapping of component-diagram name to its parsed definition;
            empty definitions are skipped.

    Returns:
        A dict keyed "Container->Component (<name>)" whose values carry a "status"
        ("Pass"/"Fail") and a "reason".
    """
    # Element kinds that exist at the container level. The component diagram's own
    # "component" elements are deliberately excluded: they are introduced at the
    # component level, so comparing them against the container scope would fail
    # every diagram that declares any component.
    container_level_types = ["container", "person", "externalSystem", "database"]
    container_known_elements = extract_element_names(container_data, container_level_types)

    check_results = {}
    for comp_name, comp_data in component_data_map.items():
        if not comp_data:
            continue

        # A component diagram can reference other containers, people, or external systems.
        comp_referenced_externals = extract_element_names(comp_data, container_level_types)

        # Anything referenced here must already be known at the container level.
        # Sorted so the reported reason is identical across runs.
        mismatched_elements = sorted(comp_referenced_externals - container_known_elements, key=str)

        key = f"Container->Component ({comp_name})"
        if mismatched_elements:
            check_results[key] = {
                "status": "Fail",
                "reason": f"References elements not found in Container scope: {mismatched_elements}"
            }
        else:
            check_results[key] = {"status": "Pass", "reason": "All references are consistent with the Container level."}
    return check_results

# --- Main Evaluation Function (Now much cleaner) ---

def evaluate_cross_level_consistency(c4_model: Dict) -> Dict[str, Any]:
    """
    Measures two-way consistency of elements across C4 levels.

    Args:
        c4_model: Generated artifacts keyed by "context", "containers" and "components";
            each level (and each component entry) may hold a "yaml_definition" string.
            Levels or component entries that are absent or None are treated as empty.

    Returns:
        A dict with "metric", "score" (percentage of passed checks, 100 when no check
        could be run), "passed", "total" and per-check "details".
    """
    print("🤖 Evaluating Metric: Cross-Level Consistency Check...")

    # 1. Parse all relevant YAML definitions
    # `or {}` covers levels that are present in the model but set to None.
    context_section = c4_model.get("context") or {}
    container_section = c4_model.get("containers") or {}
    component_sections = c4_model.get("components") or {}

    context_data = parse_yaml_safe(context_section.get("yaml_definition"))
    container_data = parse_yaml_safe(container_section.get("yaml_definition"))
    component_data_map = {
        name: parse_yaml_safe((data or {}).get("yaml_definition", ""))
        for name, data in component_sections.items()
    }

    all_details = {}

    # 2. Run Context -> Container Check
    if context_data and container_data:
        is_consistent, reason = _check_context_to_container(context_data, container_data)
        all_details["Context->Container"] = {"status": "Pass" if is_consistent else "Fail", "reason": reason}

    # 3. Run Container -> Component Checks
    if container_data and component_data_map:
        component_results = _check_container_to_components(container_data, component_data_map)
        all_details.update(component_results)

    # 4. Calculate Final Score
    total_checks = len(all_details)
    passed_checks = sum(1 for result in all_details.values() if result["status"] == "Pass")
    score = (passed_checks / total_checks) * 100 if total_checks > 0 else 100

    return {
        "metric": "Cross-Level Consistency",
        "score": round(score, 2),
        "passed": passed_checks,
        "total": total_checks,
        "details": all_details
    }

# --- Example Usage ---
# Assuming 'final_c4_model' is the dictionary containing all generated artifacts

# consistency_report = evaluate_cross_level_consistency(final_c4_model)
# import json
# print(json.dumps(consistency_report, indent=2))

#### Emergent naming consistency

In [34]:
import re
import yaml
from collections import Counter
from typing import Dict, List, Any, Set

# --- Helper Function (reused) ---

def parse_yaml_safe(yaml_string: str) -> Dict:
    """
    Safely parses a YAML string into a mapping.

    Returns an empty dict when the input is empty, when it is not valid YAML, or when
    the document root is not a mapping (e.g. a list or a bare scalar). Callers use
    `.get(...)` on the result, so a non-dict root must never be passed through.

    NOTE(review): this helper is also defined in the cross-level consistency cell;
    the later definition wins once both cells have run, so keep the two in sync.
    """
    if not yaml_string:
        return {}
    try:
        parsed = yaml.safe_load(yaml_string)
    except yaml.YAMLError:
        return {}
    return parsed if isinstance(parsed, dict) else {}

# --- Main Evaluation Function (New Logic) ---

def evaluate_emergent_naming_consistency(c4_model: Dict) -> Dict[str, Any]:
    """
    Measures the internal consistency of naming conventions by first detecting
    the dominant convention and then identifying outliers.

    The system name (context level), every "container" element (container level) and
    every "component" element (component level) are classified as PascalCase,
    camelCase, snake_case, kebab-case, "other" or "missing". The most frequent
    recognised convention is the dominant one; the score is the percentage of
    elements that follow it.

    Args:
        c4_model: Generated artifacts keyed by "context", "containers" and "components".
            Levels or component entries that are absent or None contribute no elements.

    Returns:
        A dict with "metric", "score", "dominantConvention", "allConventionCounts" and
        "details" (a list of outliers, or a message when there are none). When nothing
        can be evaluated, only "metric", "score" (100.0) and "details" are returned.
    """
    print("🤖 Evaluating Metric 6 (New): Emergent Naming Consistency...")

    # 1. Define known patterns to classify names.
    # Order matters: a single lowercase word matches several patterns and is
    # reported as the first one listed here (camelCase).
    PATTERNS = {
        "PascalCase": r'^(?:[A-Z][a-z0-9]+)+$',
        "camelCase": r'^[a-z]+(?:[A-Z][a-z0-9]+)*$',
        "snake_case": r'^[a-z0-9]+(?:_[a-z0-9]+)*$',
        "kebab-case": r'^[a-z0-9]+(?:-[a-z0-9]+)*$'
    }

    def classify_name(name) -> str:
        """Classifies a name into a known convention, 'other', or 'missing'."""
        if not name:
            return "missing"
        if not isinstance(name, str):
            # e.g. a YAML number used as a name; it cannot follow a textual convention.
            return "other"
        for convention, pattern in PATTERNS.items():
            if re.match(pattern, name):
                return convention
        return "other"

    def typed_elements(parsed: Any, wanted_type: str) -> List[Dict]:
        """Returns the mapping entries of parsed['elements'] whose type is wanted_type."""
        if not isinstance(parsed, dict):
            return []
        return [
            entry for entry in parsed.get("elements") or []
            if isinstance(entry, dict) and entry.get("type") == wanted_type
        ]

    # 2. Collect all relevant elements to check
    elements_to_check = []

    context_data = parse_yaml_safe((c4_model.get("context") or {}).get("yaml_definition"))
    system_entry = context_data.get("system") if isinstance(context_data, dict) else None
    if system_entry:
        # NOTE(review): a scalar `system:` value is taken to be the system name itself.
        system_name = system_entry.get("name", "") if isinstance(system_entry, dict) else system_entry
        elements_to_check.append({"type": "system", "name": system_name})

    container_data = parse_yaml_safe((c4_model.get("containers") or {}).get("yaml_definition"))
    for element in typed_elements(container_data, "container"):
        elements_to_check.append({"type": "container", "name": element.get("name", "")})

    for comp_name, comp_entry in (c4_model.get("components") or {}).items():
        comp_data = parse_yaml_safe((comp_entry or {}).get("yaml_definition"))
        for element in typed_elements(comp_data, "component"):
            elements_to_check.append({"type": f"component (in {comp_name})", "name": element.get("name", "")})

    # 3. Handle case with no elements
    total_elements = len(elements_to_check)
    if total_elements == 0:
        return {"metric": "Emergent Naming Consistency", "score": 100.0, "details": "No elements found to evaluate."}

    # 4. Classify all names and find the dominant convention
    classifications = [classify_name(elem.get("name")) for elem in elements_to_check]
    convention_counts = Counter(classifications)

    # Prefer the most common recognised convention; fall back to 'other'/'missing'
    # only when no name matches any known pattern.
    recognised = [label for label, _ in convention_counts.most_common() if label not in ("other", "missing")]
    if recognised:
        dominant_convention = recognised[0]
    else:
        dominant_convention = convention_counts.most_common(1)[0][0]

    # 5. Identify outliers and calculate consistency score
    outliers = []
    consistent_elements = 0
    for element, detected in zip(elements_to_check, classifications):
        if detected == dominant_convention:
            consistent_elements += 1
        else:
            outliers.append({
                # `or` rather than a .get default: the key always exists but may be empty/None.
                "name": element.get("name") or "[MISSING NAME]",
                "type": element["type"],
                "detected_convention": detected,
                "reason": f"Deviates from the dominant convention: '{dominant_convention}'"
            })

    # 6. Calculate Final Score
    score = (consistent_elements / total_elements) * 100

    return {
        "metric": "Emergent Naming Consistency",
        "score": round(score, 2),
        "dominantConvention": {
            "name": dominant_convention,
            "count": convention_counts.get(dominant_convention, 0),
            "total": total_elements
        },
        "allConventionCounts": dict(convention_counts),
        "details": {"outliers": outliers} if outliers else "All names are internally consistent."
    }

#### Architect critique

In [36]:
import json
from typing import Dict, Any, List


def evaluate_architect_critique(system_brief: str, c4_model: Dict, judge_llm, max_components: int = 2) -> Dict[str, Any]:
    """
    Generates a structured, qualitative critique of the entire C4 model from the
    perspective of a Principal Software Architect.

    Args:
        system_brief: The original system description.
        c4_model: Generated artifacts keyed by "context", "containers" and "components";
            each level may hold "yaml_definition" and "diagram" strings. Artifacts that
            are absent, None or blank are shown to the judge as "Not available".
        judge_llm: Chat model exposing `with_structured_output(schema)`.
        max_components: How many component-level entries to include in the prompt
            (default 2, to keep the prompt short).

    Returns:
        {"metric", "critique"} on success, where "critique" is the judge's structured
        response; {"error"} if the judge call fails.
    """
    print("⚖️ Evaluating Metric 7: Principal Architect's Critique...")

    # --- 1. Define the JSON Schema for a structured, comparable output ---
    architect_critique_schema = {
        "title": "PrincipalArchitectCritique",
        "description": "A senior-level review of a software architecture, providing both quantitative ratings and qualitative, narrative feedback.",
        "type": "object",
        "properties": {
            "executiveSummary": {
                "type": "string",
                "description": "A high-level, one-paragraph summary of the architect's overall opinion of the design."
            },
            "feasibilityAndSoundness": {
                "type": "object",
                "description": "Evaluation of the architecture's technical viability and alignment with non-functional requirements.",
                "properties": {
                    "rating": {"type": "integer", "description": "Overall rating for this category from 1 (poor) to 5 (excellent)."},
                    "critique": {"type": "string", "description": "Detailed narrative explaining the rating, covering technology choices, scalability, and performance."},
                    "identifiedRisks": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "A specific list of the top 2-3 architectural risks or potential bottlenecks."
                    }
                },
                "required": ["rating", "critique", "identifiedRisks"]
            },
            "clarityAndCommunication": {
                "type": "object",
                "description": "Evaluation of how well the diagrams communicate the architecture to different audiences.",
                "properties": {
                    "rating": {"type": "integer", "description": "Overall rating for this category from 1 (poor) to 5 (excellent)."},
                    "critique": {"type": "string", "description": "Detailed narrative explaining the rating, commenting on the clarity of each diagram for its intended audience."}
                },
                "required": ["rating", "critique"]
            },
            "actionableRecommendation": {
                "type": "object",
                "description": "The single most important recommendation for improving the design.",
                "properties": {
                    "recommendation": {"type": "string", "description": "A clear, concise statement of the primary recommendation."},
                    "justification": {"type": "string", "description": "The reasoning behind why this recommendation is critical."},
                    "priority": {
                        "type": "string",
                        "enum": ["Critical", "High", "Medium"],
                        "description": "The priority level of this recommendation."
                    }
                },
                "required": ["recommendation", "justification", "priority"]
            }
        },
        "required": ["executiveSummary", "feasibilityAndSoundness", "clarityAndCommunication", "actionableRecommendation"]
    }

    # --- 2. Aggregate ALL relevant input documents for the judge ---
    def artifact_text(section: Any, key: str) -> str:
        """Returns the artifact as text, or a placeholder when it is absent, None or blank."""
        value = (section or {}).get(key)
        return value if isinstance(value, str) and value.strip() else "Not available"

    context_section = c4_model.get("context")
    container_section = c4_model.get("containers")
    component_sections = c4_model.get("components") or {}

    architecture_documents = [
        # Context Level
        "## Context Level Definition (YAML)\n```yaml\n" + artifact_text(context_section, "yaml_definition") + "\n```",
        "\n## Context Level Diagram (PlantUML)\n```puml\n" + artifact_text(context_section, "diagram") + "\n```",
        # Container Level
        "\n\n## Container Level Definition (YAML)\n```yaml\n" + artifact_text(container_section, "yaml_definition") + "\n```",
        "\n## Container Level Diagram (PlantUML)\n```puml\n" + artifact_text(container_section, "diagram") + "\n```",
    ]

    # Component Level (only the first `max_components` entries, for brevity)
    for name, data in list(component_sections.items())[:max_components]:
        architecture_documents.append(f"\n\n## Component Level: {name} (YAML)\n```yaml\n" + artifact_text(data, "yaml_definition") + "\n```")
        architecture_documents.append(f"\n## Component Level: {name} (PlantUML)\n```puml\n" + artifact_text(data, "diagram") + "\n```")

    full_context = "\n".join(architecture_documents)

    # --- 3. Build the prompt ---
    # The documents are passed as a template variable, so braces inside the YAML/PlantUML
    # are not interpreted as prompt placeholders.
    critique_prompt = ChatPromptTemplate.from_messages([
        # NOTE(review): the previous system message ended with the editing placeholder
        # "... (persona is unchanged)"; the persona below is a reconstruction - confirm wording.
        ("system", """You are a pragmatic Principal Software Architect with extensive experience reviewing system designs. Evaluate the proposed architecture critically but constructively, focusing on technical feasibility, architectural risk, and how clearly the design is communicated. You must format your response as a JSON object that adheres to the provided schema."""),
        ("human", """Please review the following architecture.

        **Guiding Questions for Your Analysis:**
        1.  **Feasibility & Soundness:** Based on the brief and the YAML definitions, are the technology choices realistic? Does the decomposition make sense for scalability and performance? Identify the biggest architectural risk.
        2.  **Clarity & Communication:** Does this set of diagrams AND definitions effectively communicate the architecture? Is there a clear link between the definitions and the diagrams?
        3.  **Actionable Recommendation:** What is the single most important change you would recommend to this design and why?

        **System Design Brief:**
        ```
        {brief}
        ```

        **Generated C4 Architecture (Definitions & Diagrams):**
        {architecture_docs}

        Provide your structured JSON response now.
        """)
    ])

    structured_judge_llm = judge_llm.with_structured_output(architect_critique_schema)
    critique_chain = critique_prompt | structured_judge_llm

    # --- 4. Invoke the chain with the aggregated context ---
    try:
        critique = critique_chain.invoke({
            "brief": system_brief,
            "architecture_docs": full_context
        })
        return {
            "metric": "Principal Architect's Critique",
            "critique": critique
        }
    except Exception as e:
        # Best-effort metric: surface the failure in the report instead of aborting the run.
        return {"error": f"Failed to get architect's critique: {e}"}

#### Security assessment

In [37]:
import json
from typing import Dict, Any, List


def evaluate_security_assessment(system_brief: str, c4_model: Dict, judge_llm) -> Dict[str, Any]:
    """
    Performs a threat modeling assessment on the container diagram from the
    perspective of a cybersecurity expert.

    Args:
        system_brief: The original system description.
        c4_model: Generated artifacts; only c4_model["containers"]["diagram"] is used.
        judge_llm: Chat model exposing `with_structured_output(schema)`.

    Returns:
        {"metric", "assessment"} on success. "assessment" is the judge's structured
        threat model with an added "overallRiskScore": the sum of per-vulnerability
        weights (Critical 10, High 5, Medium 2, Low 1; unknown severities 0), lower is better.
        {"error"} when the container diagram is missing or the judge call fails.
    """
    print("🛡️  Evaluating Metric 8: Security 'Red Team' Assessment...")

    # `or {}` covers a "containers" level that is present in the model but set to None.
    container_diag = (c4_model.get("containers") or {}).get("diagram")
    if not container_diag:
        return {"error": "Container diagram not found, cannot perform security assessment."}

    # --- 1. Define the JSON Schema for a structured vulnerability report ---
    security_assessment_schema = {
        "title": "SecurityThreatModel",
        "description": "A threat model report identifying potential security vulnerabilities in a software architecture.",
        "type": "object",
        "properties": {
            "executiveSummary": {
                "type": "string",
                "description": "A high-level summary of the overall security posture of the architecture."
            },
            "vulnerabilities": {
                "type": "array",
                "description": "A list of identified potential vulnerabilities.",
                "items": {
                    "type": "object",
                    "properties": {
                        "description": {
                            "type": "string",
                            "description": "A clear, concise description of the potential vulnerability."
                        },
                        "category": {
                            "type": "string",
                            "description": "The type of vulnerability.",
                            "enum": ["Information Disclosure", "Insecure Data Flow", "Authentication Bypass", "Elevation of Privilege", "Denial of Service", "Missing Security Control"]
                        },
                        "severity": {
                            "type": "string",
                            "description": "The estimated severity of the vulnerability.",
                            "enum": ["Critical", "High", "Medium", "Low"]
                        },
                        "recommendation": {
                            "type": "string",
                            "description": "An actionable recommendation to mitigate the risk."
                        }
                    },
                    "required": ["description", "category", "severity", "recommendation"]
                }
            }
        },
        "required": ["executiveSummary", "vulnerabilities"]
    }

    # --- 2. Create the prompt and the structured LLM chain ---
    # Guiding question 2 is worded generically ("sensitive user data") so the prompt
    # fits any system brief rather than one specific domain.
    security_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a cybersecurity expert specializing in threat modeling and architectural security reviews. Your call sign is 'Red Specter'. Your job is to think like an attacker and identify potential weaknesses in the proposed design.

Your analysis should be based on the provided system brief and C4 Container diagram. You must format your entire response as a single JSON object that strictly adheres to the provided schema. Do not add any text outside the JSON object."""),
        ("human", """Please perform a security review of the following architecture.

        **Guiding Questions for Your Analysis:**
        1.  **Attack Surface Analysis:** Based on the diagram, what are the primary entry points for an external attacker? Which containers are most exposed?
        2.  **Data Flow Risks:** Where is sensitive user data likely to be stored or processed? Are there any risky relationships shown, such as a public-facing container having direct access to the main database?
        3.  **Missing Controls:** What critical security components or considerations (e.g., an API Gateway, a dedicated authentication service, firewalls, rate limiting) appear to be missing from this architecture?

        **System Design Brief:**
        ```yaml
        {brief}
        ```

        **C4 Container Diagram:**
        ```puml
        {diagram}
        ```

        Provide your structured JSON threat model now.
        """)
    ])

    structured_judge_llm = judge_llm.with_structured_output(security_assessment_schema)
    security_chain = security_prompt | structured_judge_llm

    # --- 3. Invoke the chain and process results ---
    try:
        assessment = security_chain.invoke({
            "brief": system_brief,
            "diagram": container_diag
        })

        # --- 4. Post-process to calculate a comparable risk score ---
        # Lower score is better. A null/absent list or a malformed entry contributes 0
        # instead of discarding the whole assessment.
        risk_weights = {"Critical": 10, "High": 5, "Medium": 2, "Low": 1}
        total_risk_score = sum(
            risk_weights.get(vuln.get("severity"), 0)
            for vuln in assessment.get("vulnerabilities") or []
            if isinstance(vuln, dict)
        )

        assessment['overallRiskScore'] = total_risk_score

        return {
            "metric": "Security 'Red Team' Assessment",
            "assessment": assessment
        }
    except Exception as e:
        # Best-effort metric: surface the failure in the report instead of aborting the run.
        return {"error": f"Failed to get security assessment: {e}"}

# --- Example Usage ---
# Assuming 'final_c4_model', 'system_brief_lms' and a 'judge_llm' exist from previous steps

# security_report = evaluate_security_assessment(system_brief_lms, final_c4_model, judge_llm)
# import json
# print(json.dumps(security_report, indent=2))

#### C4 completeness

In [38]:
from typing import Dict, Any, Optional
import yaml # Assuming usage from previous context

def check_c4_completeness(c4_model: Dict[str, Any]) -> Dict[str, Any]:
    """
    Checks for missing or empty C4 artifacts, respecting the sequential
    dependency between C4 levels.

    Expectations:
      * Context: analysis, YAML definition and diagram are always expected.
      * Containers: expected only when the context YAML definition is present.
      * Components: for each entry, the analysis is expected; its YAML definition and
        diagram are expected only when that analysis is present.

    A level or component entry that is absent or None is treated as having no
    artifacts, so it is reported as "Missing" rather than raising.

    Returns:
        A dict with "metric", "score" (percentage of expected artifacts present),
        "missing_count", "total_expected_artifacts" and per-level "details".
    """
    print("🤖 Evaluating Metric: C4 Model Completeness...")

    artifacts = ["analysis", "yaml_definition", "diagram"]

    def is_missing(value: Optional[str]) -> bool:
        """Returns True if value is None, empty, or a whitespace-only string."""
        if isinstance(value, str):
            return not value.strip()
        # Non-string values (None, or an unexpected container) count as present only if truthy.
        return not value

    def as_mapping(section: Any) -> Dict[str, Any]:
        """Normalises an absent/None/non-mapping section to an empty dict."""
        return section if isinstance(section, dict) else {}

    def level_status(level_data: Dict[str, Any]) -> Dict[str, str]:
        """Marks each of the three artifacts of a level as Present or Missing."""
        return {key: "Missing" if is_missing(level_data.get(key)) else "Present" for key in artifacts}

    results = {}
    missing_count = 0
    total_expected = 0

    # --- 1. CONTEXT LEVEL CHECK ---
    # The context level is always expected.
    context_data = as_mapping(c4_model.get("context"))
    context_status = level_status(context_data)
    total_expected += len(artifacts)
    missing_count += list(context_status.values()).count("Missing")
    results["Context"] = context_status

    # --- 2. CONTAINER LEVEL CHECK (Conditional) ---
    # Container artifacts are only expected if the context was generated successfully;
    # the context's YAML definition is the indicator of success.
    if not is_missing(context_data.get("yaml_definition")):
        container_status = level_status(as_mapping(c4_model.get("containers")))
        total_expected += len(artifacts)
        missing_count += list(container_status.values()).count("Missing")
        results["Containers"] = container_status
    else:
        # If context failed, we explicitly state that containers were not expected.
        results["Containers"] = {key: "Not Expected" for key in artifacts}

    # --- 3. COMPONENT LEVEL CHECK ---
    results["Components"] = {}
    for comp_name, raw_comp_data in as_mapping(c4_model.get("components")).items():
        comp_data = as_mapping(raw_comp_data)
        comp_status = {}
        total_expected += 1 # Analysis is always expected if the component key exists.
        if is_missing(comp_data.get("analysis")):
            comp_status["analysis"] = "Missing"
            missing_count += 1
            comp_status["yaml_definition"] = "Not Expected"
            comp_status["diagram"] = "Not Expected"
        else:
            comp_status["analysis"] = "Present"
            total_expected += 2 # YAML and diagram are now expected.
            for key in ["yaml_definition", "diagram"]:
                if is_missing(comp_data.get(key)):
                    comp_status[key] = "Missing"
                    missing_count += 1
                else:
                    comp_status[key] = "Present"
        results["Components"][comp_name] = comp_status

    # --- 4. Final Score Calculation ---
    score = 100.0
    if total_expected > 0:
        score = ((total_expected - missing_count) / total_expected) * 100

    return {
        "metric": "C4 Model Completeness",
        "score": round(score, 2),
        "missing_count": missing_count,
        "total_expected_artifacts": total_expected,
        "details": results
    }

## Run evaluators

### Run full evaluation

In [39]:
import json
from datetime import datetime
from typing import Dict, Any

# Assume all other functions (get_llm, evaluate_*, check_c4_completeness) are defined
# and that ModelName is a defined type.

def run_full_evaluation(
    system_brief: str,
    c4_model: Dict,
    judge_model_name: Any, # Using Any to be compatible with ModelName type
    temperature: float = 0.0
) -> Dict[str, Any]:
    """
    Runs a structured, level-aware evaluation of a C4 model, providing the
    correct context and source of truth to each metric.

    Args:
        system_brief: The textual system brief the C4 model was generated from.
        c4_model: The generated model. The keys read here are "context",
            "containers" and "components"; a key that is present but holds
            ``None`` is treated as an empty section.
        judge_model_name: Identifier passed to ``get_llm`` to build the judge.
        temperature: Sampling temperature for the judge model.

    Returns:
        A report dictionary with 'evaluationMetadata', the holistic metrics
        ('compilationSuccess', 'abstractionAdherence', 'missingInformation',
        'emergentNamingConsistency'), one entry per level whose key exists in
        ``c4_model`` ('contextEvaluation', 'containerEvaluation',
        'componentEvaluations'), and the holistic critiques
        ('architectCritique', 'securityAssessment').
    """
    banner = "=" * 50
    print("\n" + banner)
    print(f"🏁 STARTING FULL C4 MODEL EVALUATION (Judge: {judge_model_name}) 🏁")
    print(banner + "\n")

    judge_llm = get_llm(model_name=judge_model_name, temperature=temperature)
    report = {
        'evaluationMetadata': {
            "judgeModel": judge_model_name,
            "judgeModelTemperature": temperature,
            "evaluationTimestamp": datetime.now().isoformat()
        }
    }

    # --- Layer 1: Holistic Structural & Completeness Metrics ---
    # These metrics inspect the entire c4_model dictionary at once.
    print("--- Running Holistic Structural Checks ---")
    report['compilationSuccess'] = evaluate_compilation_success(c4_model)
    report['abstractionAdherence'] = evaluate_abstraction_adherence(c4_model)
    report['missingInformation'] = check_c4_completeness(c4_model)
    # Naming consistency can also be considered a holistic check
    report['emergentNamingConsistency'] = evaluate_emergent_naming_consistency(c4_model)

    # --- Layer 2: Level-Specific Semantic & Qualitative Evaluations ---
    print("\n--- Running Level-Specific Evaluations ---")

    # --- CONTEXT LEVEL EVALUATION ---
    if "context" in c4_model:
        print("  - Evaluating Context Level...")
        context_eval = {}
        # `or {}`: a level that failed to generate may be stored as None.
        context_level = c4_model["context"] or {}
        context_diag = context_level.get("diagram")
        if context_diag:
            # The brief IS the source of truth for the context diagram.
            context_eval['semanticConsistency'] = evaluate_semantic_consistency(system_brief, c4_model, judge_llm)
            # The qualitative rubric gets the brief it needs to judge completeness.
            context_eval['qualitativeRubric'] = evaluate_qualitative_rubric(context_diag, "Context Diagram", system_brief, judge_llm)
        report['contextEvaluation'] = context_eval

    # --- CONTAINER LEVEL EVALUATION ---
    if "containers" in c4_model:
        print("  - Evaluating Container Level...")
        container_eval = {}
        container_level = c4_model["containers"] or {}
        container_diag = container_level.get("diagram")
        container_yaml = container_level.get("yaml_definition")
        if container_diag and container_yaml:
            # The container YAML is the source of truth for the container diagram.
            container_eval['definitionalConsistency'] = evaluate_definitional_consistency(container_yaml, container_diag, "containers")
            # The qualitative rubric still needs the high-level brief for context.
            container_eval['qualitativeRubric'] = evaluate_qualitative_rubric(container_diag, "Container Diagram", system_brief, judge_llm)
        report['containerEvaluation'] = container_eval

    # --- COMPONENT LEVEL EVALUATION ---
    if "components" in c4_model:
        print("  - Evaluating Component Level(s)...")
        component_evals = {}
        for comp_name, comp_data in (c4_model["components"] or {}).items():
            comp_data = comp_data or {}
            comp_diag = comp_data.get("diagram")
            comp_yaml = comp_data.get("yaml_definition")
            # Components lacking either artifact are skipped here; the
            # completeness metric above already reports them as missing.
            if comp_diag and comp_yaml:
                # The component YAML is the source of truth for the component diagram.
                consistency_result = evaluate_definitional_consistency(comp_yaml, comp_diag, "components")
                # For component rubric, the system brief is still the best high-level context we have.
                rubric_result = evaluate_qualitative_rubric(comp_diag, f"Component: {comp_name}", system_brief, judge_llm)
                component_evals[comp_name] = {
                    'definitionalConsistency': consistency_result,
                    'qualitativeRubric': rubric_result
                }
        report['componentEvaluations'] = component_evals

    # --- Layer 3: Holistic Expert Critiques ---
    # These high-level critiques look at the model as a whole, so they run last.
    print("\n--- Running Holistic Expert Critiques ---")
    report['architectCritique'] = evaluate_architect_critique(system_brief, c4_model, judge_llm)
    report['securityAssessment'] = evaluate_security_assessment(system_brief, c4_model, judge_llm)

    print("\n\n" + banner)
    print(f"📋 FINAL EVALUATION REPORT (Judge: {judge_model_name}) 📋")
    print(banner + "\n")
    # default=str: this dump is only a console preview, so a value that is not
    # JSON-serializable must not discard the results of the judge calls above.
    print(json.dumps(report, indent=2, default=str))

    return report

### Save evaluation report

In [40]:
import os
import json
from typing import Dict, Any, List

# Assume the following are defined in your broader script:
# - run_full_evaluation (your existing evaluation function)
# - experiment_results (the list containing results from each run)

def save_all_evaluation_reports(
    all_reports: Dict[str, Dict[str, Any]],
    output_filename: str = "all_evaluation_summary.json",
    output_dir: str = "c4_artifacts/evaluation_summaries"
) -> None:
    """
    Write every evaluation report into one consolidated JSON file.

    The target directory is created when it does not exist. A failure while
    opening or writing the file is reported on stdout instead of being raised.

    Args:
        all_reports (Dict[str, Dict[str, Any]]): Mapping of thread_id to the
                                                  full evaluation report.
        output_filename (str): File name of the consolidated report.
        output_dir (str): Directory that receives the consolidated report.
    """
    os.makedirs(output_dir, exist_ok=True)
    target_path = os.path.join(output_dir, output_filename)

    try:
        with open(target_path, 'w', encoding='utf-8') as handle:
            json.dump(all_reports, handle, indent=2)
        print(f"\n--- ✅ Consolidated evaluation reports saved to: {target_path} ---")
    except Exception as exc:
        print(f"\n--- ❌ Failed to save consolidated evaluation reports to {target_path}: {exc} ---")


### Format evaluation reports

In [41]:
import json
from datetime import datetime
from typing import Dict, Any

# ==============================================================================
# 1. DEDICATED FORMATTING HELPER FUNCTIONS
# Each function is small, self-contained, and easy to understand/modify.
# ==============================================================================

def _format_default(data: Dict[str, Any]) -> str:
    """Default formatter for any metric that doesn't have a custom one."""
    title = data.get('metric', 'Metric Details')
    return (
        f"### {title}\n\n"
        "**Details:**\n"
        "```json\n"
        f"{json.dumps(data.get('details', data), indent=2)}\n"
        "```\n\n"
    )

def _format_compilation_success(data: Dict[str, Any]) -> str:
    """Formats the 'Compilation Success' metric."""
    score = data.get('score', 0)
    status_text = "Excellent" if score >= 80 else ("Good" if score >= 50 else "Needs Improvement")
    successful = data.get('successful', 0)
    total = data.get('total', 0)
    return (
        f"### {data.get('metric', 'Compilation Success')}\n\n"
        f"**Overall Score:** {score:.2f}% ({status_text})\n"
        f"- **Successful:** {successful} / **Total:** {total}\n\n"
    )

def _format_abstraction_adherence(data: Dict[str, Any]) -> str:
    """Formats the 'Abstraction Adherence' metric."""
    details = data.get('details', {})
    lines = [f"### {data.get('metric', 'Abstraction Adherence')}\n\n"]
    if not details:
        lines.append("No details provided.\n\n")
    for diag, status in details.items():
        lines.append(f"- {diag}: **{status}**\n")
    return "".join(lines) + "\n"

def _format_cross_level_consistency(data: Dict[str, Any]) -> str:
    """Formats the 'Cross-Level Consistency' metric."""
    details = data.get('details', {})
    lines = [f"### {data.get('metric', 'Cross-Level Consistency')}\n\n"]
    lines.append(f"- **Passed Checks:** {data.get('passed', 0)} / **Total Checks:** {data.get('total', 0)}\n\n")
    if isinstance(details, dict):
        for check, result in details.items():
            status = result.get('status', 'Unknown')
            reason = result.get('reason', '')
            lines.append(f"- {check}: **{status}**{f' - *{reason}*' if reason else ''}\n")
    return "".join(lines) + "\n"

def _format_security_assessment(data: Dict[str, Any]) -> str:
    """Formats the 'Security Red Team Assessment' metric."""
    assessment = data.get('assessment', {})
    lines = [f"### {data.get('metric', 'Security Assessment')}\n\n"]
    lines.append(f"**Executive Summary:** {assessment.get('executiveSummary', 'N/A')}\n")
    lines.append(f"**Overall Risk Score (Lower is better):** {assessment.get('overallRiskScore', 'N/A')}\n\n")

    vulnerabilities = assessment.get('vulnerabilities', [])
    if vulnerabilities:
        lines.append("**Identified Vulnerabilities:**\n\n")
        lines.append("| Description | Category | Severity | Recommendation |\n")
        lines.append("|---|---|---|---|\n")
        for vuln in vulnerabilities:
            lines.append(f"| {vuln.get('description', 'N/A')} | {vuln.get('category', 'N/A')} | **{vuln.get('severity', 'Low')}** | {vuln.get('recommendation', 'N/A')} |\n")
    else:
        lines.append("No specific vulnerabilities were identified.\n")
    return "".join(lines) + "\n"

def _format_architect_critique(data: Dict[str, Any]) -> str:
    """Formats the 'Principal Architect's Critique'."""
    critique = data.get('critique', {})
    lines = [f"### {data.get('metric', 'Architect’s Critique')}\n\n"]
    lines.append(f"**Executive Summary:** {critique.get('executiveSummary', 'N/A')}\n\n")
    # Feasibility and Soundness
    fs = critique.get('feasibilityAndSoundness', {})
    lines.append(f"**Feasibility & Soundness ({fs.get('rating', '-')}/5):** {fs.get('critique', 'N/A')}\n")
    if fs.get('identifiedRisks'):
        lines.append("**Identified Risks:**\n" + "".join(f"- {risk}\n" for risk in fs['identifiedRisks']))
    # Clarity and Communication
    cc = critique.get('clarityAndCommunication', {})
    lines.append(f"\n**Clarity & Communication ({cc.get('rating', '-')}/5):** {cc.get('critique', 'N/A')}\n\n")
    # Recommendation
    ar = critique.get('actionableRecommendation', {})
    lines.append(f"**Actionable Recommendation (Priority: {ar.get('priority', 'N/A')}):**\n")
    lines.append(f"- **Recommendation:** {ar.get('recommendation', 'N/A')}\n")
    lines.append(f"- **Justification:** {ar.get('justification', 'N/A')}\n")
    return "".join(lines)

# ==============================================================================
# 2. MAIN FORMATTING FUNCTION (Now much cleaner)
# ==============================================================================

def format_evaluation_report(report: Dict[str, Any]) -> str:
    """
    Formats the full evaluation report dictionary into a human-readable Markdown string
    using a dispatcher pattern for maintainability.

    Rendered sections, in order:
      1. Header: evaluation metadata and brief name, when present.
      2. Top-level metrics, in a fixed order.
      3. Level-specific evaluations stored under 'contextEvaluation',
         'containerEvaluation' and 'componentEvaluations', whose metrics are
         nested one level (two for components) below the report root.
      4. Holistic critiques ('architectCritique', 'securityAssessment').

    Keys absent from ``report`` are skipped.

    Args:
        report: The report produced by the evaluation run, optionally enriched
            with 'brief_name'.

    Returns:
        The Markdown document as a single string.
    """

    # --- The Dispatcher: Maps metric keys to their formatting function ---
    METRIC_FORMATTERS = {
        'compilationSuccess': _format_compilation_success,
        'abstractionAdherence': _format_abstraction_adherence,
        'crossLevelConsistency': _format_cross_level_consistency,
        'securityAssessment': _format_security_assessment,
        'architectCritique': _format_architect_critique,
        # Add other specific formatters here
    }

    def _render_metric(metric_key: str, metric_data: Any) -> str:
        """Format one metric and append the section separator."""
        if not isinstance(metric_data, dict):
            # The formatters expect a mapping; show anything else verbatim
            # instead of failing on `.get`.
            dumped = json.dumps(metric_data, indent=2, default=str)
            return f"### {metric_key}\n\n```json\n{dumped}\n```\n\n---\n\n"
        # Look up the correct formatter, or use the default one
        formatter = METRIC_FORMATTERS.get(metric_key, _format_default)
        return formatter(metric_data) + "---\n\n"

    parts = ["# 🏁 C4 Model Evaluation Report 🏁\n\n"]

    # --- Header Section ---
    if 'evaluationMetadata' in report:
        parts.append("## Evaluation Context\n\n")
        metadata = report['evaluationMetadata']
        if isinstance(metadata, dict):
            metadata_labels = (
                ('judgeModel', 'Judge Model'),
                ('judgeModelTemperature', 'Judge Temperature'),
                ('evaluationTimestamp', 'Evaluated At'),
            )
            bullets = [
                f"- **{label}:** {metadata[key]}\n"
                for key, label in metadata_labels
                if key in metadata
            ]
            if bullets:
                parts.extend(bullets)
                parts.append("\n")
    if 'brief_name' in report:
        parts.append(f"## System Brief: {report['brief_name']}\n\n")

    parts.append("---\n\n")

    # --- Top-level metrics (uses the dispatcher) ---
    # 'semanticConsistency' and 'qualitativeRubric' stay in this list so that
    # reports which store them at the root still render.
    structural_metrics = [
        'compilationSuccess', 'abstractionAdherence', 'crossLevelConsistency',
        'missingInformation', 'emergentNamingConsistency',
        'semanticConsistency', 'qualitativeRubric',
    ]
    for metric_key in structural_metrics:
        if metric_key in report:
            parts.append(_render_metric(metric_key, report[metric_key]))

    # --- Level-specific evaluations (metrics nested under a level key) ---
    level_sections = (
        ('contextEvaluation', 'Context Level Evaluation'),
        ('containerEvaluation', 'Container Level Evaluation'),
    )
    for section_key, section_title in level_sections:
        section = report.get(section_key)
        if isinstance(section, dict) and section:
            parts.append(f"## {section_title}\n\n")
            for metric_key, metric_data in section.items():
                parts.append(_render_metric(metric_key, metric_data))

    component_evals = report.get('componentEvaluations')
    if isinstance(component_evals, dict):
        for comp_name, comp_metrics in component_evals.items():
            if not isinstance(comp_metrics, dict) or not comp_metrics:
                continue
            parts.append(f"## Component Evaluation: {comp_name}\n\n")
            for metric_key, metric_data in comp_metrics.items():
                parts.append(_render_metric(metric_key, metric_data))

    # --- Holistic critiques come last ---
    for metric_key in ('architectCritique', 'securityAssessment'):
        if metric_key in report:
            parts.append(_render_metric(metric_key, report[metric_key]))

    return "".join(parts)

### Run all evaluations

In [54]:
import os
import json

def run_all_evaluations(
    experiment_results: list,
    experiment_config: dict,
    save_c4_artifacts_func: callable,
    run_full_evaluation_func: callable,
    save_all_evaluation_reports_func: callable,
    format_evaluation_report_func: callable,
    judge_model_name: str = "gemini-1.5-pro-latest",
    output_root: str = "evaluation_results_openai",
) -> dict:
    """
    Processes each experiment result completely (saves artifacts, runs evaluation,
    saves reports) in a single, efficient loop.

    Output layout (all below ``output_root``):
        <output_root>/<experiment name>/<brief_name>_<thread_id>/c4_artifacts/
        <output_root>/<experiment name>/<brief_name>_<thread_id>/evaluation_reports/
            evaluation_report.json
            evaluation_report.md
        <output_root>/<experiment name>/   (consolidated report, written by
                                            ``save_all_evaluation_reports_func``)

    Args:
        experiment_results: One dict per run; reads 'brief_name', 'thread_id',
            'system_brief_content' and 'final_c4_model'.
        experiment_config: Experiment configuration; 'name' is used for the
            output directory and the whole dict is embedded in every report.
        save_c4_artifacts_func: Called as ``f(artifacts_dir, final_c4_model)``.
        run_full_evaluation_func: Called as
            ``f(system_brief, final_c4_model, judge_model_name)``; must return
            a mutable, JSON-serializable report dict.
        save_all_evaluation_reports_func: Called as
            ``f(all_reports, output_dir=...)``.
        format_evaluation_report_func: Called as ``f(report)``; must return the
            Markdown text.
        judge_model_name: Judge model identifier forwarded to the evaluation.
        output_root: Root directory for everything written by this function.
            The default keeps the previously hard-coded location.

    Returns:
        Mapping of thread_id to the enriched evaluation report.
    """
    all_evaluation_reports = {}
    experiment_name = experiment_config["name"]

    print("\n" + "#"*80)
    print(f"### ✨ PROCESSING ALL RESULTS FOR EXPERIMENT: {experiment_name} ✨ ###")
    print(f"### Judge Model: {judge_model_name}")
    print("#"*80 + "\n")

    # Process each result completely in a single loop.
    for result in experiment_results:
        # 1. EXTRACT METADATA
        brief_name = result['brief_name']
        thread_id = result['thread_id']
        system_brief = result['system_brief_content']
        final_c4_model = result['final_c4_model']

        print(f"\n{'='*80}")
        print(f"--- Processing Run: '{brief_name}' (Thread: {thread_id}) ---")
        print(f"{'='*80}\n")

        # 2. DEFINE STRUCTURED OUTPUT PATHS
        # Create a single, neat directory for this entire result
        result_base_dir = f"{output_root}/{experiment_name}/{brief_name}_{thread_id}"
        artifacts_output_dir = f"{result_base_dir}/c4_artifacts"
        reports_output_dir = f"{result_base_dir}/evaluation_reports"
        os.makedirs(artifacts_output_dir, exist_ok=True)
        os.makedirs(reports_output_dir, exist_ok=True)

        # 3. SAVE THE GENERATED C4 ARTIFACTS
        print(f"💾 Saving C4 model artifacts to: {artifacts_output_dir}")
        save_c4_artifacts_func(artifacts_output_dir, final_c4_model)
        print("--- ✅ C4 Artifacts saved. ---")

        # 4. RUN THE EVALUATION
        print(f"\n🔬 Running evaluation with judge model '{judge_model_name}'...")
        current_report = run_full_evaluation_func(system_brief, final_c4_model, judge_model_name)
        print("--- ✅ Evaluation complete. ---")

        # 5. ENRICH AND SAVE THE EVALUATION REPORT
        # Add metadata for context
        current_report['experiment_config'] = experiment_config
        current_report['brief_name'] = brief_name
        current_report['thread_id'] = thread_id
        all_evaluation_reports[thread_id] = current_report

        # Save the JSON report
        report_json_filepath = f"{reports_output_dir}/evaluation_report.json"
        print(f"\n💾 Saving evaluation reports to: {reports_output_dir}")
        with open(report_json_filepath, 'w', encoding='utf-8') as f:
            json.dump(current_report, f, indent=2)
        print(f"  - Saved evaluation_report.json")

        # Save the formatted Markdown report
        formatted_report_content = format_evaluation_report_func(current_report)
        with open(f"{reports_output_dir}/evaluation_report.md", 'w', encoding='utf-8') as f:
            f.write(formatted_report_content)
        print(f"  - Saved evaluation_report.md")
        print("--- ✅ Evaluation reports saved. ---")

    print("\n\n" + "#"*80)
    print(f"### 🎉 ALL RESULTS PROCESSED FOR: {experiment_name}! ###")
    print("#"*80 + "\n")

    # --- SAVE THE CONSOLIDATED REPORT FOR THIS ENTIRE EXPERIMENT ---
    consolidated_output_dir = f"{output_root}/{experiment_name}"
    save_all_evaluation_reports_func(
        all_evaluation_reports,
        output_dir=consolidated_output_dir
    )

    return all_evaluation_reports

# Experiments

### Briefs examples

In [None]:

# --- System briefs used as experiment inputs ---
# Maps a human-readable brief name to the brief itself, written as YAML text.
# Every brief has the same fields: title, description, domain, constraints,
# functional_requirements, nonfunctional_requirements and target_cloud.
# The experiment cells below pass this dict to `run_all_experiments` as
# `system_briefs_data`.
# NOTE(review): the brief text is handed over verbatim, so wording changes here
# presumably change what the models receive — confirm before editing a brief.
system_briefs = {
    "Library Management System (LMS)": """
title: Library Management System
description: >
  A solution for public and academic libraries to catalogue items,
  manage circulation, and provide self-service portals for patrons.
domain: Education / Library Services
constraints:
  - "EU-only data residency"
  - "Open-source tech stack preferred"
  - "Must support bilingual UI (EN/FR)"
functional_requirements:
  - id: R-01
    desc: "Catalogue physical & digital items with MARC metadata"
  - id: R-02
    desc: "Patron self-checkout & returns via kiosks or mobile app"
  - id: R-03
    desc: "Search & faceted browse with <1 s median latency"
  - id: R-04
    desc: "Automated overdue notices by email/SMS"
nonfunctional_requirements:
  - id: R-05
    quality: Performance
    desc: "99th-percentile search response < 800 ms"
  - id: R-06
    quality: Availability
    desc: "Service uptime ≥ 99.5%"
  - id: R-07
    quality: Security
    desc: "Role-based access; yearly PEN tests"
  - id: R-08
    quality: Scalability
    desc: "Catalogue up to 1 million items"
target_cloud:
  provider: Azure
  regions:
    - westeurope
""",
    "NextGen Point-of-Sale (POS)": """
title: NextGen Point-of-Sale
description: >
  Store-front POS inspired by Craig Larman’s case study; supports bar-code
  scanning, promotions, and offline queueing when networks fails.
domain: Retail / Point-of-Sale
constraints:
  - "PCI-DSS Level 1 compliance"
  - "Offline transaction buffering ≤ 24 h"
functional_requirements:
  - id: R-01
    desc: "Scan items & compute totals with tax rules per locale"
  - id: R-02
    desc: "Apply promotions & loyalty points in real time"
  - id: R-03
    desc: "Process card payments via Stripe Terminal"
  - id: R-04
    desc: "Print or email receipt with QR-code"
nonfunctional_requirements:
  - id: R-05
    quality: Performance
    desc: "Complete sale in ≤ 500 ms P95"
  - id: R-06
    quality: Availability
    desc: "Uptime ≥ 99.9%"
  - id: R-07
    quality: Reliability
    desc: "No lost sales during network outages"
  - id: R-08
    quality: Usability
    desc: "Cashier workflow ≤ 4 clicks"
target_cloud:
  provider: AWS
  regions:
    - us-east-1
    - eu-west-1
""",
    "Online Bookstore (Mini-Amazon)": """
title: Online Bookstore
description: >
  A scaled-down Amazon-style e-commerce site for buying physical and
  electronic books with recommendations and reviews.
domain: Retail / E-commerce
constraints:
  - "GDPR & CCPA compliance"
  - "Multi-currency (USD, EUR, GBP)"
  - "Integrate with third-party shipping APIs"
functional_requirements:
  - id: R-01
    desc: "Browse & keyword search the catalogue"
  - id: R-02
    desc: "Shopping cart & secure checkout"
  - id: R-03
    desc: "Customer reviews & 5-star ratings"
  - id: R-04
    desc: "‘Customers also bought’ recommendations"
nonfunctional_requirements:
  - id: R-05
    quality: Scalability
    desc: "Handle 1 M MAU without degradation"
  - id: R-06
    quality: Performance
    desc: "Page load < 2 s on 3G"
  - id: R-07
    quality: Availability
    desc: "99.95% uptime"
  - id: R-08
    quality: Security
    desc: "OWASP Top-10 mitigations"
target_cloud:
  provider: AWS
  regions:
    - us-east-1
    - eu-west-1
    - ap-southeast-1
""",
    "Student Information System (SIS)": """
title: Student Information System
description: >
  Central system for universities to manage student records, enrollment,
  grades, and transcripts across multiple campuses.
domain: Education / Administration
constraints:
  - "FERPA compliance (US) & GDPR (EU)"
  - "Multi-campus tenancy"
functional_requirements:
  - id: R-01
    desc: "Maintain student demographic & academic records"
  - id: R-02
    desc: "Online course enrollment & wait-listing"
  - id: R-03
    desc: "Faculty grade submission & change history"
  - id: R-04
    desc: "Generate official transcripts (PDF)"
nonfunctional_requirements:
  - id: R-05
    quality: Security
    desc: "Field-level encryption for PII"
  - id: R-06
    quality: Integrity
    desc: "Immutable audit logs for grade changes"
  - id: R-07
    quality: Availability
    desc: "Uptime ≥ 99.8% during term"
  - id: R-08
    quality: Maintainability
    desc: "≤ 20% mean time to repair (MTTR) per incident"
target_cloud:
  provider: GCP
  regions:
    - us-east1
    - europe-west4
""",
    "Clinic Management System": """
title: Clinic Management System
description: >
  Manages patient admissions, electronic medical records, scheduling,
  and billing for medium-sized hospitals and clinics.
domain: Healthcare / Clinical IT
constraints:
  - "HIPAA & GDPR compliance"
  - "High availability 99.99%"
  - "Data retention ≥ 10 years"
functional_requirements:
  - id: R-01
    desc: "Patient registration & demographic capture"
  - id: R-02
    desc: "Appointment scheduling with resource clash checks"
  - id: R-03
    desc: "Electronic Medical Record (EMR) with audit trail"
  - id: R-04
    desc: "Billing & insurance claim submission"
nonfunctional_requirements:
  - id: R-05
    quality: Security
    desc: "Access via multi-factor auth; AES-256 at rest"
  - id: R-06
    quality: Availability
    desc: "Uptime ≥ 99.99% (active-active)"
  - id: R-07
    quality: Performance
    desc: "EMR screen load < 1 s P95"
  - id: R-08
    quality: Interoperability
    desc: "HL7 FHIR APIs for lab & imaging systems"
target_cloud:
  provider: Hybrid
  regions:
    - on-prem-k8s
    - eu-central-1
"""
}

## Run all experiments and evaluation

In [60]:
import shutil
import os
import platform
import subprocess

def zip_folder_with_increment(
    folder_to_zip: str,
    base_name: str = "evaluation_results_openai",
    open_folder: bool = True,
) -> str:
    """
    Zips the specified folder and saves it with an incremented filename if needed.

    The archive is written to ``<base_name>.zip``; when that file already
    exists, ``<base_name>_1.zip``, ``<base_name>_2.zip``, ... are tried until a
    free name is found.

    Args:
        folder_to_zip (str): Path to the folder to zip.
        base_name (str): Base name (optionally with a directory) for the zip
            file, without the ``.zip`` extension
            (default is 'evaluation_results_openai').
        open_folder (bool): When True (default), try to reveal the archive's
            folder in the platform file explorer. This is best-effort: a
            failure to launch the explorer never invalidates the archive.

    Returns:
        str: The absolute path to the created zip file, or "" when the folder
        does not exist or archiving failed.
    """
    zip_filename = f"{base_name}.zip"
    counter = 1

    # Find a non-conflicting filename
    while os.path.exists(zip_filename):
        zip_filename = f"{base_name}_{counter}.zip"
        counter += 1

    # Strip only the trailing extension; str.replace would also remove a
    # ".zip" that happens to occur elsewhere in the path.
    zip_base_name = zip_filename[:-len(".zip")]

    print(f"Zipping the folder: '{folder_to_zip}' into '{zip_filename}'...")

    # Check explicitly: some Python versions build an empty archive instead of
    # raising when the root directory is missing.
    if not os.path.isdir(folder_to_zip):
        print(f"❌ Error: The directory '{folder_to_zip}' was not found.")
        return ""

    try:
        shutil.make_archive(zip_base_name, 'zip', folder_to_zip)
    except FileNotFoundError:
        print(f"❌ Error: The directory '{folder_to_zip}' was not found.")
        return ""
    except Exception as e:
        print(f"⚠️ An error occurred: {e}")
        return ""

    print(f"✅ Successfully created zip file: '{zip_filename}'")

    abs_path = os.path.abspath(zip_filename)
    print(f"📁 You can find the zipped file here: {abs_path}")

    if open_folder:
        # Optional: open folder in file explorer. Kept outside the archiving
        # try-block so that a missing `open`/`xdg-open` binary (e.g. on a
        # headless server) is not reported as a failed archive.
        folder_path = os.path.dirname(abs_path)
        try:
            system_name = platform.system()
            if system_name == "Windows":
                os.startfile(folder_path)
            elif system_name == "Darwin":  # macOS
                subprocess.run(["open", folder_path])
            elif system_name == "Linux":
                subprocess.run(["xdg-open", folder_path])
        except OSError as e:
            print(f"ℹ️ Could not open the folder automatically: {e}")

    return abs_path

# Example usage:
# zip_folder_with_increment("evaluation_results")


### GPT experiment

#### Generate

In [None]:
# --- 1. Define the configurations for your experiments ---
from langgraph.checkpoint.memory import InMemorySaver



# Each dictionary is a complete configuration for a single experiment run:
#   name            - unique label, used below as the key in all_experiment_results
#   model_name      - model passed to create_c4_modeler_graph
#   analysis_method - "collaborative" or "simple"
#   collab_rounds   - number of rounds; None for the 'simple' method
# Uncomment an entry to include it in the run.
EXPERIMENT_CONFIGS = [
    {
        "name": "GPT4omini_Collaborative_3_Rounds",
        "model_name": "gpt-4o-mini",
        "analysis_method": "collaborative",
        "collab_rounds": 3,
    },
    # {
    #     "name": "GPT4omini_Collaborative_1_Rounds",
    #     "model_name": "gpt-4o-mini",
    #     "analysis_method": "collaborative",
    #     "collab_rounds": 1,
    # },
    # {
    #     "name": "GPT4o_Collaborative_3_Rounds",
    #     "model_name": "gpt-4o",
    #     "analysis_method": "collaborative",
    #     "collab_rounds": 3,
    # },
    # {
    #     "name": "GPT4o_Collaborative_1_Rounds",
    #     "model_name": "gpt-4o",
    #     "analysis_method": "collaborative",
    #     "collab_rounds": 1,
    # },
    # {
    #     "name": "GPT4o_Mini_Simple",
    #     "model_name": "gpt-4o-mini",
    #     "analysis_method": "simple",
    #     "collab_rounds": None,  # Not applicable for the 'simple' method
    # },
    # {
    #     "name": "GPT4o_Simple",
    #     "model_name": "gpt-4o",
    #     "analysis_method": "simple",
    #     "collab_rounds": None,  # Not applicable for the 'simple' method
    # },
]

# --- 2. Run the experiments by looping through the configurations ---
# Results are keyed by experiment name. Re-running this cell starts from an
# empty dict, so results from a previous run of the cell are discarded.
all_experiment_results = {}

# A single in-memory checkpointer is created once and shared by every graph
# built in the loop below.
checkpointer = InMemorySaver()

# Loop through the list of configuration dictionaries.
for config in EXPERIMENT_CONFIGS:
    experiment_name = config["name"]

    print("\n" + "="*60)
    print(f"🚀 Starting Experiment: {experiment_name}")
    print(f"   - Model: {config['model_name']}")
    print(f"   - Method: {config['analysis_method']}")
    if config['analysis_method'] == 'collaborative':
        print(f"   - Rounds: {config['collab_rounds']}")
    print("="*60)

    # A. Create a dedicated app instance using the full configuration.
    # All parameters come from the config dictionary.
    current_app = create_c4_modeler_graph(
        checkpointer=checkpointer,
        model_name=config["model_name"],
        analysis_method=config["analysis_method"],
        collab_rounds=config["collab_rounds"]
    )

    # B. Run the experiment function against this specific app, once for the
    # whole set of system briefs.
    model_specific_results = run_all_experiments(
        app_instance=current_app,
        system_briefs_data=system_briefs
    )

    # C. Store the results using the unique experiment name as the key.
    all_experiment_results[experiment_name] = model_specific_results

    print(f"✅ Finished experiment: {experiment_name}.")


print("\n\n🎉 All experiments completed for all configurations.")


🚀 Starting Experiment: GPT4o_Mini_Simple
   - Model: gpt-4o-mini
   - Method: simple
--- 🏗️ Building graph with model: 'gpt-4o-mini' and analysis: 'simple' ---
--- ⚙️  Instantiating model: gpt-4o-mini ---
✅ LangGraph C4 Modeler compiled successfully with checkpointer!

--- 🚀 Starting C4 Model Generation Experiments ---


--- Processing: Library Management System (LMS) ---


--- LangGraph Thread ID: 20250612-141410-library-management-system-lms-a77be777 ---
--- ✍️ Generating Context Level Analysis ---

Node: analysis
--- 📝 Generating Context Level YAML ---

Node: yaml
--- 🎨 Generating Context Level Diagram ---

Node: diagram
--- ✍️ Generating Container Level Analysis ---

Node: analysis
--- 📝 Generating Container Level YAML ---

Node: yaml
--- 🎨 Generating Container Level Diagram ---

Node: diagram
--- ⚙️ Populating Component Queue ---
Found containers to process: ['Web Application Container', 'API Container', 'Database Container', 'Notification Service Container']
--- 🤔 Checking Compo

In [58]:
# all_experiment_results['Gemini_Flash_Collaborative_2_Rounds'][0]['final_c4_model']

#### Eval

In [59]:
# --- 1. Define Your Judge Model ---
JUDGE_MODEL = "gemini-2.5-flash-preview-05-20"


# --- 2. Run Evaluations by Looping Through Experiment Configs ---
print("\n" + "="*60)
print(f"🔬 Running All Evaluations (Judge: {JUDGE_MODEL})")
print("="*60)

# Maps experiment name -> {thread_id: evaluation report}.
final_evaluation_summaries = {}

# Loop through the full configs so the evaluator receives the complete
# experiment context, not just the model name.
for config in EXPERIMENT_CONFIGS:
    experiment_name = config["name"]
    print(f"\n--- Evaluating Experiment: {experiment_name} ---")

    # Get the corresponding results for this specific experiment run
    experiment_results = all_experiment_results.get(experiment_name)

    if not experiment_results:
        print(f"⚠️  Warning: No results found for '{experiment_name}'. Skipping evaluation.")
        continue

    final_evaluation_summary = run_all_evaluations(
        experiment_results=experiment_results,
        experiment_config=config, # Pass the whole config
        save_c4_artifacts_func=save_c4_artifacts,
        run_full_evaluation_func=run_full_evaluation,
        save_all_evaluation_reports_func=save_all_evaluation_reports,
        format_evaluation_report_func=format_evaluation_report,
        judge_model_name=JUDGE_MODEL
    )

    final_evaluation_summaries[experiment_name] = final_evaluation_summary

# --- 3. Archive the results once ---
# The results folder is cumulative across experiments, so zipping it inside
# the loop produced one redundant, ever-larger archive per experiment. Archive
# a single time, and only when at least one experiment was evaluated.
if final_evaluation_summaries:
    zip_folder_with_increment("evaluation_results_openai")

print("\n\n🎉 All evaluations completed.")


🔬 Running All Evaluations (Judge: gemini-2.5-flash-preview-05-20)

--- Evaluating Experiment: GPT4o_Mini_Simple ---

################################################################################
### ✨ PROCESSING ALL RESULTS FOR EXPERIMENT: GPT4o_Mini_Simple ✨ ###
### Judge Model: gemini-2.5-flash-preview-05-20
################################################################################


--- Processing Run: 'Library Management System (LMS)' (Thread: 20250612-141410-library-management-system-lms-a77be777) ---

💾 Saving C4 model artifacts to: evaluation_results_openai/GPT4o_Mini_Simple/Library Management System (LMS)_20250612-141410-library-management-system-lms-a77be777/c4_artifacts
  - Saved 1_context_analysis.md
  - Saved 1_context_definition.yaml
  - Saved 1_context_diagram.puml
  - Saved 2_container_analysis.md
  - Saved 2_container_definition.yaml
  - Saved 2_container_diagram.puml
  - Saved web_application_container_analysis.md
  - Saved web_application_container_definitio

Key 'parameters' is not supported in schema, ignoring



--- Running Holistic Expert Critiques ---
⚖️ Evaluating Metric 7: Principal Architect's Critique...


Key 'parameters' is not supported in schema, ignoring


🛡️  Evaluating Metric 8: Security 'Red Team' Assessment...


📋 FINAL EVALUATION REPORT (Judge: gemini-2.5-flash-preview-05-20) 📋

{
  "evaluationMetadata": {
    "judgeModel": "gemini-2.5-flash-preview-05-20",
    "judgeModelTemperature": 0.0,
    "evaluationTimestamp": "2025-06-12T14:17:32.895762"
  },
  "compilationSuccess": {
    "metric": "Compilation Success Rate",
    "score": 100.0,
    "successful": 6,
    "total": 6,
    "details": [
      {
        "source": "1_Context",
        "status": "Compiled",
        "error": null
      },
      {
        "source": "2_Containers",
        "status": "Compiled",
        "error": null
      },
      {
        "source": "3_Component_Web Application Container",
        "status": "Compiled",
        "error": null
      },
      {
        "source": "3_Component_API Container",
        "status": "Compiled",
        "error": null
      },
      {
        "source": "3_Component_Database Container",
        "status": "Compiled",
        "error":

### Google experiment

In [None]:
# --- 1. Experiment configurations ---
from langgraph.checkpoint.memory import InMemorySaver


# Every entry fully describes one experiment run: a display name, the model to
# use, the analysis method, and the number of collaboration rounds. Add or edit
# entries here to compare other models, methods, or parameters.
EXPERIMENT_CONFIGS = [
    dict(
        name="Gemini_Flash_Collaborative_3_Rounds",
        model_name="gemini-2.5-flash-preview-05-20",
        analysis_method="collaborative",
        collab_rounds=3,
    ),
    dict(
        name="Gemini_Flash_Collaborative_1_Rounds",
        model_name="gemini-2.5-flash-preview-05-20",
        analysis_method="collaborative",
        collab_rounds=1,
    ),
    dict(
        name="Gemini_Flash_Simple",
        model_name="gemini-2.5-flash-preview-05-20",
        analysis_method="simple",
        collab_rounds=None,  # the 'simple' method has no collaboration rounds
    ),
]

# --- 2. Execute every configured experiment ---
# Results are keyed by the experiment's unique name.
all_experiment_results = {}

# A single checkpointer instance is handed to every graph built below.
checkpointer = InMemorySaver()

for config in EXPERIMENT_CONFIGS:
    experiment_name = config["name"]

    # Banner describing the run that is about to start.
    print(f"\n{'=' * 60}")
    print(f"🚀 Starting Experiment: {experiment_name}")
    print(f"   - Model: {config['model_name']}")
    print(f"   - Method: {config['analysis_method']}")
    if config['analysis_method'] == 'collaborative':
        print(f"   - Rounds: {config['collab_rounds']}")
    print('=' * 60)

    # A. Build a graph dedicated to this configuration.
    current_app = create_c4_modeler_graph(
        checkpointer=checkpointer,
        **{
            option: config[option]
            for option in ("model_name", "analysis_method", "collab_rounds")
        },
    )

    # B. Run the whole set of system briefs through that graph.
    model_specific_results = run_all_experiments(
        app_instance=current_app,
        system_briefs_data=system_briefs,
    )

    # C. File the results under the experiment's name.
    all_experiment_results[experiment_name] = model_specific_results

    print(f"✅ Finished experiment: {experiment_name}.")

print("\n\n🎉 All experiments completed for all configurations.")

In [None]:
# --- 1. Judge model used for every evaluation ---
JUDGE_MODEL = "gemini-2.5-flash-preview-05-20"


# --- 2. Evaluate each experiment configuration in turn ---
print(f"\n{'=' * 60}")
print(f"🔬 Running All Evaluations (Judge: {JUDGE_MODEL})")
print('=' * 60)

# Evaluation summaries keyed by experiment name.
final_evaluation_summaries = {}

for config in EXPERIMENT_CONFIGS:
    experiment_name = config["name"]
    print(f"\n--- Evaluating Experiment: {experiment_name} ---")

    # Results recorded for this experiment by the run cell (None or empty if it never ran).
    experiment_results = all_experiment_results.get(experiment_name)

    if experiment_results:
        # The whole config dict is forwarded so the evaluator has the full
        # context of the run it is judging.
        final_evaluation_summary = run_all_evaluations(
            experiment_results=experiment_results,
            experiment_config=config,
            save_c4_artifacts_func=save_c4_artifacts,
            run_full_evaluation_func=run_full_evaluation,
            save_all_evaluation_reports_func=save_all_evaluation_reports,
            format_evaluation_report_func=format_evaluation_report,
            judge_model_name=JUDGE_MODEL,
        )

        final_evaluation_summaries[experiment_name] = final_evaluation_summary
    else:
        print(f"⚠️  Warning: No results found for '{experiment_name}'. Skipping evaluation.")

print("\n\n🎉 All evaluations completed.")

In [None]:
# prompt: write a code to zip and download evaluation_results folder
#
# Zips the evaluation results folder and, when running inside Google Colab,
# triggers a browser download of the archive. Outside Colab the archive is
# still created and its location is printed instead.

import shutil
import os

folder_to_zip = 'evaluation_results'
zip_filename = 'evaluation_results.zip'

print(f"Zipping the folder: '{folder_to_zip}'...")

try:
    # Check the folder explicitly: some Python versions let make_archive
    # produce an empty archive for a missing root_dir instead of raising.
    if not os.path.isdir(folder_to_zip):
        raise FileNotFoundError(folder_to_zip)

    # make_archive appends the '.zip' extension itself, so pass the name without it.
    shutil.make_archive(os.path.splitext(zip_filename)[0], 'zip', folder_to_zip)
    print(f"Successfully created zip file: '{zip_filename}'")

    # google.colab only exists inside Colab; importing it lazily keeps the
    # zipping step usable on a local kernel.
    try:
        from google.colab import files
    except ImportError:
        files = None

    if files is None:
        print(f"Not running in Google Colab; the archive is available at: {os.path.abspath(zip_filename)}")
    else:
        print("Starting download... Please wait.")
        print("Note: If the download doesn't start, check if your browser is blocking pop-ups for this site.")

        files.download(zip_filename)

except FileNotFoundError:
    print(f"Error: The directory '{folder_to_zip}' was not found. Please ensure the previous script ran successfully.")
except Exception as e:
    # Best-effort cell: report any other failure instead of aborting the notebook.
    print(f"An error occurred: {e}")


### DeepSeek experiment

In [None]:
# --- 1. Experiment configurations ---
from langgraph.checkpoint.memory import InMemorySaver



# Every entry fully describes one experiment run: a display name, the model to
# use, the analysis method, and the number of collaboration rounds. Add or edit
# entries here to compare other models, methods, or parameters.
EXPERIMENT_CONFIGS = [
    dict(
        name="DeepSeek_Collaborative_3_Rounds",
        model_name="deepseek-chat",
        analysis_method="collaborative",
        collab_rounds=3,
    ),
    dict(
        name="DeepSeek_Collaborative_1_Rounds",
        model_name="deepseek-chat",
        analysis_method="collaborative",
        collab_rounds=1,
    ),
    dict(
        name="DeepSeek_Simple",
        model_name="deepseek-chat",
        analysis_method="simple",
        collab_rounds=None,  # the 'simple' method has no collaboration rounds
    ),
]

# --- 2. Execute every configured experiment ---
# Results are keyed by the experiment's unique name.
all_experiment_results = {}

# A single checkpointer instance is handed to every graph built below.
checkpointer = InMemorySaver()

for config in EXPERIMENT_CONFIGS:
    experiment_name = config["name"]

    # Banner describing the run that is about to start.
    print(f"\n{'=' * 60}")
    print(f"🚀 Starting Experiment: {experiment_name}")
    print(f"   - Model: {config['model_name']}")
    print(f"   - Method: {config['analysis_method']}")
    if config['analysis_method'] == 'collaborative':
        print(f"   - Rounds: {config['collab_rounds']}")
    print('=' * 60)

    # A. Build a graph dedicated to this configuration.
    current_app = create_c4_modeler_graph(
        checkpointer=checkpointer,
        **{
            option: config[option]
            for option in ("model_name", "analysis_method", "collab_rounds")
        },
    )

    # B. Run the whole set of system briefs through that graph.
    model_specific_results = run_all_experiments(
        app_instance=current_app,
        system_briefs_data=system_briefs,
    )

    # C. File the results under the experiment's name.
    all_experiment_results[experiment_name] = model_specific_results

    print(f"✅ Finished experiment: {experiment_name}.")

print("\n\n🎉 All experiments completed for all configurations.")

In [None]:
# --- 1. Judge model used for every evaluation ---
JUDGE_MODEL = "gemini-2.5-flash-preview-05-20"


# --- 2. Evaluate each experiment configuration in turn ---
print(f"\n{'=' * 60}")
print(f"🔬 Running All Evaluations (Judge: {JUDGE_MODEL})")
print('=' * 60)

# Evaluation summaries keyed by experiment name.
final_evaluation_summaries = {}

for config in EXPERIMENT_CONFIGS:
    experiment_name = config["name"]
    print(f"\n--- Evaluating Experiment: {experiment_name} ---")

    # Results recorded for this experiment by the run cell (None or empty if it never ran).
    experiment_results = all_experiment_results.get(experiment_name)

    if experiment_results:
        # The whole config dict is forwarded so the evaluator has the full
        # context of the run it is judging.
        final_evaluation_summary = run_all_evaluations(
            experiment_results=experiment_results,
            experiment_config=config,
            save_c4_artifacts_func=save_c4_artifacts,
            run_full_evaluation_func=run_full_evaluation,
            save_all_evaluation_reports_func=save_all_evaluation_reports,
            format_evaluation_report_func=format_evaluation_report,
            judge_model_name=JUDGE_MODEL,
        )

        final_evaluation_summaries[experiment_name] = final_evaluation_summary
    else:
        print(f"⚠️  Warning: No results found for '{experiment_name}'. Skipping evaluation.")

print("\n\n🎉 All evaluations completed.")

# Gathering results

In [62]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.3.0-cp310-cp310-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.0-cp310-cp310-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
    --------------------------------------- 0.3/11.1 MB ? eta -:--:--
   - -------------------------------------- 0.5/11.1 MB 1.2 MB/s eta 0:00:09
   -- ------------------------------------- 0.8/11.1 MB 1.2 MB/s eta 0:00:09
   --- ------------------------------------ 1.0/11.1 MB 1.4 MB/s eta 0:00:08
   ----- ---------------------------------- 1.6/11.1 MB 1.5 MB/s eta 0:00:07
   ------- -------------------------------- 2.1/11.1 MB 1.7 MB/s eta 0:00:06
   --------- ------------------------------ 2.6

In [92]:
import json
import pandas as pd
from pathlib import Path
from typing import Dict, Any, List

def extract_scores_from_report(report: Dict[str, Any]) -> Dict[str, float]:
    """
    Flatten one evaluation report into a ``{metric_name: value}`` dictionary.

    Lookups are tolerant of partial reports: if any level of a nested path is
    missing, ``None``, or not a dictionary (for example when a judge step
    failed and stored an error string), that metric is simply omitted instead
    of raising.

    Args:
        report: The nested report dictionary for a single evaluated run.

    Returns:
        A dictionary containing only the metrics that were found. The
        ``qualitative_rubric_avg`` entry is the mean of the ``average_score``
        values found under the context, container, and component evaluations.
    """
    def _dig(node: Any, *keys: str) -> Any:
        """Follow ``keys`` through nested dicts; None once a level is absent or not a dict."""
        for key in keys:
            if not isinstance(node, dict):
                return None
            node = node.get(key)
        return node

    scores = {}

    # Layer 1: Holistic Metrics
    scores['compilation_success'] = _dig(report, 'compilationSuccess', 'score')
    scores['abstraction_adherence'] = _dig(report, 'abstractionAdherence', 'score')
    scores['completeness'] = _dig(report, 'missingInformation', 'score')
    scores['naming_consistency'] = _dig(report, 'emergentNamingConsistency', 'score')

    # Layer 2: Level-Specific Metrics
    scores['semantic_consistency'] = _dig(report, 'contextEvaluation', 'semanticConsistency', 'score')
    scores['definitional_consistency_container'] = _dig(report, 'containerEvaluation', 'definitionalConsistency', 'score')

    # Average the qualitative rubrics across all levels
    qualitative_scores = []
    for eval_key in ['contextEvaluation', 'containerEvaluation']:
        level_avg = _dig(report, eval_key, 'qualitativeRubric', 'average_score')
        if level_avg is not None:
            qualitative_scores.append(level_avg)

    component_evals = _dig(report, 'componentEvaluations')
    if isinstance(component_evals, dict):
        for comp_eval in component_evals.values():
            comp_avg = _dig(comp_eval, 'qualitativeRubric', 'average_score')
            if comp_avg is not None:
                qualitative_scores.append(comp_avg)

    if qualitative_scores:
        scores['qualitative_rubric_avg'] = sum(qualitative_scores) / len(qualitative_scores)
    else:
        scores['qualitative_rubric_avg'] = None

    # Layer 3: Expert Critiques (extracting a key rating)
    scores['architect_feasibility_rating'] = _dig(report, 'architectCritique', 'critique', 'feasibilityAndSoundness', 'rating')
    scores['security_risk_score'] = _dig(report, 'securityAssessment', 'assessment', 'overallRiskScore')

    # Drop metrics that were not found so absent values do not become explicit entries.
    return {k: v for k, v in scores.items() if v is not None}


def gather_and_process_results(base_dir: str = ".") -> pd.DataFrame:
    """
    Collect every ``all_evaluation_summary.json`` below ``base_dir`` into one DataFrame.

    The name of the directory holding each summary file is treated as the
    experiment name and split into model / method / rounds. The method token
    ("Collaborative" or "Simple", case-insensitive) is located first, so model
    names that themselves contain underscores (e.g. ``Gemini_Flash_Simple``)
    are kept intact. Names without a recognised method token fall back to the
    positional ``<model>_<method>_<rounds>_...`` layout.

    Args:
        base_dir: Directory that is searched recursively for summary files.

    Returns:
        One row per (experiment, run) with the experiment metadata columns plus
        whatever ``extract_scores_from_report`` returns for that run.
    """
    known_methods = {"collaborative", "simple"}

    def _parse_experiment_name(name: str) -> Any:
        """Split an experiment directory name into (model, method, rounds)."""
        parts = name.split('_')
        for idx, token in enumerate(parts):
            # idx > 0: the first token is always (part of) the model name.
            if idx > 0 and token.lower() in known_methods:
                trailing = parts[idx + 1:]
                rounds = trailing[0] if trailing and trailing[0].isdigit() else 'N/A'
                return '_'.join(parts[:idx]), token, rounds
        # Unrecognised layout: positional fallback that never indexes out of range.
        model = parts[0]
        method = parts[1] if len(parts) > 1 else 'N/A'
        rounds = parts[2] if len(parts) > 3 else 'N/A'
        return model, method, rounds

    all_flat_results: List[Dict[str, Any]] = []

    # rglob order depends on the filesystem; sort for reproducible row order.
    summary_files = sorted(Path(base_dir).rglob('all_evaluation_summary.json'))
    print(f"Found {len(summary_files)} experiment summary files to process.\n")

    for file_path in summary_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if not isinstance(data, dict):
                print(f"Could not process file {file_path}: expected a JSON object mapping run ids to reports")
                continue

            # The experiment name is the name of the parent directory
            experiment_name = file_path.parent.name
            model, method, rounds = _parse_experiment_name(experiment_name)

            # Process each test case within the file
            for run_id, report in data.items():
                if not isinstance(report, dict):
                    print(f"Skipping run '{run_id}' in {file_path}: report is not a JSON object")
                    continue

                flat_result = {
                    "experiment_name": experiment_name,
                    "model": model,
                    "method": method,
                    "rounds": rounds,
                    "run_id": run_id
                }

                # Extract scores from the report
                scores = extract_scores_from_report(report)
                flat_result.update(scores)

                all_flat_results.append(flat_result)

        except (json.JSONDecodeError, KeyError) as e:
            print(f"Could not process file {file_path}: {e}")

    return pd.DataFrame(all_flat_results)


if __name__ == "__main__":
    # Walk the current directory tree and flatten every summary file found.
    results_df = gather_and_process_results()

    if results_df.empty:
        print("No results were processed. Please check the directory structure.")
    else:
        # Metrics to average per configuration. 'definitional_consistency_container'
        # and 'qualitative_rubric_avg' are extracted as well but left out here.
        score_columns = [
            'compilation_success',
            'abstraction_adherence',
            'completeness',
            'naming_consistency',
            'semantic_consistency',
            'architect_feasibility_rating',
            'security_risk_score',
        ]

        # A metric that no report contained has no column yet; add it as
        # all-missing so the column selection below does not fail.
        for col in score_columns:
            if col in results_df.columns:
                continue
            results_df[col] = pd.NA

        # Mean of each metric per (model, method, rounds), rounded to 2 decimals.
        summary = (
            results_df
            .groupby(['model', 'method', 'rounds'])[score_columns]
            .mean()
            .round(2)
        )

        # Widen pandas' console rendering so all metric columns are shown.
        pd.set_option('display.max_columns', None)
        pd.set_option('display.width', 200)

        print("--- Average Scores Across All Test Cases ---")
        print(summary)

Found 12 experiment summary files to process.

--- Average Scores Across All Test Cases ---
                                    compilation_success  abstraction_adherence  completeness  naming_consistency  semantic_consistency  architect_feasibility_rating  security_risk_score
model         method        rounds                                                                                                                                                       
GPT4o         Collaborative 1                     89.78                 100.00        100.00               16.46                 25.62                           2.8                 31.6
                            3                    100.00                 100.00        100.00                8.24                 34.12                           3.6                 36.6
              Simple        N/A                   80.11                  97.50        100.00               60.85                 51.71                           4.0

In [93]:
import json
import pandas as pd
from pathlib import Path
from typing import Dict, Any, List

def extract_scores_from_report(report: Dict[str, Any]) -> Dict[str, float]:
    """
    Flatten one evaluation report into a ``{metric_name: value}`` dictionary.

    Lookups are tolerant of partial reports: if any level of a nested path is
    missing, ``None``, or not a dictionary (for example when a judge step
    failed and stored an error string), that metric is simply omitted instead
    of raising.

    Args:
        report: The nested report dictionary for a single evaluated run.

    Returns:
        A dictionary containing only the metrics that were found. The
        ``qualitative_rubric_avg`` entry is the mean of the ``average_score``
        values found under the context, container, and component evaluations.
    """
    def _dig(node: Any, *keys: str) -> Any:
        """Follow ``keys`` through nested dicts; None once a level is absent or not a dict."""
        for key in keys:
            if not isinstance(node, dict):
                return None
            node = node.get(key)
        return node

    scores = {}

    # Layer 1: Holistic Metrics
    scores['compilation_success'] = _dig(report, 'compilationSuccess', 'score')
    scores['abstraction_adherence'] = _dig(report, 'abstractionAdherence', 'score')
    scores['completeness'] = _dig(report, 'missingInformation', 'score')
    scores['naming_consistency'] = _dig(report, 'emergentNamingConsistency', 'score')

    # Layer 2: Level-Specific Metrics
    scores['semantic_consistency'] = _dig(report, 'contextEvaluation', 'semanticConsistency', 'score')
    scores['definitional_consistency_container'] = _dig(report, 'containerEvaluation', 'definitionalConsistency', 'score')

    # Average the qualitative rubrics across all levels
    qualitative_scores = []
    for eval_key in ['contextEvaluation', 'containerEvaluation']:
        level_avg = _dig(report, eval_key, 'qualitativeRubric', 'average_score')
        if level_avg is not None:
            qualitative_scores.append(level_avg)

    component_evals = _dig(report, 'componentEvaluations')
    if isinstance(component_evals, dict):
        for comp_eval in component_evals.values():
            comp_avg = _dig(comp_eval, 'qualitativeRubric', 'average_score')
            if comp_avg is not None:
                qualitative_scores.append(comp_avg)

    if qualitative_scores:
        scores['qualitative_rubric_avg'] = sum(qualitative_scores) / len(qualitative_scores)
    else:
        scores['qualitative_rubric_avg'] = None

    # Layer 3: Expert Critiques (extracting key ratings)
    scores['architect_feasibility_rating'] = _dig(report, 'architectCritique', 'critique', 'feasibilityAndSoundness', 'rating')
    scores['architect_clarity_communication_rating'] = _dig(report, 'architectCritique', 'critique', 'clarityAndCommunication', 'rating')
    scores['security_risk_score'] = _dig(report, 'securityAssessment', 'assessment', 'overallRiskScore')

    # Drop metrics that were not found so absent values do not become explicit entries.
    return {k: v for k, v in scores.items() if v is not None}


def gather_and_process_results(base_dir: str = ".") -> pd.DataFrame:
    """
    Collect every ``all_evaluation_summary.json`` below ``base_dir`` into one DataFrame.

    The name of the directory holding each summary file is treated as the
    experiment name and split into model / method / rounds. The method token
    ("Collaborative" or "Simple", case-insensitive) is located first, so model
    names that themselves contain underscores (e.g. ``Gemini_Flash_Simple``)
    are kept intact. Names without a recognised method token fall back to the
    positional ``<model>_<method>_<rounds>_...`` layout.

    Args:
        base_dir: Directory that is searched recursively for summary files.

    Returns:
        One row per (experiment, run) with the experiment metadata columns plus
        whatever ``extract_scores_from_report`` returns for that run.
    """
    known_methods = {"collaborative", "simple"}

    def _parse_experiment_name(name: str) -> Any:
        """Split an experiment directory name into (model, method, rounds)."""
        parts = name.split('_')
        for idx, token in enumerate(parts):
            # idx > 0: the first token is always (part of) the model name.
            if idx > 0 and token.lower() in known_methods:
                trailing = parts[idx + 1:]
                rounds = trailing[0] if trailing and trailing[0].isdigit() else 'N/A'
                return '_'.join(parts[:idx]), token, rounds
        # Unrecognised layout: positional fallback that never indexes out of range.
        model = parts[0]
        method = parts[1] if len(parts) > 1 else 'N/A'
        rounds = parts[2] if len(parts) > 3 else 'N/A'
        return model, method, rounds

    all_flat_results: List[Dict[str, Any]] = []

    # rglob order depends on the filesystem; sort for reproducible row order.
    summary_files = sorted(Path(base_dir).rglob('all_evaluation_summary.json'))
    print(f"Found {len(summary_files)} experiment summary files to process.\n")

    for file_path in summary_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if not isinstance(data, dict):
                print(f"Could not process file {file_path}: expected a JSON object mapping run ids to reports")
                continue

            # The experiment name is the name of the parent directory
            experiment_name = file_path.parent.name
            model, method, rounds = _parse_experiment_name(experiment_name)

            # Process each test case within the file
            for run_id, report in data.items():
                if not isinstance(report, dict):
                    print(f"Skipping run '{run_id}' in {file_path}: report is not a JSON object")
                    continue

                flat_result = {
                    "experiment_name": experiment_name,
                    "model": model,
                    "method": method,
                    "rounds": rounds,
                    "run_id": run_id
                }

                # Extract scores from the report
                scores = extract_scores_from_report(report)
                flat_result.update(scores)

                all_flat_results.append(flat_result)

        except (json.JSONDecodeError, KeyError) as e:
            print(f"Could not process file {file_path}: {e}")

    return pd.DataFrame(all_flat_results)


if __name__ == "__main__":
    # Walk the current directory tree and flatten every summary file found.
    results_df = gather_and_process_results()

    if results_df.empty:
        print("No results were processed. Please check the directory structure.")
    else:
        # Metrics to average per configuration. 'definitional_consistency_container'
        # and 'qualitative_rubric_avg' are extracted as well; add them to this
        # list to include them in the averages.
        score_columns = [
            'compilation_success',
            'abstraction_adherence',
            'completeness',
            'naming_consistency',
            'semantic_consistency',
            'architect_feasibility_rating',
            'architect_clarity_communication_rating',
            'security_risk_score',
        ]

        # A metric that no report contained has no column yet; add it as
        # all-missing so the column selection below does not fail.
        for col in score_columns:
            if col in results_df.columns:
                continue
            results_df[col] = pd.NA

        # Mean of each metric per (model, method, rounds), rounded to 2 decimals.
        summary = (
            results_df
            .groupby(['model', 'method', 'rounds'])[score_columns]
            .mean()
            .round(2)
        )

        # Widen pandas' console rendering so all metric columns are shown.
        pd.set_option('display.max_columns', None)
        pd.set_option('display.width', 200)

        print("--- Average Scores Across All Test Cases ---")
        print(summary)

Found 12 experiment summary files to process.

--- Average Scores Across All Test Cases ---
                                    compilation_success  abstraction_adherence  completeness  naming_consistency  semantic_consistency  architect_feasibility_rating  \
model         method        rounds                                                                                                                                     
GPT4o         Collaborative 1                     89.78                 100.00        100.00               16.46                 25.62                           2.8   
                            3                    100.00                 100.00        100.00                8.24                 34.12                           3.6   
              Simple        N/A                   80.11                  97.50        100.00               60.85                 51.71                           4.0   
GPT4omini     Collaborative 1                     92.14             

In [68]:
!pip install tabulate

Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0


In [71]:
import pandas as pd

def summary_to_llm_readable(summary: pd.DataFrame) -> str:
    """
    Render the aggregated score table as markdown with a short explanatory intro.

    Args:
        summary: The aggregated scores; its index levels (model / method /
            rounds) are turned into ordinary columns before rendering.

    Returns:
        The intro paragraph followed by a markdown table. Known columns get
        human-readable headers; columns without a mapping keep their name.
        Float columns are formatted with two decimals.
    """
    # Reset index to get model/method/rounds as columns
    df = summary.reset_index()
    # Human-readable headers. rename() ignores keys that are not present, so
    # optional metrics can be listed here whether or not they were aggregated.
    df = df.rename(columns={
        "model": "Model",
        "method": "Method",
        "rounds": "Rounds",
        "compilation_success": "Compilation Success",
        "abstraction_adherence": "Abstraction Adherence",
        "completeness": "Completeness",
        "naming_consistency": "Naming Consistency",
        "semantic_consistency": "Semantic Consistency",
        "definitional_consistency_container": "Definitional Consistency (Container)",
        "qualitative_rubric_avg": "Qualitative Rubric Avg",
        "architect_feasibility_rating": "Architect Feasibility",
        "architect_clarity_communication_rating": "Architect Clarity & Communication",
        "security_risk_score": "Security Risk Score"
    })
    # Format floats to 2 decimals
    for col in df.columns:
        if df[col].dtype == float:
            df[col] = df[col].map(lambda x: f"{x:.2f}")
    # Convert to markdown table (pandas delegates this to the 'tabulate' package)
    markdown_table = df.to_markdown(index=False)
    # Add a short intro for LLM context
    intro = (
        "## C4 Model Evaluation Summary\n"
        "Below is a summary table of evaluation metrics for each experiment configuration. "
        "Each row represents a unique combination of model, method, and rounds. "
        "Scores are percentages unless otherwise noted (higher is better, except for Security Risk Score where lower is better).\n\n"
    )
    return intro + markdown_table

# Example usage: render the `summary` DataFrame built by the aggregation cell
# (that cell must have been run first) and print the markdown text.
llm_summary = summary_to_llm_readable(summary)
print(llm_summary)

## C4 Model Evaluation Summary
Below is a summary table of evaluation metrics for each experiment configuration. Each row represents a unique combination of model, method, and rounds. Scores are percentages unless otherwise noted (higher is better, except for Security Risk Score where lower is better).

| Model         | Method        | Rounds   |   Compilation Success |   Abstraction Adherence |   Completeness |   Naming Consistency |   Semantic Consistency |   Architect Feasibility |   Security Risk Score |
|:--------------|:--------------|:---------|----------------------:|------------------------:|---------------:|---------------------:|-----------------------:|------------------------:|----------------------:|
| DeepSeek      | Collaborative | 1        |                100    |                  100    |         100    |                84.17 |                  45.01 |                     3.4 |                  30.4 |
| DeepSeek      | Simple        | N/A      |                 97.1

In [72]:

# --- Main Evaluation Function (Now much cleaner) ---

def evaluate_cross_level_consistency(c4_model: Dict) -> Dict[str, Any]:
    """
    Score how consistently elements carry across the C4 levels of a model.

    The context, container, and per-container component YAML definitions are
    parsed, then the Context->Container check and the Container->Component
    checks are run when their inputs are available.

    Returns:
        A dict with the metric name, the percentage of passed checks (100 when
        no check could be run), the passed/total counts, and per-check details.
    """
    print("🤖 Evaluating Metric: Cross-Level Consistency Check...")

    # 1. Parse the YAML definition of every level.
    context_data = parse_yaml_safe(c4_model.get("context", {}).get("yaml_definition"))
    container_data = parse_yaml_safe(c4_model.get("containers", {}).get("yaml_definition"))
    component_data_map = {}
    for name, data in c4_model.get("components", {}).items():
        component_data_map[name] = parse_yaml_safe(data.get("yaml_definition", ""))

    all_details = {}

    # 2. Context -> Container (needs both levels to have parsed).
    if context_data and container_data:
        is_consistent, reason = _check_context_to_container(context_data, container_data)
        if is_consistent:
            status = "Pass"
        else:
            status = "Fail"
        all_details["Context->Container"] = {"status": status, "reason": reason}

    # 3. Container -> Components (one entry per check returned by the helper).
    if container_data and component_data_map:
        all_details.update(_check_container_to_components(container_data, component_data_map))

    # 4. Final score: share of checks whose status is "Pass".
    total_checks = len(all_details)
    passed_checks = len([entry for entry in all_details.values() if entry["status"] == "Pass"])
    if total_checks > 0:
        score = (passed_checks / total_checks) * 100
    else:
        score = 100

    return {
        "metric": "Cross-Level Consistency",
        "score": round(score, 2),
        "passed": passed_checks,
        "total": total_checks,
        "details": all_details
    }

# Eval again

In [84]:

# --- System briefs: maps each system's display name to its requirements brief (a YAML-formatted string) ---
system_briefs = {
    "Library Management System (LMS)": """
title: Library Management System
description: >
  A solution for public and academic libraries to catalogue items,
  manage circulation, and provide self-service portals for patrons.
domain: Education / Library Services
constraints:
  - "EU-only data residency"
  - "Open-source tech stack preferred"
  - "Must support bilingual UI (EN/FR)"
functional_requirements:
  - id: R-01
    desc: "Catalogue physical & digital items with MARC metadata"
  - id: R-02
    desc: "Patron self-checkout & returns via kiosks or mobile app"
  - id: R-03
    desc: "Search & faceted browse with <1 s median latency"
  - id: R-04
    desc: "Automated overdue notices by email/SMS"
nonfunctional_requirements:
  - id: R-05
    quality: Performance
    desc: "99th-percentile search response < 800 ms"
  - id: R-06
    quality: Availability
    desc: "Service uptime ≥ 99.5%"
  - id: R-07
    quality: Security
    desc: "Role-based access; yearly PEN tests"
  - id: R-08
    quality: Scalability
    desc: "Catalogue up to 1 million items"
target_cloud:
  provider: Azure
  regions:
    - westeurope
""",
    "NextGen Point-of-Sale (POS)": """
title: NextGen Point-of-Sale
description: >
  Store-front POS inspired by Craig Larman’s case study; supports bar-code
  scanning, promotions, and offline queueing when networks fails.
domain: Retail / Point-of-Sale
constraints:
  - "PCI-DSS Level 1 compliance"
  - "Offline transaction buffering ≤ 24 h"
functional_requirements:
  - id: R-01
    desc: "Scan items & compute totals with tax rules per locale"
  - id: R-02
    desc: "Apply promotions & loyalty points in real time"
  - id: R-03
    desc: "Process card payments via Stripe Terminal"
  - id: R-04
    desc: "Print or email receipt with QR-code"
nonfunctional_requirements:
  - id: R-05
    quality: Performance
    desc: "Complete sale in ≤ 500 ms P95"
  - id: R-06
    quality: Availability
    desc: "Uptime ≥ 99.9%"
  - id: R-07
    quality: Reliability
    desc: "No lost sales during network outages"
  - id: R-08
    quality: Usability
    desc: "Cashier workflow ≤ 4 clicks"
target_cloud:
  provider: AWS
  regions:
    - us-east-1
    - eu-west-1
""",
    "Online Bookstore (Mini-Amazon)": """
title: Online Bookstore
description: >
  A scaled-down Amazon-style e-commerce site for buying physical and
  electronic books with recommendations and reviews.
domain: Retail / E-commerce
constraints:
  - "GDPR & CCPA compliance"
  - "Multi-currency (USD, EUR, GBP)"
  - "Integrate with third-party shipping APIs"
functional_requirements:
  - id: R-01
    desc: "Browse & keyword search the catalogue"
  - id: R-02
    desc: "Shopping cart & secure checkout"
  - id: R-03
    desc: "Customer reviews & 5-star ratings"
  - id: R-04
    desc: "‘Customers also bought’ recommendations"
nonfunctional_requirements:
  - id: R-05
    quality: Scalability
    desc: "Handle 1 M MAU without degradation"
  - id: R-06
    quality: Performance
    desc: "Page load < 2 s on 3G"
  - id: R-07
    quality: Availability
    desc: "99.95% uptime"
  - id: R-08
    quality: Security
    desc: "OWASP Top-10 mitigations"
target_cloud:
  provider: AWS
  regions:
    - us-east-1
    - eu-west-1
    - ap-southeast-1
""",
    "Student Information System (SIS)": """
title: Student Information System
description: >
  Central system for universities to manage student records, enrollment,
  grades, and transcripts across multiple campuses.
domain: Education / Administration
constraints:
  - "FERPA compliance (US) & GDPR (EU)"
  - "Multi-campus tenancy"
functional_requirements:
  - id: R-01
    desc: "Maintain student demographic & academic records"
  - id: R-02
    desc: "Online course enrollment & wait-listing"
  - id: R-03
    desc: "Faculty grade submission & change history"
  - id: R-04
    desc: "Generate official transcripts (PDF)"
nonfunctional_requirements:
  - id: R-05
    quality: Security
    desc: "Field-level encryption for PII"
  - id: R-06
    quality: Integrity
    desc: "Immutable audit logs for grade changes"
  - id: R-07
    quality: Availability
    desc: "Uptime ≥ 99.8% during term"
  - id: R-08
    quality: Maintainability
    desc: "≤ 20% mean time to repair (MTTR) per incident"
target_cloud:
  provider: GCP
  regions:
    - us-east1
    - europe-west4
""",
    "Clinic Management System": """
title: Clinic Management System
description: >
  Manages patient admissions, electronic medical records, scheduling,
  and billing for medium-sized hospitals and clinics.
domain: Healthcare / Clinical IT
constraints:
  - "HIPAA & GDPR compliance"
  - "High availability 99.99%"
  - "Data retention ≥ 10 years"
functional_requirements:
  - id: R-01
    desc: "Patient registration & demographic capture"
  - id: R-02
    desc: "Appointment scheduling with resource clash checks"
  - id: R-03
    desc: "Electronic Medical Record (EMR) with audit trail"
  - id: R-04
    desc: "Billing & insurance claim submission"
nonfunctional_requirements:
  - id: R-05
    quality: Security
    desc: "Access via multi-factor auth; AES-256 at rest"
  - id: R-06
    quality: Availability
    desc: "Uptime ≥ 99.99% (active-active)"
  - id: R-07
    quality: Performance
    desc: "EMR screen load < 1 s P95"
  - id: R-08
    quality: Interoperability
    desc: "HL7 FHIR APIs for lab & imaging systems"
target_cloud:
  provider: Hybrid
  regions:
    - on-prem-k8s
    - eu-central-1
"""
}

In [85]:
import json
from datetime import datetime
from typing import Dict, Any

# Assume all other functions (get_llm, evaluate_*, check_c4_completeness) are defined
# and that ModelName is a defined type.

def run_full_evaluation(
    system_brief: str,
    c4_model: Dict,
    judge_model_name: Any,
    temperature: float = 0.0
) -> Dict[str, Any]:
    """
    Evaluate a C4 model in three passes and return the combined report.

    Pass 1 runs the whole-model structural checks. Pass 2 scores each C4
    level that is present in ``c4_model`` ("context", "containers",
    "components"). Pass 3 runs the architect and security critiques.
    The finished report is printed as indented JSON and then returned.

    Args:
        system_brief: Text of the system brief handed to the judge-backed metrics.
        c4_model: C4 model dictionary; each level is evaluated only if its key exists.
        judge_model_name: Identifier passed to ``get_llm`` and recorded in the metadata.
        temperature: Sampling temperature passed to ``get_llm``.

    Returns:
        Dict[str, Any]: The evaluation report.
    """
    llm = get_llm(model_name=judge_model_name, temperature=temperature)
    report = {
        'evaluationMetadata': {
            "judgeModel": judge_model_name,
            "judgeModelTemperature": temperature,
            "evaluationTimestamp": datetime.now().isoformat()
        }
    }

    # ---- Pass 1: structural checks over the whole model ----
    print("--- Running Holistic Structural Checks ---")
    report.update({
        'compilationSuccess': evaluate_compilation_success(c4_model),
        'abstractionAdherence': evaluate_abstraction_adherence(c4_model),
        'missingInformation': check_c4_completeness(c4_model),
        'emergentNamingConsistency': evaluate_emergent_naming_consistency(c4_model),
        'crossLevelConsistency': evaluate_cross_level_consistency(c4_model),
    })

    # ---- Pass 2: per-level evaluations ----
    print("\n--- Running Level-Specific Evaluations ---")

    if "context" in c4_model:
        print("  - Evaluating Context Level...")
        diagram = c4_model["context"].get("diagram")
        # The context diagram is judged against the system brief itself.
        report['contextEvaluation'] = {
            'semanticConsistency': evaluate_semantic_consistency(system_brief, c4_model, llm),
            'qualitativeRubric': evaluate_qualitative_rubric(diagram, "Context Diagram", system_brief, llm),
        } if diagram else {}

    if "containers" in c4_model:
        print("  - Evaluating Container Level...")
        container_level = c4_model["containers"]
        diagram = container_level.get("diagram")
        definition = container_level.get("yaml_definition")
        # The container diagram is checked against its own YAML definition;
        # the rubric still receives the brief as high-level context.
        report['containerEvaluation'] = {
            'definitionalConsistency': evaluate_definitional_consistency(definition, diagram, "containers"),
            'qualitativeRubric': evaluate_qualitative_rubric(diagram, "Container Diagram", system_brief, llm),
        } if (diagram and definition) else {}

    if "components" in c4_model:
        print("  - Evaluating Component Level(s)...")
        per_component = {}
        for name, artifacts in c4_model["components"].items():
            diagram = artifacts.get("diagram")
            definition = artifacts.get("yaml_definition")
            if not (diagram and definition):
                # Components lacking a diagram or a definition are left out.
                continue
            per_component[name] = {
                'definitionalConsistency': evaluate_definitional_consistency(definition, diagram, "components"),
                'qualitativeRubric': evaluate_qualitative_rubric(diagram, f"Component: {name}", system_brief, llm),
            }
        report['componentEvaluations'] = per_component

    # ---- Pass 3: whole-model expert critiques (run last) ----
    print("\n--- Running Holistic Expert Critiques ---")
    report['architectCritique'] = evaluate_architect_critique(system_brief, c4_model, llm)
    report['securityAssessment'] = evaluate_security_assessment(system_brief, c4_model, llm)

    rule = "=" * 50
    print("\n\n" + rule)
    print(f"📋 FINAL EVALUATION REPORT (Judge: {judge_model_name}) 📋")
    print(rule + "\n")
    print(json.dumps(report, indent=2))

    return report

In [86]:
import os
import glob
import yaml # You might need to run: pip install pyyaml

def load_c4_model_from_artifacts(artifacts_dir: str) -> dict:
    """
    Loads C4 model artifacts from a directory into a dictionary.

    Files read (each one is optional; a missing or unreadable file loads as None):
        1_context_analysis.md, 1_context_definition.yaml, 1_context_diagram.puml
        2_container_analysis.md, 2_container_definition.yaml, 2_container_diagram.puml
        3_components/<name>_analysis.md, <name>_definition.yaml, <name>_diagram.puml

    Each component entry is keyed by the top-level ``container`` value of its
    definition YAML when that value is a non-empty string; otherwise the
    ``<name>`` prefix of the file is used as the key.

    Args:
        artifacts_dir (str): The path to the 'c4_artifacts' directory.

    Returns:
        dict: The reconstructed C4 model with the keys "context", "containers"
        and "components". A level is dropped when nothing was loaded for it.
        An empty dict is returned when ``artifacts_dir`` is not a directory.
    """
    if not os.path.isdir(artifacts_dir):
        print(f"Warning: Artifacts directory not found at {artifacts_dir}")
        return {}

    def read_file_content(filepath):
        """Return the file's text, or None when it is missing or unreadable."""
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                return f.read()
        except FileNotFoundError:
            return None  # Optional artifact: absence is not an error.
        except Exception as e:
            print(f"Error reading {filepath}: {e}")
            return None

    c4_model = {"context": {}, "containers": {}, "components": {}}

    # --- Context and container levels share the same three-file layout ---
    for level, prefix in (("context", "1_context"), ("containers", "2_container")):
        c4_model[level] = {
            "analysis": read_file_content(os.path.join(artifacts_dir, f"{prefix}_analysis.md")),
            "yaml_definition": read_file_content(os.path.join(artifacts_dir, f"{prefix}_definition.yaml")),
            "diagram": read_file_content(os.path.join(artifacts_dir, f"{prefix}_diagram.puml")),
        }

    # --- Component level: one entry per *_definition.yaml file ---
    component_dir = os.path.join(artifacts_dir, "3_components")
    if os.path.isdir(component_dir):
        # glob returns matches in filesystem order; sort so the component
        # order (and therefore the evaluation order) is reproducible.
        yaml_files = sorted(glob.glob(os.path.join(component_dir, "*_definition.yaml")))

        for yaml_path in yaml_files:
            # e.g. "my_api_container_definition.yaml" -> "my_api_container"
            safe_container_name = os.path.basename(yaml_path).replace('_definition.yaml', '')

            # Read the definition once and reuse the text for parsing.
            yaml_text = read_file_content(yaml_path)

            # Prefer the name declared under the top-level 'container' key;
            # fall back to the file-derived name when it is absent or unusable.
            original_container_name = safe_container_name
            if yaml_text is not None:
                try:
                    comp_data = yaml.safe_load(yaml_text)
                except Exception as e:
                    print(f"Could not parse container name from {yaml_path}: {e}")
                else:
                    declared = comp_data.get('container') if isinstance(comp_data, dict) else None
                    # Only a non-empty string is a safe dictionary key here;
                    # lists/dicts would raise TypeError, None would be meaningless.
                    if isinstance(declared, str) and declared.strip():
                        original_container_name = declared

            c4_model["components"][original_container_name] = {
                "analysis": read_file_content(os.path.join(component_dir, f"{safe_container_name}_analysis.md")),
                "yaml_definition": yaml_text,
                "diagram": read_file_content(os.path.join(component_dir, f"{safe_container_name}_diagram.puml"))
            }

    # Drop levels for which nothing at all was loaded.
    c4_model = {k: v for k, v in c4_model.items() if v and any(val is not None for val in v.values())}

    return c4_model

In [87]:


def re_evaluate_all_test_cases(base_results_dir: str, judge_model: Any, briefs_dict: Dict[str, str]):
    """
    Re-run the evaluation for every stored test case under ``base_results_dir``.

    Every directory named 'c4_artifacts' found recursively below
    ``base_results_dir`` is treated as one test case; the test case name is the
    name of its parent directory. For each test case the matching system brief
    is looked up, the C4 model is loaded with ``load_c4_model_from_artifacts``,
    ``run_full_evaluation`` is called, and the resulting report is written as
    JSON next to the 'c4_artifacts' directory.

    Brief matching is case-insensitive and ignores every non-alphanumeric
    character. When several brief names occur in the directory name, the
    longest one wins, so "Foo" cannot shadow "Foo Pro". Brief names without any
    alphanumeric character never match.

    Args:
        base_results_dir: Root directory that is searched for test cases.
        judge_model: Judge model identifier passed to ``run_full_evaluation``;
            its string form is also used in the report file name.
        briefs_dict: Mapping of brief name to brief text.

    Returns:
        None. Test cases without a matching brief or a loadable model are skipped.
    """
    print(f"🚀 Starting re-evaluation process with judge: {judge_model} 🚀")

    def normalize(text: str) -> str:
        """Lower-case ``text`` and strip everything that is not a letter or digit."""
        # "NextGen Point-of-Sale (POS)" -> "nextgenpointofsalepos"
        return re.sub(r'[^a-zA-Z0-9]', '', text).lower()

    search_pattern = os.path.join(base_results_dir, "**", "c4_artifacts")
    # Sorted so the processing order does not depend on the filesystem.
    artifact_dirs = sorted(glob.glob(search_pattern, recursive=True))

    if not artifact_dirs:
        print(f"❌ No 'c4_artifacts' directories found in '{base_results_dir}'.")
        return

    print(f"Found {len(artifact_dirs)} test cases to re-evaluate.")

    # Normalise the brief names once instead of once per test case.
    normalized_briefs = [
        (normalize(brief_name), brief_name, brief_content)
        for brief_name, brief_content in briefs_dict.items()
    ]

    # Only '-' -> '_' is applied to ordinary model names (unchanged file names);
    # any remaining character that is unsafe in a file name (e.g. '/' or ':')
    # is replaced as well so the report path cannot be broken by the model id.
    safe_judge_name = re.sub(r'[^A-Za-z0-9._]', '_', str(judge_model).replace('-', '_'))
    report_filename = f"evaluation_report_judged_by_{safe_judge_name}.json"

    for artifacts_dir in artifact_dirs:
        test_case_path = os.path.dirname(artifacts_dir)
        test_case_name = os.path.basename(test_case_path)

        print(f"\n{'='*60}\nProcessing Test Case: {test_case_name}\n{'='*60}")

        # 1. Find the corresponding system brief: the longest brief name that is
        #    contained in the directory name (ties keep the dictionary order).
        normalized_case = normalize(test_case_name)
        candidates = [entry for entry in normalized_briefs if entry[0] and entry[0] in normalized_case]
        best_match = max(candidates, key=lambda entry: len(entry[0]), default=None)

        system_brief = None
        matched_brief_name = None
        if best_match is not None:
            _, matched_brief_name, system_brief = best_match

        if system_brief:
            print(f"✅ Found matching system brief: '{matched_brief_name}'")
        else:
            print(f"❌ CRITICAL: No matching system brief found for test case '{test_case_name}'. Skipping.")
            continue

        # 2. Load the C4 model
        c4_model = load_c4_model_from_artifacts(artifacts_dir)
        if not c4_model:
            print(f"❌ CRITICAL: Could not load C4 model from {artifacts_dir}. Skipping.")
            continue
        print("✅ C4 model loaded successfully.")

        # 3. Run the evaluation
        new_evaluation_report = run_full_evaluation(
            system_brief=system_brief,
            c4_model=c4_model,
            judge_model_name=judge_model
        )

        # 4. Save the new report next to the artifacts directory
        report_path = os.path.join(test_case_path, report_filename)
        with open(report_path, 'w', encoding='utf-8') as f:
            json.dump(new_evaluation_report, f, indent=4)
        print(f"✅ Successfully saved new evaluation report to: {report_path}")


if __name__ == "__main__":
    # Re-score the stored results of every provider with a single judge model.
    # `system_briefs` is expected to be defined by an earlier cell.
    JUDGE_MODEL_FOR_REEVALUATION = "gemini-2.5-flash-preview-05-20"  # The judge model to use for re-evaluation
    providers = ['gemini', 'openai', 'grok']

    for provider in providers:
        base_dir = f"evaluation_results_{provider}"
        if not os.path.isdir(base_dir):
            # Nothing stored for this provider: report it and move on.
            print(f"\nINFO: Directory for provider '{provider}' not found. Skipping.")
            continue

        print(f"\n\n{'#'*80}\n# Starting provider: {provider.upper()}\n{'#'*80}")
        re_evaluate_all_test_cases(base_dir, JUDGE_MODEL_FOR_REEVALUATION, system_briefs)



################################################################################
# Starting provider: GEMINI
################################################################################
🚀 Starting re-evaluation process with judge: gemini-2.5-flash-preview-05-20 🚀
Found 15 test cases to re-evaluate.

Processing Test Case: Clinic Management System_20250613-115753-clinic-management-system-35ef7531
✅ Found matching system brief: 'Clinic Management System'
✅ C4 model loaded successfully.
--- ⚙️  Instantiating model: gemini-2.5-flash-preview-05-20 ---
--- Running Holistic Structural Checks ---
🤖 Evaluating Metric: PlantUML Compilation Success...
  - ✅ SUCCESS: '1_Context' compiled.
  - ✅ SUCCESS: '2_Containers' compiled.
🤖 Evaluating Metric: C4 Abstraction Adherence...
🤖 Evaluating Metric: C4 Model Completeness...
🤖 Evaluating Metric 6 (New): Emergent Naming Consistency...
🤖 Evaluating Metric: Cross-Level Consistency Check...

--- Running Level-Specific Evaluations ---
  - Evaluating C

Key 'parameters' is not supported in schema, ignoring



--- Running Holistic Expert Critiques ---
⚖️ Evaluating Metric 7: Principal Architect's Critique...


Key 'parameters' is not supported in schema, ignoring


🛡️  Evaluating Metric 8: Security 'Red Team' Assessment...


📋 FINAL EVALUATION REPORT (Judge: gemini-2.5-flash-preview-05-20) 📋

{
  "evaluationMetadata": {
    "judgeModel": "gemini-2.5-flash-preview-05-20",
    "judgeModelTemperature": 0.0,
    "evaluationTimestamp": "2025-06-13T17:08:50.534119"
  },
  "compilationSuccess": {
    "metric": "Compilation Success Rate",
    "score": 100.0,
    "successful": 2,
    "total": 2,
    "details": [
      {
        "source": "1_Context",
        "status": "Compiled",
        "error": null
      },
      {
        "source": "2_Containers",
        "status": "Compiled",
        "error": null
      }
    ]
  },
  "abstractionAdherence": {
    "metric": "Abstraction Adherence",
    "score": 100.0,
    "details": {
      "Context": {
        "status": "Pass",
        "reason": "Adheres to abstraction level."
      },
      "Containers": {
        "status": "Pass",
        "reason": "Adheres to abstraction level."
      }
    }
  },
  "missingInfor

Key 'parameters' is not supported in schema, ignoring



--- Running Holistic Expert Critiques ---
⚖️ Evaluating Metric 7: Principal Architect's Critique...


Key 'parameters' is not supported in schema, ignoring


🛡️  Evaluating Metric 8: Security 'Red Team' Assessment...


📋 FINAL EVALUATION REPORT (Judge: gemini-2.5-flash-preview-05-20) 📋

{
  "evaluationMetadata": {
    "judgeModel": "gemini-2.5-flash-preview-05-20",
    "judgeModelTemperature": 0.0,
    "evaluationTimestamp": "2025-06-13T17:09:46.598520"
  },
  "compilationSuccess": {
    "metric": "Compilation Success Rate",
    "score": 75.0,
    "successful": 3,
    "total": 4,
    "details": [
      {
        "source": "1_Context",
        "status": "Compiled",
        "error": null
      },
      {
        "source": "2_Containers",
        "status": "Compiled",
        "error": null
      },
      {
        "source": "3_Component_account_management",
        "status": "Compiled",
        "error": null
      },
      {
        "source": "3_Component_catalog_search",
        "status": "Failed - Empty",
        "error": "Diagram content was empty or whitespace."
      }
    ]
  },
  "abstractionAdherence": {
    "metric": "Abstraction Adhe

Key 'parameters' is not supported in schema, ignoring



--- Running Holistic Expert Critiques ---
⚖️ Evaluating Metric 7: Principal Architect's Critique...


Key 'parameters' is not supported in schema, ignoring


🛡️  Evaluating Metric 8: Security 'Red Team' Assessment...


📋 FINAL EVALUATION REPORT (Judge: gemini-2.5-flash-preview-05-20) 📋

{
  "evaluationMetadata": {
    "judgeModel": "gemini-2.5-flash-preview-05-20",
    "judgeModelTemperature": 0.0,
    "evaluationTimestamp": "2025-06-13T17:11:04.728735"
  },
  "compilationSuccess": {
    "metric": "Compilation Success Rate",
    "score": 50.0,
    "successful": 3,
    "total": 6,
    "details": [
      {
        "source": "1_Context",
        "status": "Compiled",
        "error": null
      },
      {
        "source": "2_Containers",
        "status": "Compiled",
        "error": null
      },
      {
        "source": "3_Component_inventory_management",
        "status": "Failed - Empty",
        "error": "Diagram content was empty or whitespace."
      },
      {
        "source": "3_Component_reporting",
        "status": "Failed - Empty",
        "error": "Diagram content was empty or whitespace."
      },
      {
        "source": "3

KeyboardInterrupt: 

In [90]:
import os
import glob
import numpy as np
import pandas as pd
from collections import defaultdict

def analyze_and_save_to_pandas(
    base_search_path: str = "evaluation_results_*",
    output_filename: str = "component_analysis_summary.csv",
):
    """
    Summarise how many component definitions each experiment produced.

    Every 'c4_artifacts' directory below ``base_search_path`` is one test
    case. Its component count is the number of '*_definition.yaml' files in
    its '3_components' sub-directory (0 when that directory is absent). Counts
    are grouped by provider and experiment, summarised, printed as a pandas
    DataFrame and written to ``output_filename`` as CSV.

    The provider is the directory matched by the last component of
    ``base_search_path`` (with the 'evaluation_results_' prefix removed) and
    the experiment is the directory directly below it. This holds for bare,
    './'-prefixed and absolute search paths alike.

    Args:
        base_search_path (str): The base pattern to find provider directories.
        output_filename (str): Path of the CSV file to write.

    Returns:
        None. Nothing is written when no usable artifact directory is found.
    """
    print("🚀 Starting analysis... Will save results to a pandas DataFrame.")

    # --- 1. Data collection ---
    search_pattern = os.path.join(base_search_path, "**", "c4_artifacts")
    # Sorted so 'raw_counts' is reproducible across filesystems.
    artifact_dirs = sorted(glob.glob(search_pattern, recursive=True))

    if not artifact_dirs:
        print("❌ No 'c4_artifacts' directories found. Please check your path.")
        return

    print(f"Found {len(artifact_dirs)} artifact directories to analyze.")

    # Position of the provider directory inside a matched path. Derived from
    # the search pattern itself so that prefixes such as './' or an absolute
    # root do not shift which path component is read.
    provider_idx = len(os.path.normpath(base_search_path).split(os.sep)) - 1

    experiment_counts = defaultdict(lambda: defaultdict(list))
    for artifact_dir in artifact_dirs:
        path_parts = os.path.normpath(artifact_dir).split(os.sep)
        # Need at least <provider>/<experiment>/c4_artifacts; otherwise the
        # "experiment" would be the artifacts directory itself.
        if len(path_parts) < provider_idx + 3:
            print(f"⚠️ Could not parse path: {artifact_dir}. Skipping.")
            continue

        provider = path_parts[provider_idx].replace("evaluation_results_", "")
        experiment = path_parts[provider_idx + 1]

        components_dir = os.path.join(artifact_dir, '3_components')
        component_count = 0
        if os.path.isdir(components_dir):
            component_count = len(glob.glob(os.path.join(components_dir, '*_definition.yaml')))
        experiment_counts[provider][experiment].append(component_count)

    # --- 2. Summarise per provider / experiment ---
    if not experiment_counts:
        print("No data was successfully collected.")
        return

    analysis_data = []
    for provider in sorted(experiment_counts):
        for experiment in sorted(experiment_counts[provider]):
            counts = experiment_counts[provider][experiment]
            analysis_data.append({
                "provider": provider,
                "experiment": experiment,
                "mean_components": np.mean(counts),
                # np.std defaults to the population standard deviation (ddof=0).
                "std_dev_components": np.std(counts),
                "min_components": min(counts),
                "max_components": max(counts),
                "test_case_count": len(counts),
                "raw_counts": counts  # Keep the raw data for potential deeper analysis
            })

    if not analysis_data:
        print("Analysis yielded no data to save.")
        return

    # --- 3. Create DataFrame, display and save ---
    column_order = [
        "provider", "experiment", "mean_components", "std_dev_components",
        "min_components", "max_components", "test_case_count", "raw_counts"
    ]
    df = pd.DataFrame(analysis_data, columns=column_order)

    print("\n\n" + "="*80)
    print("📊 Component Generation Analysis (Pandas DataFrame) 📊")
    print("="*80)
    # Scope the display tweaks to this printout instead of changing the
    # notebook-wide pandas options as a side effect.
    with pd.option_context('display.max_rows', 500, 'display.width', 120):
        print(df)
    print("="*80)

    df.to_csv(output_filename, index=False)

    print(f"\n✅ Successfully saved the analysis to '{output_filename}'")


if __name__ == "__main__":
    # Requires pandas and numpy (pip install pandas numpy).
    analyze_and_save_to_pandas(base_search_path="evaluation_results_*")

🚀 Starting analysis... Will save results to a pandas DataFrame.
Found 60 artifact directories to analyze.


📊 Component Generation Analysis (Pandas DataFrame) 📊
   provider                            experiment  mean_components  std_dev_components  min_components  \
0    gemini  Gemini15Flash_Collaborative_1_Rounds              1.8            1.600000               0   
1    gemini  Gemini15Flash_Collaborative_3_Rounds              2.8            2.039608               0   
2    gemini                  Gemini15Flash_Simple              5.4            1.356466               4   
3      grok      Grok3mini_Collaborative_1_Rounds              5.2            0.400000               5   
4      grok      Grok3mini_Collaborative_3_Rounds              5.4            0.489898               5   
5      grok                      Grok3mini_Simple              4.6            0.489898               4   
6    openai          GPT4o_Collaborative_1_Rounds              7.2            1.720465           

In [91]:
import os
import glob
import numpy as np
import pandas as pd
from collections import defaultdict

def generate_llm_readable_table(base_search_path: str = "evaluation_results_*"):
    """
    Print per-experiment component counts as a Markdown table.

    Every 'c4_artifacts' directory below ``base_search_path`` is one test
    case. Its component count is the number of '*_definition.yaml' files in
    its '3_components' sub-directory (0 when that directory is absent). The
    mean (rounded to two decimals), minimum and maximum count are reported per
    provider and experiment.

    The provider is the directory matched by the last component of
    ``base_search_path`` (with the 'evaluation_results_' prefix removed) and
    the experiment is the directory directly below it. This holds for bare,
    './'-prefixed and absolute search paths alike.

    The table is rendered with ``DataFrame.to_markdown``; when the optional
    'tabulate' package is not installed, a plain pipe-delimited table with the
    same columns is printed instead.

    Args:
        base_search_path (str): The base pattern to find provider directories.

    Returns:
        None. The table is only printed.
    """
    print("🚀 Analyzing artifacts to generate an LLM-readable table...")

    # --- 1. Data collection ---
    search_pattern = os.path.join(base_search_path, "**", "c4_artifacts")
    artifact_dirs = sorted(glob.glob(search_pattern, recursive=True))

    if not artifact_dirs:
        print("❌ No 'c4_artifacts' directories found.")
        return

    # Position of the provider directory inside a matched path. Derived from
    # the search pattern itself so that prefixes such as './' or an absolute
    # root do not shift which path component is read.
    provider_idx = len(os.path.normpath(base_search_path).split(os.sep)) - 1

    experiment_counts = defaultdict(lambda: defaultdict(list))
    for artifact_dir in artifact_dirs:
        path_parts = os.path.normpath(artifact_dir).split(os.sep)
        # Need at least <provider>/<experiment>/c4_artifacts; otherwise the
        # "experiment" would be the artifacts directory itself.
        if len(path_parts) < provider_idx + 3:
            print(f"⚠️ Could not parse path: {artifact_dir}. Skipping.")
            continue

        provider = path_parts[provider_idx].replace("evaluation_results_", "")
        experiment = path_parts[provider_idx + 1]

        components_dir = os.path.join(artifact_dir, '3_components')
        component_count = 0
        if os.path.isdir(components_dir):
            component_count = len(glob.glob(os.path.join(components_dir, '*_definition.yaml')))
        experiment_counts[provider][experiment].append(component_count)

    # --- 2. Summarise per provider / experiment ---
    if not experiment_counts:
        return

    analysis_data = []
    for provider in sorted(experiment_counts):
        for experiment in sorted(experiment_counts[provider]):
            counts = experiment_counts[provider][experiment]
            analysis_data.append({
                "provider": provider,
                "experiment": experiment,
                "mean_components": np.mean(counts),
                "min_components": min(counts),
                "max_components": max(counts),
            })

    if not analysis_data:
        print("Analysis yielded no data to display.")
        return

    df = pd.DataFrame(analysis_data)
    # Round the mean for cleaner display
    df['mean_components'] = df['mean_components'].round(2)

    # --- 3. Generate and display the table ---
    table_df = df[['provider', 'experiment', 'mean_components', 'min_components', 'max_components']]

    try:
        markdown_table = table_df.to_markdown(index=False)
    except ImportError:
        # pandas delegates to_markdown to the optional 'tabulate' package.
        # Do not lose the finished analysis over a missing formatter: emit a
        # minimal pipe table with the same columns instead.
        header = "| " + " | ".join(str(col) for col in table_df.columns) + " |"
        divider = "|" + "|".join("---" for _ in table_df.columns) + "|"
        body = [
            "| " + " | ".join(str(value) for value in row) + " |"
            for row in table_df.itertuples(index=False, name=None)
        ]
        markdown_table = "\n".join([header, divider, *body])

    print("\n\n" + "="*80)
    print("📋 LLM-Readable Component Analysis (Markdown Format) 📋")
    print("="*80)
    print("You can copy and paste the table below directly into another LLM prompt.")
    print("-"*(80))
    print(markdown_table)
    print("-"*(80))


if __name__ == "__main__":
    # Requires pandas and numpy; DataFrame.to_markdown additionally needs the
    # 'tabulate' package (pip install pandas numpy tabulate).
    generate_llm_readable_table(base_search_path="evaluation_results_*")

🚀 Analyzing artifacts to generate an LLM-readable table...


📋 LLM-Readable Component Analysis (Markdown Format) 📋
You can copy and paste the table below directly into another LLM prompt.
--------------------------------------------------------------------------------
| provider   | experiment                           |   mean_components |   min_components |   max_components |
|:-----------|:-------------------------------------|------------------:|-----------------:|-----------------:|
| gemini     | Gemini15Flash_Collaborative_1_Rounds |               1.8 |                0 |                4 |
| gemini     | Gemini15Flash_Collaborative_3_Rounds |               2.8 |                0 |                5 |
| gemini     | Gemini15Flash_Simple                 |               5.4 |                4 |                8 |
| grok       | Grok3mini_Collaborative_1_Rounds     |               5.2 |                5 |                6 |
| grok       | Grok3mini_Collaborative_3_Rounds     |      