In [1]:
from main import load_alert_from_file
from llms.gemini import create_gemini_client
from utils.tools_registry import get_analysis_tools
from langchain_mcp_adapters.client import MultiServerMCPClient


# Path to the example alert payload (a JSON file) analysed by this notebook.
alert_file = "examples/alerts/etcd-down.json"
# Load the alert via the project's own loader; later cells serialise
# `alert_data` with json.dumps to build the analyst prompt and pass it to Slack.
alert_data = load_alert_from_file(alert_file)

In [2]:
llm = create_gemini_client()
mcp_client = MultiServerMCPClient(
    {
        "kubectl-ai": {
            "command": "kubectl-ai",
            "args": ["--mcp-server"],
            "transport": "stdio",
        },
    }
)
tools_mcp = await mcp_client.get_tools()
tools = tools_mcp + get_analysis_tools()

In [3]:
from pprint import pprint

# Inspect the combined tool list (MCP tools + analysis tools) that the agents
# will receive. NOTE(review): the full tool reprs are long; consider listing
# only each tool's name/description if this output is kept in the notebook.
pprint(tools)

[StructuredTool(name='bash', description='Executes a bash command. Use this tool only when you need to execute a shell command.', args_schema={'type': 'object', 'properties': {'command': {'type': 'string', 'description': 'The bash command to execute.'}, 'modifies_resource': {'type': 'string', 'description': 'Whether the command modifies a kubernetes resource.\nPossible values:\n- "yes" if the command modifies a resource\n- "no" if the command does not modify a resource\n- "unknown" if the command\'s effect on the resource is unknown\n'}}}, metadata={'title': None, 'readOnlyHint': None, 'destructiveHint': None, 'idempotentHint': None, 'openWorldHint': None}, response_format='content_and_artifact', coroutine=<function convert_mcp_tool_to_langchain_tool.<locals>.call_tool at 0x100f22480>),
 StructuredTool(name='kubectl', description="Executes a kubectl command against the user's Kubernetes cluster. Use this tool only when you need to query or modify the state of the user's Kubernetes clus

In [4]:
from agents.analyst_agent import AnalystAgent
from agents.planner_agent import PlannerAgent
from agents.executor_agent import ExecutorAgent

# Build the three agents. They are configured identically: the same LLM, the
# same tool list, and debug output switched on.
shared_agent_kwargs = {"llm": llm, "tools": tools, "debug": True}

analyst_agent = AnalystAgent(**shared_agent_kwargs)
planner_agent = PlannerAgent(**shared_agent_kwargs)
executor_agent = ExecutorAgent(**shared_agent_kwargs)

In [5]:
from prompts.analyst_prompt import ANALYST_HUMAN_PROMPT
import json

# Render the alert as indented JSON; ensure_ascii=False keeps any non-ASCII
# characters literal instead of emitting \uXXXX escapes.
alert_json = json.dumps(alert_data, indent=2, ensure_ascii=False)

# Substitute the serialised alert into the analyst's human-prompt template.
human_prompt = ANALYST_HUMAN_PROMPT.format(alert_data=alert_json)

In [6]:
# Show the rendered prompt exactly as the analyst agent will receive it.
# print() is used (rather than a bare expression) so newlines are rendered
# instead of being shown as escaped "\n" in the string repr.
print(human_prompt)


**INCIDENT ALERT FOR DIAGNOSIS:**
{
  "receiver": "team-etcd-pager",
  "status": "firing",
  "alerts": [
    {
      "status": "firing",
      "labels": {
        "alertname": "EtcdNodeDown",
        "instance": "etcd-0.example.com:2379",
        "severity": "critical",
        "cluster": "production-etcd-cluster"
      },
      "annotations": {
        "summary": "Etcd node etcd-0.example.com is down or unreachable",
        "description": "The etcd member etcd-0.example.com (instance: etcd-0.example.com:2379) is not reachable or has stopped responding to health checks for more than 1 minute. This indicates a potential issue with the etcd cluster's quorum."
      },
      "startsAt": "2025-07-06T13:30:00Z",
      "generatorURL": "http://prometheus.example.com/graph?g0.expr=up%7Betcd%3D%22true%22%2Cinstance%3D%22etcd-0.example.com%3A2379%22%7D%3D0"
    }
  ],
  "groupLabels": {
    "alertname": "EtcdNodeDown",
    "cluster": "production-etcd-cluster"
  },
  "commonLabels": {
    "aler

In [7]:
# Step 1: send the rendered alert prompt to the analyst agent as a single
# user message and keep the returned state in `result`.
print("🔍 Running analysis...")
analysis_input = {"messages": [{"role": "user", "content": human_prompt}]}
result = analyst_agent.agent.invoke(analysis_input)

# Only the heading is printed here; the parsed result itself is displayed by
# the following cell.
print("Analysis Result:")

🔍 Running analysis...
[36;1m[1;3m[-1:checkpoint][0m [1mState at the end of step -1:
[0m{'messages': []}
[36;1m[1;3m[0:tasks][0m [1mStarting 1 task for step 0:
[0m- [32;1m[1;3m__start__[0m -> {'messages': [{'content': '\n'
                          '**INCIDENT ALERT FOR DIAGNOSIS:**\n'
                          '{\n'
                          '  "receiver": "team-etcd-pager",\n'
                          '  "status": "firing",\n'
                          '  "alerts": [\n'
                          '    {\n'
                          '      "status": "firing",\n'
                          '      "labels": {\n'
                          '        "alertname": "EtcdNodeDown",\n'
                          '        "instance": "etcd-0.example.com:2379",\n'
                          '        "severity": "critical",\n'
                          '        "cluster": "production-etcd-cluster"\n'
                          '      },\n'
                          '      "annotations": {\

In [8]:
import json

# Take the text of the last message in the agent's result. The reply is
# expected to be a JSON document, optionally wrapped in a Markdown code fence.
content = result["messages"][-1].content

# Trim surrounding whitespace first: a leading newline before the opening
# fence, or trailing whitespace after the closing one, would otherwise defeat
# the startswith/endswith checks below and leave a fence in the payload.
content = content.strip()

# Remove the opening fence if present. The tagged form is tried first so that
# "```json" is removed as a whole; the bare "```" form covers replies that are
# fenced without a language tag.
for opening_fence in ("```json", "```"):
    if content.startswith(opening_fence):
        content = content[len(opening_fence) :].strip()
        break

# Remove the closing fence if present.
if content.endswith("```"):
    content = content[:-3].strip()

# Parse the JSON; json.loads raises json.JSONDecodeError if the reply is not
# valid JSON once the fence has been removed.
parsed_result = json.loads(content)
print(json.dumps(parsed_result, indent=2))

{
  "root_cause": "etcd-0.example.com node is unreachable or has failed. This impacts the etcd cluster's quorum, potentially leading to Kubernetes control plane unavailability.",
  "confidence_level": "High",
  "severity": "Critical",
  "affected_components": [
    "etcd-0.example.com",
    "Kubernetes control plane (potentially)"
  ],
  "investigation_summary": "The alert indicates that the etcd member etcd-0.example.com is down or unreachable.  This is a critical issue as etcd stores the Kubernetes cluster state.  The following steps were taken:\n\n1. **Initial Assessment:** Checked the overall cluster health using `kubectl cluster-info` and `kubectl get nodes -o wide`.  This would reveal if other nodes are also affected.\n2. **etcd Status:** Executed `kubectl get pods -n kube-system | grep etcd` to check the status of etcd pods.  A non-running or crashing pod would confirm the alert.\n3. **Node Status:**  Used `kubectl describe node etcd-0.example.com` to examine the node's status, 

In [9]:
import json

# Re-display the raw alert payload as indented JSON (ensure_ascii=False keeps
# non-ASCII characters literal) for reference before it is sent to Slack.
print(json.dumps(alert_data, indent=2, ensure_ascii=False))

{
  "receiver": "team-etcd-pager",
  "status": "firing",
  "alerts": [
    {
      "status": "firing",
      "labels": {
        "alertname": "EtcdNodeDown",
        "instance": "etcd-0.example.com:2379",
        "severity": "critical",
        "cluster": "production-etcd-cluster"
      },
      "annotations": {
        "summary": "Etcd node etcd-0.example.com is down or unreachable",
        "description": "The etcd member etcd-0.example.com (instance: etcd-0.example.com:2379) is not reachable or has stopped responding to health checks for more than 1 minute. This indicates a potential issue with the etcd cluster's quorum."
      },
      "startsAt": "2025-07-06T13:30:00Z",
      "generatorURL": "http://prometheus.example.com/graph?g0.expr=up%7Betcd%3D%22true%22%2Cinstance%3D%22etcd-0.example.com%3A2379%22%7D%3D0"
    }
  ],
  "groupLabels": {
    "alertname": "EtcdNodeDown",
    "cluster": "production-etcd-cluster"
  },
  "commonLabels": {
    "alertname": "EtcdNodeDown",
    "severi

In [10]:
from utils.slack_service import SlackService


# Create the project's Slack client. NOTE(review): constructed with no
# arguments, so credentials/channel presumably come from the environment or
# project configuration — confirm.
slack_service = SlackService()
# Post the original alert together with the parsed analysis. This is the last
# expression in the cell, so its return value is displayed as the cell output
# (a string here; presumably the Slack message timestamp — TODO confirm).
slack_service.send_analysis_result(alert_data=alert_data, analysis_result=parsed_result)

'1752050489.086459'