In [7]:
import random  
import json  
from typing import Dict, List, Any  
from autogen import ConversableAgent, GroupChat, GroupChatManager, LLMConfig  
  
# LLM配置  
llm_config = LLMConfig(api_type="ollama", model="llama3.1")

In [8]:
# 模拟故障数据  
SIMULATED_FAULT_DATA = {  
    "logs": """  
2024-06-03 10:15:23 ERROR [UserService] Database connection timeout after 30s  
2024-06-03 10:15:24 WARN [OrderService] Retry attempt 3/3 failed for order #12345  
2024-06-03 10:15:25 ERROR [PaymentService] Payment gateway returned 500 Internal Server Error  
2024-06-03 10:15:26 ERROR [UserService] Connection pool exhausted, max connections: 100  
2024-06-03 10:15:27 FATAL [OrderService] Unable to process order queue, backing off for 60s  
2024-06-03 10:15:28 ERROR [NotificationService] Failed to send email notification: SMTP timeout  
2024-06-03 10:15:30 ERROR [UserService] Database connection timeout after 30s  
2024-06-03 10:15:32 ERROR [PaymentService] Payment processing failed for transaction #67890  
""",  
      
    "traces": {  
        "trace_id": "abc123-def456-ghi789",  
        "spans": [  
            {  
                "service": "api-gateway",  
                "operation": "POST /api/orders",  
                "duration_ms": 5000,  
                "status": "ERROR",  
                "error": "Downstream service timeout"  
            },  
            {  
                "service": "order-service",   
                "operation": "create_order",  
                "duration_ms": 4800,  
                "status": "ERROR",  
                "error": "Database connection failed"  
            },  
            {  
                "service": "user-service",  
                "operation": "validate_user",   
                "duration_ms": 30000,  
                "status": "TIMEOUT",  
                "error": "Connection timeout"  
            },  
            {  
                "service": "payment-service",  
                "operation": "process_payment",  
                "duration_ms": 2000,  
                "status": "ERROR",   
                "error": "Gateway unavailable"  
            }  
        ]  
    },  
      
    "metrics": {  
        "cpu_usage": [85, 92, 88, 95, 89, 91, 87],  # 过去7分钟的CPU使用率  
        "memory_usage": [78, 82, 85, 88, 90, 92, 89],  # 内存使用率  
        "response_time_ms": [1200, 1800, 2500, 3200, 4100, 5000, 4800],  # 响应时间  
        "error_rate": [0.02, 0.05, 0.08, 0.12, 0.15, 0.18, 0.16],  # 错误率  
        "database_connections": [95, 98, 100, 100, 100, 100, 98],  # 数据库连接数  
        "queue_depth": [50, 120, 200, 350, 500, 480, 420]  # 消息队列深度  
    }  
}  

# 全局上下文用于维护分析过程  
analysis_context = {  
    "log_analysis": None,  
    "trace_analysis": None,   
    "metrics_analysis": None,  
    "review_results": {},  
    "final_report": None  
}  
  
  
# 工具函数实现  
def process_log_data(log_data: str) -> Dict[str, Any]:  
    """处理日志数据，提取错误模式"""  
    error_count = log_data.count("ERROR")  
    fatal_count = log_data.count("FATAL")   
    timeout_count = log_data.count("timeout")  
      
    return {  
        "total_errors": error_count,  
        "fatal_errors": fatal_count,  
        "timeout_errors": timeout_count,  
        "main_issues": ["Database connection timeout", "Payment gateway errors", "Connection pool exhaustion"],  
        "severity": "HIGH" if fatal_count > 0 else "MEDIUM"  
    }  
  
def process_trace_data(trace_data: Dict[str, Any]) -> Dict[str, Any]:  
    """处理调用链数据"""  
    total_duration = sum(span["duration_ms"] for span in trace_data["spans"])  
    error_services = [span["service"] for span in trace_data["spans"] if span["status"] == "ERROR"]  
      
    return {  
        "total_duration_ms": total_duration,  
        "failed_services": error_services,  
        "bottleneck_service": "user-service",  
        "critical_path_issues": ["Database connectivity", "Service timeouts"],  
        "performance_impact": "SEVERE"  
    }  
  
def process_metrics_data(metrics_data: Dict[str, Any]) -> Dict[str, Any]:  
    """处理指标数据"""  
    avg_cpu = sum(metrics_data["cpu_usage"]) / len(metrics_data["cpu_usage"])  
    max_response_time = max(metrics_data["response_time_ms"])  
    current_error_rate = metrics_data["error_rate"][-1]  
      
    return {  
        "avg_cpu_usage": avg_cpu,  
        "peak_response_time_ms": max_response_time,   
        "current_error_rate": current_error_rate,  
        "resource_exhaustion": "Database connection pool at 100%",  
        "performance_degradation": "Response time increased 4x",  
        "trend": "DETERIORATING"  
    }  

In [9]:
# 测试上面的函数
def test_analysis_functions():  
    log_analysis = process_log_data(SIMULATED_FAULT_DATA["logs"])  
    trace_analysis = process_trace_data(SIMULATED_FAULT_DATA["traces"])  
    metrics_analysis = process_metrics_data(SIMULATED_FAULT_DATA["metrics"])  
    
    print("Log Analysis:", json.dumps(log_analysis, indent=2))  
    print("Trace Analysis:", json.dumps(trace_analysis, indent=2))  
    print("Metrics Analysis:", json.dumps(metrics_analysis, indent=2))

test_analysis_functions()

Log Analysis: {
  "total_errors": 6,
  "fatal_errors": 1,
  "timeout_errors": 3,
  "main_issues": [
    "Database connection timeout",
    "Payment gateway errors",
    "Connection pool exhaustion"
  ],
  "severity": "HIGH"
}
Trace Analysis: {
  "total_duration_ms": 41800,
  "failed_services": [
    "api-gateway",
    "order-service",
    "payment-service"
  ],
  "bottleneck_service": "user-service",
  "critical_path_issues": [
    "Database connectivity",
    "Service timeouts"
  ],
  "performance_impact": "SEVERE"
}
Metrics Analysis: {
  "avg_cpu_usage": 89.57142857142857,
  "peak_response_time_ms": 5000,
  "current_error_rate": 0.16,
  "resource_exhaustion": "Database connection pool at 100%",
  "performance_degradation": "Response time increased 4x",
  "trend": "DETERIORATING"
}


In [10]:
# 创建智能体，将前面的工具函数集成到智能体中 
def create_agents():  
    """创建所有智能体"""  
    with llm_config:  
        # 日志分析智能体  
        log_agent = ConversableAgent(  
            name="log_analyzer",  
            system_message="""你是日志分析专家，负责分析系统日志数据，识别异常模式和错误信息。  
            请分析提供的日志数据，识别关键错误模式，评估严重程度，并提供分析结论。  
            完成分析后输出'LOG_ANALYSIS_COMPLETE'。""",  
            description="专门分析系统日志数据的专家",
            functions=[process_log_data] 
        )  
          
        # 调用链分析智能体  
        trace_agent = ConversableAgent(  
            name="trace_analyzer",   
            system_message="""你是分布式追踪专家，负责分析调用链数据，识别性能瓶颈和异常调用。  
            请分析提供的调用链数据，识别性能瓶颈，分析服务间依赖关系，并提供优化建议。  
            完成分析后输出'TRACE_ANALYSIS_COMPLETE'。""",  
            description="专门分析调用链数据的专家",
            functions=[process_trace_data]
        )  
          
        # 指标分析智能体  
        metrics_agent = ConversableAgent(  
            name="metrics_analyzer",  
            system_message="""你是系统指标专家，负责分析监控指标，识别异常趋势和阈值违规。  
            请分析提供的系统指标数据，识别异常趋势，评估系统健康状况，并提供告警建议。  
            完成分析后输出'METRICS_ANALYSIS_COMPLETE'。""",  
            description="专门分析系统指标的专家",
            functions=[process_metrics_data]
        )  
          
        # 进度管理智能体  
        progress_agent = ConversableAgent(  
            name="progress_manager",  
            system_message="""你负责协调整个故障检测分析流程，确保各个阶段按序进行，并组织评审投票。  
            你需要：1) 启动各个分析智能体的工作 2) 协调评审流程 3) 确保流程顺利推进""",  
            description="协调整个故障检测流程的管理者"  
        )  
          
        # 报告生成智能体  
        report_agent = ConversableAgent(  
            name="report_generator",  
            system_message="""你负责根据所有分析结果生成最终的故障分析报告。  
            请综合日志、调用链、指标的分析结果，生成结构化的故障分析报告，包括：  
            1) 故障概述 2) 根因分析 3) 影响评估 4) 解决方案建议""",  
            description="生成最终分析报告的专家"  
        )  
      
    return log_agent, trace_agent, metrics_agent, progress_agent, report_agent  
  
def create_review_groupchat(analyzer_agent: ConversableAgent, all_agents: List[ConversableAgent], analysis_result: Dict[str, Any]):  
    """从所有智能体中随机选择3个组成评审团"""  
      
    # 排除当前分析的智能体，从剩余智能体中随机选择3个  
    available_agents = [agent for agent in all_agents if agent != analyzer_agent]  
    selected_reviewers = random.sample(available_agents, min(3, len(available_agents)))  
      
    # 为评审创建临时智能体副本  
    reviewers = []  
    for agent in selected_reviewers:  
        reviewer = ConversableAgent(  
            name=f"{agent.name}_reviewer",  
            system_message=f"""你现在作为评审专家，需要审查{analyzer_agent.name}的分析结果。  
            请仔细评估分析过程和结论的正确性、完整性和合理性。  
              
            评审标准：  
            1) 分析逻辑是否清晰合理  
            2) 结论是否有充分依据  
            3) 是否遗漏重要信息  
            4) 建议是否可行  
              
            如果认为分析正确且完整，回复'APPROVE'；  
            如果认为需要重新分析或补充，回复'REJECT'并详细说明原因。  
            最后必须明确表态：APPROVE 或 REJECT""",  
            llm_config=llm_config  
        )  
        reviewers.append(reviewer)  
      
    return reviewers  
  
def conduct_review_voting(analyzer_agent: ConversableAgent, all_agents: List[ConversableAgent], analysis_result: Dict[str, Any]) -> bool:  
    """执行评审投票流程"""  
    print(f"\n=== 开始评审 {analyzer_agent.name} 的分析结果 ===")  
      
    # 创建评审团  
    reviewers = create_review_groupchat(analyzer_agent, all_agents, analysis_result)  
      
    # 创建评审群聊  
    review_groupchat = GroupChat(  
        agents=reviewers,  
        messages=[],  
        max_round=len(reviewers) + 2,  
        speaker_selection_method="round_robin"  
    )  
      
    review_manager = GroupChatManager(  
        name="review_manager",  
        groupchat=review_groupchat,   
        llm_config=llm_config  
    )  
      
    # 启动评审投票  
    review_prompt = f"""  
    请评审{analyzer_agent.name}的分析结果：  
      
    分析结果：  
    {json.dumps(analysis_result, indent=2, ensure_ascii=False)}  
      
    每位评审者请基于评审标准进行评估，并明确投票：APPROVE 或 REJECT  
    如果REJECT，请详细说明需要改进的地方。  
    """  
      
    # 开始评审群聊  
    if reviewers:  
        reviewers[0].initiate_chat(  
            recipient=review_manager,  
            message=review_prompt  
        )  
          
        # 统计投票结果  
        approve_count = 0  
        reject_count = 0  
          
        for message in review_groupchat.messages:  
            content = message.get("content", "").upper()  
            if "APPROVE" in content:  
                approve_count += 1  
            elif "REJECT" in content:  
                reject_count += 1  
          
        print(f"投票结果: APPROVE={approve_count}, REJECT={reject_count}")  
        return approve_count >= 2  # 2票及以上通过  
      
    return False

In [11]:
def run_fault_detection_system():  
    """运行完整的故障检测系统"""  
    print("=== 启动软件故障检测系统 ===")  
      
    # 创建所有智能体  
    log_agent, trace_agent, metrics_agent, progress_agent, report_agent = create_agents()  
    all_agents = [log_agent, trace_agent, metrics_agent, progress_agent, report_agent]  
    analysis_agents = [log_agent, trace_agent, metrics_agent]  

    # 数据映射
    data_mapping = {
        "log_analyzer": "logs",
        "trace_analyzer": "traces",
        "metrics_analyzer": "metrics"
    }

    print("\n=== 第一阶段：数据分析 ===")  
      
    # 处理每个分析智能体的分析任务  
    for analyzer in analysis_agents:  
        print(f"\n--- {analyzer.name} 开始分析 ---")  
          
        # 创建分析群聊  
        analysis_groupchat = GroupChat(  
            agents=[analyzer],  
            messages=[],  
            max_round=5,  
            speaker_selection_method="auto"  
        )  
          
        analysis_manager = GroupChatManager(  
            name=f"{analyzer.name}_manager",  
            groupchat=analysis_groupchat,  
            llm_config=llm_config,  
            is_termination_msg=lambda x: "ANALYSIS_COMPLETE" in (x.get("content", "") or "").upper()  
        )  
          
        # 启动分析智能体  
        analysis_prompt = f"""  
        请分析以下数据：  
        {json.dumps(SIMULATED_FAULT_DATA[data_mapping[analyzer.name]], indent=2, ensure_ascii=False)}  

        分析完成后请输出'ANALYSIS_COMPLETE'。  
        """  

        analyzer.initiate_chat(  
            recipient=analysis_manager,  
            message=analysis_prompt  
        )  
          
        print(f"=== {analyzer.name} 分析完成 ===")  
          
        # 获取分析结果  
        analysis_result = analysis_manager.groupchat.messages[-1].get("content", "")  
        analysis_context[f"{analyzer.name}_result"] = analysis_result  
          
        # 进行评审投票  
        max_retries = 2  
        retry_count = 0  
          
        while retry_count < max_retries:  
            is_approved = conduct_review_voting(analyzer, all_agents, analysis_result)  
              
            if is_approved:  
                print(f"✅ {analyzer.name} 分析通过评审")  
                analysis_context["review_results"][analyzer.name] = "approved"  
                break  
            else:  
                retry_count += 1  
                print(f"❌ {analyzer.name} 分析未通过评审 (第{retry_count}次)")  
                if retry_count < max_retries:  
                    print(f"要求 {analyzer.name} 重新分析...")  
                else:  
                    print(f"⚠️ {analyzer.name} 达到最大重试次数，跳过")  
                    analysis_context["review_results"][analyzer.name] = "failed"  
      
    print("\n=== 第二阶段：生成最终报告 ===")  
      
    # 检查是否所有分析都通过评审  
    approved_analyses = [name for name, status in analysis_context["review_results"].items()   
                        if status == "approved"]  
      
    if len(approved_analyses) >= 2:  # 至少2个分析通过  
        print("启动报告智能体生成最终分析报告...")  
          
        # 创建报告生成群聊  
        report_groupchat = GroupChat(  
            agents=[report_agent],  
            messages=[],  
            max_round=5,  
            speaker_selection_method="auto"  
        )  
          
        report_manager = GroupChatManager(  
            name="report_manager",  
            groupchat=report_groupchat,  
            llm_config=llm_config,  
            is_termination_msg=lambda x: "FINAL_REPORT_COMPLETE" in (x.get("content", "") or "").upper()  
        )  
          
        # 准备分析结果摘要给报告智能体  
        approved_results = {}  
        for key, value in analysis_context.items():  
            if key.endswith("_result") and value is not None:  
                analyzer_name = key.replace("_result", "")  
                if analysis_context["review_results"].get(analyzer_name) == "approved":  
                    approved_results[analyzer_name] = value  
          
        report_prompt = f"""  
        请根据以下已通过评审的分析结果生成最终的软件系统故障分析报告：  
          
        通过评审的分析结果：  
        {json.dumps(approved_results, indent=2, ensure_ascii=False)}  
          
        请生成结构化的故障分析报告，包括：  
        1) 故障概述  
        2) 根因分析    
        3) 影响评估  
        4) 解决方案建议  
          
        完成报告后请输出'FINAL_REPORT_COMPLETE'。  
        """  
          
        report_agent.initiate_chat(  
            recipient=report_manager,  
            message=report_prompt  
        )  
          
        print("=== 报告智能体已完成最终分析报告 ===")  
    else:  
        print("❌ 分析失败：通过评审的分析不足，无法生成可靠的故障报告") 

In [12]:
# 主程序入口  
if __name__ == "__main__":  
    try:  
        run_fault_detection_system()  
    except Exception as e:  
        print(f"系统运行出错: {e}")  
        import traceback  
        traceback.print_exc()

=== 启动软件故障检测系统 ===

=== 第一阶段：数据分析 ===

--- log_analyzer 开始分析 ---
[33mlog_analyzer[0m (to log_analyzer_manager):

  
        请分析以下数据：  
        "  \n2024-06-03 10:15:23 ERROR [UserService] Database connection timeout after 30s  \n2024-06-03 10:15:24 WARN [OrderService] Retry attempt 3/3 failed for order #12345  \n2024-06-03 10:15:25 ERROR [PaymentService] Payment gateway returned 500 Internal Server Error  \n2024-06-03 10:15:26 ERROR [UserService] Connection pool exhausted, max connections: 100  \n2024-06-03 10:15:27 FATAL [OrderService] Unable to process order queue, backing off for 60s  \n2024-06-03 10:15:28 ERROR [NotificationService] Failed to send email notification: SMTP timeout  \n2024-06-03 10:15:30 ERROR [UserService] Database connection timeout after 30s  \n2024-06-03 10:15:32 ERROR [PaymentService] Payment processing failed for transaction #67890  \n"  

        分析完成后请输出'ANALYSIS_COMPLETE'。  
        

------------------------------------------------------------------------