diff --git a/aql_agent.py b/aql_agent.py index 90598f3..3e29804 100644 --- a/aql_agent.py +++ b/aql_agent.py @@ -38,11 +38,9 @@ - # Tools for each specialized agent AQL_QUERY_TOOLS = [text_to_aql_to_text] - class AgentState(TypedDict): messages: list context: Dict @@ -58,6 +56,72 @@ def create_llm(): ) +def create_judge_llm(): + """Create a Judge LLM for evaluating the agent responses""" + return ChatOpenAI( + model="gpt-4o-mini", + temperature=0.1 + ) + + +def evaluate_with_judge(response: str, expected_answer: str): + """ + Evaluates the agent's response using Judge LLM. + + Args: + response: The output generated by the agent. + expected_answer: The correct or expected answer for comparison. + + Returns: + Evaluation result (e.g., score, feedback). + """ + + # Define the evaluation prompt for the Judge LLM + evaluation_prompt = f""" + You are an expert medical data evaluator specializing in ArangoDB graph queries and their medical interpretations. Your task is to evaluate the response generated by an AI agent that converts a medical question into an AQL query, executes it, and interprets the results. + + Please follow these guidelines: + + 1. **AQL Query Validation**: + - Check if the agent correctly translated the natural language question into an AQL query. Was the AQL query appropriate for the task? + - Ensure the query is syntactically correct and uses the correct database schema and field names. + - Was the query optimized (e.g., no unnecessary joins or filtering) to retrieve the correct data efficiently? + + 2. **Medical Accuracy**: + - Ensure the interpretation of the query result is accurate from a medical standpoint. + - Did the agent provide a correct and medically meaningful interpretation of the query result (e.g., average age, medical conditions, etc.)? + - Was the response medically relevant and easy to understand for healthcare professionals? + + 3. 
**Clarity and Structure**:
+    - Was the response clear and structured, following proper medical terminology?
+    - Did the agent provide adequate context or explanations to ensure a healthcare professional would understand the results?
+    - Did the agent avoid jargon that might confuse non-technical medical users?
+
+    4. **Feedback**:
+    - If the agent’s response is incorrect or incomplete, provide detailed feedback on what went wrong.
+    - Suggest improvements in terms of both query formulation and interpretation.
+
+    5. **Scoring**:
+    - Provide a score between 1 and 10, where 10 means the response was perfect, and 1 means the response was completely incorrect or misleading.
+
+    Please provide the score, and a detailed explanation for your evaluation.
+
+    ---
+
+    Agent's Response: {response}
+    Expected Answer: {expected_answer}
+    """
+
+    judge_llm = create_judge_llm()
+
+    # Get the evaluation result from the Judge LLM.
+    # ChatOpenAI.invoke() takes the prompt (string or message list) directly and
+    # returns an AIMessage — it neither accepts nor returns a {"messages": [...]}
+    # dict. This matches evaluate_response() in graph_agent.py.
+    evaluation_result = judge_llm.invoke(evaluation_prompt)
+
+    if evaluation_result.content:
+        return evaluation_result.content
+    else:
+        return "No evaluation generated"
+

 def create_agent(state, memory, llm: ChatOpenAI, tools: list, system_prompt: str,
                  max_iterations: int = 2, max_execution_time: int = 120) -> AgentExecutor:
     """
@@ -91,6 +155,7 @@ def _modify_state_messages(state, memory):
     return agent
 
+
 system_prompt = f"""
 You are an expert medical data analyst specializing in ArangoDB graph queries.
@@ -117,8 +182,6 @@ def aql_query_node(state):
     Handles fundamental analysis and financial metrics using tools from tools.py
     """
-
-
     try:
         aql_agent = create_agent(
             llm,
@@ -170,10 +233,11 @@ def aql_query_node(state):
-def run_aql_agent(state, memory, question: str, current_date: str = None):
+def run_aql_agent(state, memory, question: str, current_date: str = None, expected_answer: str = None):
     """
-    Runs the financial agent with the given question.
+    Runs the financial agent with the given question and evaluates the result.
 
     Args:
         question: The user's question
+        expected_answer: The correct or expected answer for comparison
         current_date: Optional date context
     """
     # Initialize state with datetime object
@@ -189,7 +253,7 @@ def run_aql_agent(state, memory, question: str, current_date: str = None):
         "aql_query_agent_internal_state": {}
     }
 
-    # Basic usage
+    # Run the agent
     try:
         # Create agent directly like in the other methods
         aql_agent = create_agent(
@@ -202,18 +266,22 @@ def run_aql_agent(state, memory, question: str, current_date: str = None):
             max_execution_time=120
         )
 
-        # Use invoke instead of aql_query_node
         result = aql_agent.invoke(
             {"messages": initial_state["messages"]},
-            {"callbacks": [initial_state["callback"]]}
+            {"callbacks": [initial_state["callback"]]},
         )
-        memory.add(f"User: {state["messages"][-1]}\nAssistant: {result}", user_id=state["mem0_user_id"], agent_id=state["agent_id"])
+        # Use single-quoted keys inside the f-string: {state["messages"]} with
+        # nested double quotes is a SyntaxError on Python < 3.12.
+        memory.add(f"User: {state['messages'][-1]}\nAssistant: {result}", user_id=state["mem0_user_id"], agent_id=state["agent_id"])
         print("memory -:", memory.get_all(user_id=state["mem0_user_id"], agent_id=state["agent_id"]))
 
         # Get the last message content
         if result["messages"] and len(result["messages"]) > 0:
-            return result["messages"][-1].content
+            agent_response = result["messages"][-1].content
         else:
-            return "No response generated"
+            agent_response = "No response generated"
+
+        # Evaluate the response using Judge LLM
+        evaluation = evaluate_with_judge(agent_response, expected_answer)
+
+        return agent_response, evaluation
 
     except Exception as e:
         return f"Error running agent: {str(e)}"
@@ -316,20 +384,10 @@ async def run_aql_agent_async(question: str, current_date: str = None, timeout:
 # Example usage:
 if __name__ == "__main__":
-    # Basic usage
     question = "What is the average age of patients in the database?"
-    result = run_aql_agent(question)
-    print(f"Basic result: {result}")
-
-    #Streaming usage
-    print("\nStreaming results:")
-    for chunk in run_aql_agent_with_stream(question):
-        print(f"Chunk: {chunk}")
+    expected_answer = "The average age of patients in the database is 45."
# Replace with the correct expected answer
 
-    # Async usage with timeout
-    print("\nAsync result:")
-    async def main():
-        result = await run_aql_agent_async(question, timeout=300)
-        print(f"Async result: {result}")
+    # NOTE(review): run_aql_agent also requires `state` and `memory` as its
+    # first two arguments — this call will fail as written; confirm how the
+    # example is meant to construct them, and that run_aql_agent accepts
+    # expected_answer.
+    agent_response, evaluation = run_aql_agent(question, expected_answer)
 
-    asyncio.run(main())
\ No newline at end of file
+    print(f"Agent's Response: {agent_response}")
+    print(f"Evaluation: {evaluation}")
diff --git a/graph_agent.py b/graph_agent.py
index 5c99b72..d1ac697 100644
--- a/graph_agent.py
+++ b/graph_agent.py
@@ -156,20 +156,10 @@ def graph_analysis_node(state):
     return state
 
-def run_graph_analysis_agent(question: str, current_date: str = None):
+def run_graph_analysis_agent(question: str, current_date: str = None):
     """
-    Runs the financial agent with the given question.
-
-    Args:
-        question: The user's question
-        current_date: Optional date context
+    Runs the graph analysis agent with the given question.
+
+    Args:
+        question: The user's question
+        current_date: Unused; retained for backward compatibility with
+            existing callers that pass it positionally.
     """
-    # Initialize state with datetime object
-    if current_date and isinstance(current_date, str):
-        current_date = datetime.strptime(current_date, "%Y-%m-%d")
-    else:
-        current_date = datetime.now()
-
     initial_state = {
         "messages": [("human", question)],
         "user_input": question,
@@ -177,31 +167,16 @@ def run_graph_analysis_agent(question: str, current_date: str = None):
         "graph_analysis_agent_internal_state": {}
     }
 
-    # Basic usage
-    try:
-        # Create agent directly like in the other methods
-        graph_analysis_agent = create_agent(
-            llm,
-            GRAPH_ANALYSIS_TOOLS,
-            system_prompt,
-            max_iterations=2,
-            max_execution_time=120
-        )
-
-        # Use invoke instead of aql_query_node
-        result = graph_analysis_agent.invoke(
-            {"messages": initial_state["messages"]},
-            {"callbacks": [initial_state["callback"]]}
-        )
-
-        # Get the last message content
-        if result["messages"] and len(result["messages"]) > 0:
-            return result["messages"][-1].content
-        else:
-            return "No response generated"
-
-    except Exception as e:
-        return f"Error running agent: {str(e)}"
+    # Keep the try/except guard: the sibling run_aql_agent returns an error
+    # string on failure, and dropping it here would let exceptions escape to
+    # callers that previously never saw them.
+    try:
+        graph_analysis_agent = create_agent(llm, GRAPH_ANALYSIS_TOOLS, system_prompt)
+        # NOTE(review): confirm initial_state actually defines a "callback"
+        # entry — it is not visible among the keys shown in this hunk.
+        result = graph_analysis_agent.invoke(
+            {"messages": initial_state["messages"]},
+            {"callbacks": [initial_state["callback"]]}
+        )
+        response_text = result["messages"][-1].content if result["messages"] else "No response generated"
+
+        evaluation = evaluate_response(question, response_text)
+
+        return response_text, evaluation
+    except Exception as e:
+        return f"Error running agent: {str(e)}"
 
 def run_graph_analysis_agent_with_stream(question: str, current_date: str = None):
     """
@@ -295,13 +270,50 @@ async def run_graph_analysis_agent_async(question: str, current_date: str = None
     except Exception as e:
         return f"Error running agent: {str(e)}"
 
+def create_judge_llm():
+    """Create the judge LLM for evaluation"""
+    return ChatOpenAI(
+        model="gpt-4o-mini",
+        temperature=0.1
+    )
+
+def evaluate_response(question: str, agent_response: str):
+    """Evaluates the agent's response using a Judge LLM."""
+    judge_prompt = f"""
+    You are an expert evaluator assessing the quality of responses given by a Graph Analysis AI Agent.
+    Your task is to evaluate the response based on:
+    1. **Correctness**: Is the response factually accurate based on graph theory?
+    2. **Relevance**: Does it directly answer the given question?
+    3. **Clarity**: Is the response well-structured and easy to understand?
+ + Here is the question: + "{question}" + + Here is the agent's response: + "{agent_response}" + + Provide a structured assessment in JSON format with the following keys: + - correctness (score out of 10) + - relevance (score out of 10) + - clarity (score out of 10) + - feedback (brief textual feedback on improvement) + """ + + judge_llm = create_judge_llm() + evaluation = judge_llm.invoke(judge_prompt) + return evaluation.content + -# Example usage: if __name__ == "__main__": - # Basic usage question = "find node with highest betweenness centrality in the graph" - result = run_graph_analysis_agent(question) - print(f"Basic result: {result}") + response, evaluation = run_graph_analysis_agent(question) + + print("Agent Response:") + print(response) + + print("\nEvaluation:") + print(evaluation) + #Streaming usage print("\nStreaming results:")