Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 82 additions & 24 deletions aql_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,9 @@




# Tools for each specialized agent
AQL_QUERY_TOOLS = [text_to_aql_to_text]


class AgentState(TypedDict):
messages: list
context: Dict
Expand All @@ -58,6 +56,72 @@ def create_llm():
)


def create_judge_llm():
    """Build the LLM used as an impartial judge of agent output.

    Returns:
        A ChatOpenAI instance configured with a low temperature so that
        repeated evaluations of the same response stay near-deterministic.
    """
    # Low temperature keeps scoring consistent between evaluation runs.
    judge = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)
    return judge


def evaluate_with_judge(response: str, expected_answer: str):
    """
    Evaluates the agent's response using a Judge LLM.

    Args:
        response: The output generated by the agent.
        expected_answer: The correct or expected answer for comparison.

    Returns:
        The judge model's textual evaluation (score plus explanation), or
        the string "No evaluation generated" if the model returned no content.
    """

    # Define the evaluation prompt for the Judge LLM
    evaluation_prompt = f"""
    You are an expert medical data evaluator specializing in ArangoDB graph queries and their medical interpretations. Your task is to evaluate the response generated by an AI agent that converts a medical question into an AQL query, executes it, and interprets the results.

    Please follow these guidelines:

    1. **AQL Query Validation**:
    - Check if the agent correctly translated the natural language question into an AQL query. Was the AQL query appropriate for the task?
    - Ensure the query is syntactically correct and uses the correct database schema and field names.
    - Was the query optimized (e.g., no unnecessary joins or filtering) to retrieve the correct data efficiently?

    2. **Medical Accuracy**:
    - Ensure the interpretation of the query result is accurate from a medical standpoint.
    - Did the agent provide a correct and medically meaningful interpretation of the query result (e.g., average age, medical conditions, etc.)?
    - Was the response medically relevant and easy to understand for healthcare professionals?

    3. **Clarity and Structure**:
    - Was the response clear and structured, following proper medical terminology?
    - Did the agent provide adequate context or explanations to ensure a healthcare professional would understand the results?
    - Did the agent avoid jargon that might confuse non-technical medical users?

    4. **Feedback**:
    - If the agent’s response is incorrect or incomplete, provide detailed feedback on what went wrong.
    - Suggest improvements in terms of both query formulation and interpretation.

    5. **Scoring**:
    - Provide a score between 1 and 10, where 10 means the response was perfect, and 1 means the response was completely incorrect or misleading.

    Please provide the score, and a detailed explanation for your evaluation.

    ---

    Agent's Response: {response}
    Expected Answer: {expected_answer}
    """

    judge_llm = create_judge_llm()

    # ChatOpenAI.invoke accepts a prompt string (or a list of messages) and
    # returns a single AIMessage. The previous {"messages": [...]} dict plus
    # evaluation_result["messages"] subscripting matched a LangGraph-style
    # API, not a chat model, and raised TypeError at runtime; this mirrors
    # the working pattern used by the judge in graph_agent.py.
    evaluation_result = judge_llm.invoke(evaluation_prompt)

    # Preserve the original fallback when the model yields no content.
    content = getattr(evaluation_result, "content", None)
    if content:
        return content
    return "No evaluation generated"


def create_agent(state, memory, llm: ChatOpenAI, tools: list, system_prompt: str, max_iterations: int = 2, max_execution_time: int = 120) -> AgentExecutor:
"""
Expand Down Expand Up @@ -91,6 +155,7 @@ def _modify_state_messages(state, memory):

return agent


system_prompt = f"""
You are an expert medical data analyst specializing in ArangoDB graph queries.

Expand All @@ -117,8 +182,6 @@ def aql_query_node(state):
Handles fundamental analysis and financial metrics using tools from tools.py
"""



try:
aql_agent = create_agent(
llm,
Expand Down Expand Up @@ -170,10 +233,11 @@ def aql_query_node(state):

def run_aql_agent(state, memory, question: str, current_date: str = None):
"""
Runs the financial agent with the given question.
Runs the financial agent with the given question and evaluates the result.

Args:
question: The user's question
expected_answer: The correct or expected answer for comparison
current_date: Optional date context
"""
# Initialize state with datetime object
Expand All @@ -189,7 +253,7 @@ def run_aql_agent(state, memory, question: str, current_date: str = None):
"aql_query_agent_internal_state": {}
}

# Basic usage
# Run the agent
try:
# Create agent directly like in the other methods
aql_agent = create_agent(
Expand All @@ -202,18 +266,22 @@ def run_aql_agent(state, memory, question: str, current_date: str = None):
max_execution_time=120
)

# Use invoke instead of aql_query_node
result = aql_agent.invoke(
{"messages": initial_state["messages"]},
{"callbacks": [initial_state["callback"]]}
{"callbacks": [initial_state["callback"]]},
)
memory.add(f"User: {state["messages"][-1]}\nAssistant: {result}", user_id=state["mem0_user_id"], agent_id=state["agent_id"])
print("memory -:", memory.get_all(user_id=state["mem0_user_id"], agent_id=state["agent_id"]))
# Get the last message content
if result["messages"] and len(result["messages"]) > 0:
return result["messages"][-1].content
agent_response = result["messages"][-1].content
else:
return "No response generated"
agent_response = "No response generated"

# Evaluate the response using Judge LLM
evaluation = evaluate_with_judge(agent_response, expected_answer)

return agent_response, evaluation

except Exception as e:
return f"Error running agent: {str(e)}"
Expand Down Expand Up @@ -316,20 +384,10 @@ async def run_aql_agent_async(question: str, current_date: str = None, timeout:

# Example usage:
if __name__ == "__main__":
# Basic usage
question = "What is the average age of patients in the database?"
result = run_aql_agent(question)
print(f"Basic result: {result}")

#Streaming usage
print("\nStreaming results:")
for chunk in run_aql_agent_with_stream(question):
print(f"Chunk: {chunk}")
expected_answer = "The average age of patients in the database is 45." # Replace with the correct expected answer

# Async usage with timeout
print("\nAsync result:")
async def main():
result = await run_aql_agent_async(question, timeout=300)
print(f"Async result: {result}")
agent_response, evaluation = run_aql_agent(question, expected_answer)

asyncio.run(main())
print(f"Agent's Response: {agent_response}")
print(f"Evaluation: {evaluation}")
94 changes: 53 additions & 41 deletions graph_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,52 +156,27 @@ def graph_analysis_node(state):
return state


def run_graph_analysis_agent(question: str, current_date: str = None):
def run_graph_analysis_agent(question: str):
"""
Runs the financial agent with the given question.

Args:
question: The user's question
current_date: Optional date context
Runs the graph analysis agent with the given question.
"""
# Initialize state with datetime object
if current_date and isinstance(current_date, str):
current_date = datetime.strptime(current_date, "%Y-%m-%d")
else:
current_date = datetime.now()

initial_state = {
"messages": [("human", question)],
"user_input": question,
"callback": CustomConsoleCallbackHandler(),
"graph_analysis_agent_internal_state": {}
}

# Basic usage
try:
# Create agent directly like in the other methods
graph_analysis_agent = create_agent(
llm,
GRAPH_ANALYSIS_TOOLS,
system_prompt,
max_iterations=2,
max_execution_time=120
)

# Use invoke instead of aql_query_node
result = graph_analysis_agent.invoke(
{"messages": initial_state["messages"]},
{"callbacks": [initial_state["callback"]]}
)

# Get the last message content
if result["messages"] and len(result["messages"]) > 0:
return result["messages"][-1].content
else:
return "No response generated"

except Exception as e:
return f"Error running agent: {str(e)}"
graph_analysis_agent = create_agent(llm, GRAPH_ANALYSIS_TOOLS, system_prompt)
result = graph_analysis_agent.invoke(
{"messages": initial_state["messages"]},
{"callbacks": [initial_state["callback"]]}
)
response_text = result["messages"][-1].content if result["messages"] else "No response generated"

evaluation = evaluate_response(question, response_text)

return response_text, evaluation

def run_graph_analysis_agent_with_stream(question: str, current_date: str = None):
"""
Expand Down Expand Up @@ -295,13 +270,50 @@ async def run_graph_analysis_agent_async(question: str, current_date: str = None
except Exception as e:
return f"Error running agent: {str(e)}"

def create_judge_llm():
    """Instantiate the ChatOpenAI model that scores agent responses.

    Uses a low temperature (0.1) so repeated evaluations of the same
    response produce consistent scores.
    """
    return ChatOpenAI(
        temperature=0.1,
        model="gpt-4o-mini",
    )

def evaluate_response(question: str, agent_response: str):
    """Score a graph-analysis answer with a separate Judge LLM.

    Args:
        question: The original user question.
        agent_response: The answer produced by the graph analysis agent.

    Returns:
        The judge model's raw text — per the prompt, a JSON assessment with
        correctness/relevance/clarity scores and brief feedback.
    """
    judge = create_judge_llm()
    prompt = f"""
    You are an expert evaluator assessing the quality of responses given by a Graph Analysis AI Agent.
    Your task is to evaluate the response based on:
    1. **Correctness**: Is the response factually accurate based on graph theory?
    2. **Relevance**: Does it directly answer the given question?
    3. **Clarity**: Is the response well-structured and easy to understand?

    Here is the question:
    "{question}"

    Here is the agent's response:
    "{agent_response}"

    Provide a structured assessment in JSON format with the following keys:
    - correctness (score out of 10)
    - relevance (score out of 10)
    - clarity (score out of 10)
    - feedback (brief textual feedback on improvement)
    """
    verdict = judge.invoke(prompt)
    return verdict.content


# Example usage:
if __name__ == "__main__":
# Basic usage
question = "find node with highest betweenness centrality in the graph"
result = run_graph_analysis_agent(question)
print(f"Basic result: {result}")
response, evaluation = run_graph_analysis_agent(question)

print("Agent Response:")
print(response)

print("\nEvaluation:")
print(evaluation)


#Streaming usage
print("\nStreaming results:")
Expand Down