Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 82 additions & 24 deletions aql_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,9 @@




# Tools for each specialized agent
AQL_QUERY_TOOLS = [text_to_aql_to_text]


class AgentState(TypedDict):
messages: list
context: Dict
Expand All @@ -58,6 +56,72 @@ def create_llm():
)


def create_judge_llm():
    """Build the LLM used as an impartial judge of agent output.

    Returns:
        A ChatOpenAI instance configured with a low temperature so that
        repeated evaluations of the same response stay near-deterministic.
    """
    # Low temperature keeps scoring consistent between evaluation runs.
    judge = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)
    return judge


def evaluate_with_judge(response: str, expected_answer: str):
    """
    Evaluates the agent's response using a Judge LLM.

    Args:
        response: The output generated by the agent.
        expected_answer: The correct or expected answer for comparison.

    Returns:
        The judge model's textual evaluation (score plus explanation), or
        the string "No evaluation generated" if the model returned no content.
    """

    # Define the evaluation prompt for the Judge LLM
    evaluation_prompt = f"""
    You are an expert medical data evaluator specializing in ArangoDB graph queries and their medical interpretations. Your task is to evaluate the response generated by an AI agent that converts a medical question into an AQL query, executes it, and interprets the results.

    Please follow these guidelines:

    1. **AQL Query Validation**:
    - Check if the agent correctly translated the natural language question into an AQL query. Was the AQL query appropriate for the task?
    - Ensure the query is syntactically correct and uses the correct database schema and field names.
    - Was the query optimized (e.g., no unnecessary joins or filtering) to retrieve the correct data efficiently?

    2. **Medical Accuracy**:
    - Ensure the interpretation of the query result is accurate from a medical standpoint.
    - Did the agent provide a correct and medically meaningful interpretation of the query result (e.g., average age, medical conditions, etc.)?
    - Was the response medically relevant and easy to understand for healthcare professionals?

    3. **Clarity and Structure**:
    - Was the response clear and structured, following proper medical terminology?
    - Did the agent provide adequate context or explanations to ensure a healthcare professional would understand the results?
    - Did the agent avoid jargon that might confuse non-technical medical users?

    4. **Feedback**:
    - If the agent’s response is incorrect or incomplete, provide detailed feedback on what went wrong.
    - Suggest improvements in terms of both query formulation and interpretation.

    5. **Scoring**:
    - Provide a score between 1 and 10, where 10 means the response was perfect, and 1 means the response was completely incorrect or misleading.

    Please provide the score, and a detailed explanation for your evaluation.

    ---

    Agent's Response: {response}
    Expected Answer: {expected_answer}
    """

    judge_llm = create_judge_llm()

    # ChatOpenAI.invoke accepts a prompt string (or a list of messages) and
    # returns a single AIMessage. The previous {"messages": [...]} dict plus
    # evaluation_result["messages"] subscripting matched a LangGraph-style
    # API, not a chat model, and raised TypeError at runtime; this mirrors
    # the working pattern used by the judge in graph_agent.py.
    evaluation_result = judge_llm.invoke(evaluation_prompt)

    # Preserve the original fallback when the model yields no content.
    content = getattr(evaluation_result, "content", None)
    if content:
        return content
    return "No evaluation generated"


def create_agent(state, memory, llm: ChatOpenAI, tools: list, system_prompt: str, max_iterations: int = 2, max_execution_time: int = 120) -> AgentExecutor:
"""
Expand Down Expand Up @@ -91,6 +155,7 @@ def _modify_state_messages(state, memory):

return agent


system_prompt = f"""
You are an expert medical data analyst specializing in ArangoDB graph queries.

Expand All @@ -117,8 +182,6 @@ def aql_query_node(state):
Handles fundamental analysis and financial metrics using tools from tools.py
"""



try:
aql_agent = create_agent(
llm,
Expand Down Expand Up @@ -170,10 +233,11 @@ def aql_query_node(state):

def run_aql_agent(state, memory, question: str, current_date: str = None):
"""
Runs the financial agent with the given question.
Runs the financial agent with the given question and evaluates the result.

Args:
question: The user's question
expected_answer: The correct or expected answer for comparison
current_date: Optional date context
"""
# Initialize state with datetime object
Expand All @@ -189,7 +253,7 @@ def run_aql_agent(state, memory, question: str, current_date: str = None):
"aql_query_agent_internal_state": {}
}

# Basic usage
# Run the agent
try:
# Create agent directly like in the other methods
aql_agent = create_agent(
Expand All @@ -202,18 +266,22 @@ def run_aql_agent(state, memory, question: str, current_date: str = None):
max_execution_time=120
)

# Use invoke instead of aql_query_node
result = aql_agent.invoke(
{"messages": initial_state["messages"]},
{"callbacks": [initial_state["callback"]]}
{"callbacks": [initial_state["callback"]]},
)
memory.add(f"User: {state["messages"][-1]}\nAssistant: {result}", user_id=state["mem0_user_id"], agent_id=state["agent_id"])
print("memory -:", memory.get_all(user_id=state["mem0_user_id"], agent_id=state["agent_id"]))
# Get the last message content
if result["messages"] and len(result["messages"]) > 0:
return result["messages"][-1].content
agent_response = result["messages"][-1].content
else:
return "No response generated"
agent_response = "No response generated"

# Evaluate the response using Judge LLM
evaluation = evaluate_with_judge(agent_response, expected_answer)

return agent_response, evaluation

except Exception as e:
return f"Error running agent: {str(e)}"
Expand Down Expand Up @@ -316,20 +384,10 @@ async def run_aql_agent_async(question: str, current_date: str = None, timeout:

# Example usage:
if __name__ == "__main__":
# Basic usage
question = "What is the average age of patients in the database?"
result = run_aql_agent(question)
print(f"Basic result: {result}")

#Streaming usage
print("\nStreaming results:")
for chunk in run_aql_agent_with_stream(question):
print(f"Chunk: {chunk}")
expected_answer = "The average age of patients in the database is 45." # Replace with the correct expected answer

# Async usage with timeout
print("\nAsync result:")
async def main():
result = await run_aql_agent_async(question, timeout=300)
print(f"Async result: {result}")
agent_response, evaluation = run_aql_agent(question, expected_answer)

asyncio.run(main())
print(f"Agent's Response: {agent_response}")
print(f"Evaluation: {evaluation}")
94 changes: 53 additions & 41 deletions graph_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,52 +156,27 @@ def graph_analysis_node(state):
return state


def run_graph_analysis_agent(question: str, current_date: str = None):
def run_graph_analysis_agent(question: str):
"""
Runs the financial agent with the given question.

Args:
question: The user's question
current_date: Optional date context
Runs the graph analysis agent with the given question.
"""
# Initialize state with datetime object
if current_date and isinstance(current_date, str):
current_date = datetime.strptime(current_date, "%Y-%m-%d")
else:
current_date = datetime.now()

initial_state = {
"messages": [("human", question)],
"user_input": question,
"callback": CustomConsoleCallbackHandler(),
"graph_analysis_agent_internal_state": {}
}

# Basic usage
try:
# Create agent directly like in the other methods
graph_analysis_agent = create_agent(
llm,
GRAPH_ANALYSIS_TOOLS,
system_prompt,
max_iterations=2,
max_execution_time=120
)

# Use invoke instead of aql_query_node
result = graph_analysis_agent.invoke(
{"messages": initial_state["messages"]},
{"callbacks": [initial_state["callback"]]}
)

# Get the last message content
if result["messages"] and len(result["messages"]) > 0:
return result["messages"][-1].content
else:
return "No response generated"

except Exception as e:
return f"Error running agent: {str(e)}"
graph_analysis_agent = create_agent(llm, GRAPH_ANALYSIS_TOOLS, system_prompt)
result = graph_analysis_agent.invoke(
{"messages": initial_state["messages"]},
{"callbacks": [initial_state["callback"]]}
)
response_text = result["messages"][-1].content if result["messages"] else "No response generated"

evaluation = evaluate_response(question, response_text)

return response_text, evaluation

def run_graph_analysis_agent_with_stream(question: str, current_date: str = None):
"""
Expand Down Expand Up @@ -295,13 +270,50 @@ async def run_graph_analysis_agent_async(question: str, current_date: str = None
except Exception as e:
return f"Error running agent: {str(e)}"

def create_judge_llm():
    """Instantiate the ChatOpenAI model that scores agent responses.

    Uses a low temperature (0.1) so repeated evaluations of the same
    response produce consistent scores.
    """
    return ChatOpenAI(
        temperature=0.1,
        model="gpt-4o-mini",
    )

def evaluate_response(question: str, agent_response: str):
    """Score a graph-analysis answer with a separate Judge LLM.

    Args:
        question: The original user question.
        agent_response: The answer produced by the graph analysis agent.

    Returns:
        The judge model's raw text — per the prompt, a JSON assessment with
        correctness/relevance/clarity scores and brief feedback.
    """
    judge = create_judge_llm()
    prompt = f"""
    You are an expert evaluator assessing the quality of responses given by a Graph Analysis AI Agent.
    Your task is to evaluate the response based on:
    1. **Correctness**: Is the response factually accurate based on graph theory?
    2. **Relevance**: Does it directly answer the given question?
    3. **Clarity**: Is the response well-structured and easy to understand?

    Here is the question:
    "{question}"

    Here is the agent's response:
    "{agent_response}"

    Provide a structured assessment in JSON format with the following keys:
    - correctness (score out of 10)
    - relevance (score out of 10)
    - clarity (score out of 10)
    - feedback (brief textual feedback on improvement)
    """
    verdict = judge.invoke(prompt)
    return verdict.content


# Example usage:
if __name__ == "__main__":
# Basic usage
question = "find node with highest betweenness centrality in the graph"
result = run_graph_analysis_agent(question)
print(f"Basic result: {result}")
response, evaluation = run_graph_analysis_agent(question)

print("Agent Response:")
print(response)

print("\nEvaluation:")
print(evaluation)


#Streaming usage
print("\nStreaming results:")
Expand Down