### Explore Arango knowledge graph

In [None]:
from arango import ArangoClient
from arango_datasets import Datasets
import asyncio
from dotenv import load_dotenv
import os

# Read connection settings from a local .env file into the environment.
load_dotenv()

# Create the client first, then open a database handle with the credentials
# taken from ARANGO_HOST / ARANGO_USERNAME / ARANGO_PASSWORD.
arango_client = ArangoClient(hosts=os.getenv("ARANGO_HOST"))
db = arango_client.db(
    username=os.getenv("ARANGO_USERNAME"),
    password=os.getenv("ARANGO_PASSWORD"),
    verify=True,
)

# Dataset helper bound to that database handle.
datasets = Datasets(db)

# Print whatever the helper reports as available datasets.
print(datasets.list_datasets())

async def load_dataset(dataset_name):
    """Load one named dataset via the shared `datasets` helper, printing a line before and after.

    NOTE(review): `datasets.load` is called directly and nothing in this
    coroutine is awaited, so it does not yield to the event loop while the
    load is running.
    """
    start_msg = f"Loading dataset: {dataset_name}"
    print(start_msg)
    datasets.load(dataset_name)
    done_msg = f"Finished loading: {dataset_name}"
    print(done_msg)

async def main():
    # Load 3 datasets simultaneously
    dataset_names = ["CVE", "OPEN_INTELLIGENCE"]  # FLIGHTS, SYNTHEA_P100
    tasks = [load_dataset(name) for name in dataset_names]
    await asyncio.gather(*tasks)

await main()

In [5]:
# Print the id of every graph the database reports.
# The loop variable is not called `graph` on purpose: loop variables stay in the
# notebook's global namespace, and `graph` is the name used for the ArangoGraph
# wrapper in a later cell.
for graph_info in db.graphs():
    print(graph_info['id'])

_graphs/Christmas_Carol
_graphs/SYNTHEA_P100
_graphs/CVE
_graphs/OPEN_INTELLIGENCE
_graphs/FLIGHTS


In [37]:
from langchain_community.graphs import ArangoGraph
from langchain.chains import ArangoGraphQAChain
from langchain_openai import ChatOpenAI
import json


# LangChain wrapper around the database handle, plus the chat model that the
# later cells share.
graph = ArangoGraph(db)
model = ChatOpenAI(temperature=0, model="gpt-4o")

# Replace the wrapper's schema with the one stored in graph_schema.json.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying on
# the platform's default encoding.
with open('graph_schema.json', 'r', encoding='utf-8') as f:
    graph_schema = json.load(f)
graph.set_schema(graph_schema)

# allow_dangerous_requests=True is the explicit opt-in for letting the chain run
# model-generated AQL against the database.
chain = ArangoGraphQAChain.from_llm(
    model, graph=graph,
    verbose=False, allow_dangerous_requests=True
)



In [7]:
# Chain.run is deprecated in LangChain and emits a deprecation warning; invoke()
# is the supported entry point. ArangoGraphQAChain takes the question under the
# "query" key and returns the answer text under "result".
response = chain.invoke(
    {
        "query": "Which patients have had encounters at Fitchburg Outpatient Clinic and were prescribed medications during those visits? write down the medications prescribed to them."
    }
)["result"]
print(response)

  response = chain.run(




[1m> Entering new ArangoGraphQAChain chain...[0m
AQL Query (1):[32;1m[1;3m
WITH patients, encounters, organizations, medications, organizations_to_encounters, patients_to_encounters, encounters_to_medications
FOR org IN organizations
  FILTER org.NAME == "Fitchburg Outpatient Clinic"
  FOR org_enc IN organizations_to_encounters
    FILTER org_enc._from == org._id
    FOR enc IN encounters
      FILTER enc._id == org_enc._to
      FOR pat_enc IN patients_to_encounters
        FILTER pat_enc._to == enc._id
        FOR pat IN patients
          FILTER pat._id == pat_enc._from
          FOR enc_med IN encounters_to_medications
            FILTER enc_med._from == enc._id
            FOR med IN medications
              FILTER med._id == enc_med._to
              RETURN {
                patient_id: pat._id,
                patient_name: CONCAT(pat.FIRST, " ", pat.LAST),
                medication_description: med.DESCRIPTION
              }
[0m
AQL Result:
[32;1m[1;3m[{'patient_id'

In [8]:
# Chain.run is deprecated in LangChain; invoke() is the supported entry point.
# ArangoGraphQAChain takes the question under the "query" key and returns the
# answer text under "result".
response = chain.invoke(
    {
        "query": "Find all providers who treated patients with a condition diagnosed during a routine pediatric checkup."
    }
)["result"]
print(response)



[1m> Entering new ArangoGraphQAChain chain...[0m
AQL Query (1):[32;1m[1;3m
WITH providers, encounters, conditions, encounters_to_conditions, providers_to_encounters
FOR provider IN providers
    FOR encounter IN encounters
        FILTER encounter.DESCRIPTION == "Routine pediatric checkup"
        FOR condition IN conditions
            FILTER condition.ENCOUNTER == encounter._id
            FOR edge IN encounters_to_conditions
                FILTER edge._from == encounter._id AND edge._to == condition._id
                FOR providerEdge IN providers_to_encounters
                    FILTER providerEdge._from == provider._id AND providerEdge._to == encounter._id
                    RETURN DISTINCT provider
[0m
AQL Result:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m
Summary:

There are no providers in the database who have treated patients with a condition diagnosed during a routine pediatric checkup.


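The user means the same thing as "well child visit", but that wording is not what is recorded in the description field, so the exact-match filter on `DESCRIPTION` returns nothing. Running a semantic search over the descriptions first would make retrieval robust to this kind of wording mismatch.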

### Explore FAISS vector store

In [10]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Embed a throwaway string once to learn the vector length the flat L2 index needs.
probe_vector = embeddings.embed_query("hello world")
index = faiss.IndexFlatL2(len(probe_vector))

# Start from an empty store: no documents and no position -> id mapping yet.
vector_store = FAISS(
    index=index,
    embedding_function=embeddings,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [11]:
from uuid import uuid4
from langchain_core.documents import Document

# (text, source) pairs for the ten demo documents, in insertion order.
sample_posts = [
    ("I had chocalate chip pancakes and scrambled eggs for breakfast this morning.", "tweet"),
    ("The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.", "news"),
    ("Building an exciting new project with LangChain - come check it out!", "tweet"),
    ("Robbers broke into the city bank and stole $1 million in cash.", "news"),
    ("Wow! That was an amazing movie. I can't wait to see it again.", "tweet"),
    ("Is the new iPhone worth the price? Read this review to find out.", "website"),
    ("The top 10 soccer players in the world right now.", "website"),
    ("LangGraph is the best framework for building stateful, agentic applications!", "tweet"),
    ("The stock market is down 500 points today due to fears of a recession.", "news"),
    ("I have a bad feeling I am going to get deleted :(", "tweet"),
]

# Turn each pair into a Document whose only metadata key is "source".
documents = [
    Document(page_content=text, metadata={"source": source})
    for text, source in sample_posts
]

# One random id per document, in the same order.
uuids = [str(uuid4()) for _ in documents]

# Last expression of the cell: displays the ids returned by the store.
vector_store.add_documents(documents=documents, ids=uuids)

['1a96e418-019f-4072-b7e0-ea3b51f2670f',
 '5c3ec557-954a-411a-9b9f-fc87f6ea89f7',
 'fd35ad32-d5c2-4a44-b905-8e7738bf160f',
 '06b11605-7170-4774-8d29-0a25372cee1a',
 '2530583b-f7ed-4a86-8bee-eef407dfb9ac',
 '0d05d42b-f098-4354-a46c-3bce85203f57',
 '964040ec-714e-4b12-ad18-6f70bd7349d3',
 '68646204-b14f-412e-9bf3-82eb273fa342',
 'c0c49ea1-cccb-4298-8bee-7cb91016abb4',
 '6845e50a-7c72-4b8b-abc4-f4dd5608b87c']

In [12]:
# Persist the demo store under the local path "vector_store".
vector_store.save_local("vector_store")

In [13]:
# Reload the store that was just saved, using the same embeddings object.
# NOTE(review): allow_dangerous_deserialization=True opts in to pickle-based
# loading — only point this at files this notebook produced itself.
new_vector_store = FAISS.load_local(
    "vector_store", embeddings, allow_dangerous_deserialization=True
)

In [14]:
# Round-trip check: ask the reloaded store for the two nearest documents.
new_vector_store.similarity_search('how is the stock market doing?', k=2)

[Document(id='c0c49ea1-cccb-4298-8bee-7cb91016abb4', metadata={'source': 'news'}, page_content='The stock market is down 500 points today due to fears of a recession.'),
 Document(id='5c3ec557-954a-411a-9b9f-fc87f6ea89f7', metadata={'source': 'news'}, page_content='The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.')]

### Ingest node descriptions

- embed all description fields
- store metadata of node key and collection name
- create LangChain tools for FAISS and Arango
- execute simple agent executor and assess solution

In [16]:
from tqdm import tqdm

# Node collections whose DESCRIPTION field is collected for embedding.
collections = [
    "allergies",
    "careplans",
    "conditions",
    "devices",
    "encounters",
    "immunizations",
    "medications",
    "observations",
    "procedures",
    "supplies",
]

texts = []
metadatas = []

# The collection is passed as a bind parameter (@@col in the query, "@col" in
# bind_vars) instead of being formatted into the query text. Documents without
# a DESCRIPTION are skipped because there is nothing to embed for them.
DESCRIPTION_QUERY = """
FOR doc IN @@col
  FILTER doc.DESCRIPTION != null
  RETURN {
    "_key": doc._key,
    "description": doc.DESCRIPTION
  }
"""

for col in tqdm(collections, desc="Processing collections"):
    cursor = db.aql.execute(DESCRIPTION_QUERY, bind_vars={"@col": col})
    for doc in cursor:
        texts.append(doc["description"])
        # Keep the node key and its collection so a vector hit can be traced
        # back to the graph node it came from.
        metadatas.append({"_key": doc["_key"], "collection": col})

Processing collections:   0%|          | 0/10 [00:00<?, ?it/s]

Processing collections: 100%|██████████| 10/10 [07:21<00:00, 44.18s/it]


In [17]:
# Pair every description with its metadata record; zip keeps the two lists
# aligned by position.
documents = [
    Document(page_content=description, metadata=node_meta)
    for description, node_meta in zip(texts, metadatas)
]

# One random id per document.
uuids = [str(uuid4()) for _ in documents]

In [18]:
# Build the node store on a brand-new FAISS index.
# The module-level `index` already contains the vectors of the demo documents
# added in an earlier cell. Pairing it with an empty docstore and an empty
# position -> id mapping would shift every mapping by the number of vectors
# already in the index, so searches would return the wrong nodes.
# `index.d` is the dimensionality of the existing index, so no extra embedding
# call is needed to size the new one.
node_index = faiss.IndexFlatL2(index.d)

vector_store = FAISS(
    embedding_function=embeddings,
    index=node_index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)
vector_store.add_documents(documents=documents, ids=uuids)

# Persist the result. A "synthea_p100_vector_store" directory written before
# this fix must be rebuilt, because the saved mapping has the same offset.
vector_store.save_local("synthea_p100_vector_store")

In [None]:
# Reload the persisted node-description store instead of re-embedding.
# NOTE(review): allow_dangerous_deserialization=True opts in to pickle-based
# loading — only point this at files this notebook produced itself.
vector_store = FAISS.load_local(
    "synthea_p100_vector_store", embeddings, allow_dangerous_deserialization=True
)

In [25]:
# Sanity check: nearest stored descriptions to the user's wording, restricted to
# documents whose metadata has collection == "encounters".
results = vector_store.similarity_search(
    "routine pediatric checkup", k=4, filter={"collection": "encounters"}
)
# Bare last expression so the notebook displays the hits.
results

[Document(id='869495ca-d91e-47f0-961a-4cef20a730fe', metadata={'_key': '430b6eab-89af-0188-780c-feeb6c1fa985', 'collection': 'encounters'}, page_content='Well child visit (procedure)'),
 Document(id='b5e37e27-210a-4b11-ae2b-368e7e44823d', metadata={'_key': '6bf2d2e6-33b3-6916-ca6c-737d32c005bf', 'collection': 'encounters'}, page_content='General examination of patient (procedure)'),
 Document(id='f0364bd9-e447-45ad-a430-a85c0742d40f', metadata={'_key': 'bd440abe-0a58-fede-e26a-314a29407b90', 'collection': 'encounters'}, page_content='Encounter for problem'),
 Document(id='b5e61b37-e5cc-40be-9871-8b3379a3eb9f', metadata={'_key': '89880de3-a6f9-c14f-a3b8-5f691aa9f21d', 'collection': 'encounters'}, page_content='General examination of patient (procedure)')]

### Build agent executor

- the agent has two tools with no fixed call order; it is free to call either one at any point
- vector search tool can update graph state by writing metadata for graph traversal

TOOLKIT:  
Open question: should we also have a query decomposition tool?
- vector tool: takes a collection name and a description to match
- graph tool: takes an AQL query rather than a natural-language question, so the agent can see a query error and call the tool again with a corrected query

In [61]:
from langchain_core.tools import BaseTool
from typing import Type
from pydantic import BaseModel, Field
import json

# Argument schema for the vector search tool: a free-text query plus the name
# of the collection to search in.
class VectorSearchSchema(BaseModel):
    query: str = Field(..., description="The search query to find semantically similar information")
    collection: str = Field(..., description="The collection to search in")

# LangChain tool that runs a metadata-filtered similarity search against the
# module-level FAISS `vector_store` and returns the hits as a JSON string.
class VectorSearchTool(BaseTool):
    name: str = "vector_search"
    description: str = "Search for semantically similar information in a vector database"
    args_schema: Type[BaseModel] = VectorSearchSchema
    # Number of hits handed back to the agent.
    top_k: int = 3
    # Number of nearest candidates fetched BEFORE the metadata filter is applied.
    # The store mixes several collections, so the library default of 20 can
    # leave fewer than `top_k` hits (or none) once other collections are
    # filtered out.
    fetch_k: int = 100

    def _run(self, query: str, collection: str) -> str:
        """Search one collection's descriptions for text similar to `query`.

        Args:
            query: Free-text description to match.
            collection: Value the hit's ``collection`` metadata must equal.

        Returns:
            A JSON array of ``{"content", "metadata"}`` objects, or a
            ``"Vector search failed: ..."`` message if anything raises, so the
            agent sees the error text rather than an exception.
        """
        try:
            hits = vector_store.similarity_search(
                query,
                k=self.top_k,
                filter={"collection": collection},
                fetch_k=self.fetch_k,
            )
            formatted_results = [
                {"content": hit.page_content, "metadata": hit.metadata}
                for hit in hits
            ]
            return json.dumps(formatted_results, indent=2)
        except Exception as e:
            return f"Vector search failed: {str(e)}"


# Argument schema for the graph traversal tool: a single AQL query string.
class GraphTraversalSchema(BaseModel):
    aql_query: str = Field(..., description="The AQL query to execute on the graph database")

# LangChain tool that passes an AQL string to the module-level ArangoGraph
# wrapper (`graph`) and returns the rows as JSON.
# NOTE(review): the query text is executed as given; nothing in this class
# limits it to read-only statements — confirm the database user's permissions.
class GraphTraversalTool(BaseTool):
    name: str = "graph_traversal"
    description: str = "Execute AQL queries to traverse the graph and extract structured relationships"
    args_schema: Type[BaseModel] = GraphTraversalSchema

    def _run(self, aql_query: str) -> str:
        """Run `aql_query` through `graph.query` and serialise the outcome.

        Returns:
            A JSON object ``{"results": ...}`` on success, or a
            ``"Graph traversal failed: ..."`` message if the query or the
            serialisation raises, so the agent can read the error and retry.
        """
        try:
            rows = graph.query(aql_query)
            payload = {"results": rows}
            return json.dumps(payload, indent=2)
        except Exception as exc:
            return f"Graph traversal failed: {str(exc)}"

# The two tools made available to the agent: vector search and AQL execution.
TOOLKIT = [VectorSearchTool(), GraphTraversalTool()]

In [84]:
from langchain_core.prompts import (
    HumanMessagePromptTemplate,
    MessagesPlaceholder,
    PromptTemplate,
    ChatPromptTemplate,
)
from langchain_core.messages import SystemMessage

# Forward slashes are accepted on every platform, including Windows; the
# backslash form only resolved on Windows. The encoding is given explicitly so
# the prompt text does not depend on the platform's default code page.
with open('prompts/vector_system_prompt.txt', 'r', encoding='utf-8') as f:
    SYSTEM_PROMPT = f.read()

# Message layout, in order: fixed system prompt, optional prior chat history,
# the human turn carrying schema + question, then the agent's scratchpad.
PROMPTS = [
    SystemMessage(content=SYSTEM_PROMPT),
    MessagesPlaceholder(variable_name="chat_history", optional=True),
    HumanMessagePromptTemplate(
        prompt=PromptTemplate(
            input_variables=["query", "graph_schema"],
            template="KNOWLEDGE GRAPH SCHEMA: {graph_schema} \n USER QUERY: {query} ",
        )
    ),
    MessagesPlaceholder(variable_name="agent_scratchpad"),
]

CHAT_PROMPT = ChatPromptTemplate.from_messages(PROMPTS)

In [85]:
from langchain.agents import create_tool_calling_agent
from langchain.agents import AgentExecutor


# Combine the chat model, the tools and the prompt into a tool-calling agent,
# then wrap it in an executor; verbose=True prints each step as it runs.
agent = create_tool_calling_agent(model, TOOLKIT, CHAT_PROMPT)
agent_executor = AgentExecutor(agent=agent, tools=TOOLKIT, verbose=True)

In [86]:
# Ask the agent the question that returned no rows with the plain QA chain.
# The two keys fill the {graph_schema} and {query} slots of the human prompt.
response = agent_executor.invoke(
    {
        "graph_schema": graph.schema,
        "query": "Find all providers who treated patients with a condition diagnosed during a routine pediatric checkup.",
    }
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `vector_search` with `{'query': 'routine pediatric checkup', 'collection': 'encounters'}`


[0m[36;1m[1;3m[
  {
    "content": "Well child visit (procedure)",
    "metadata": {
      "_key": "430b6eab-89af-0188-780c-feeb6c1fa985",
      "collection": "encounters"
    }
  },
  {
    "content": "General examination of patient (procedure)",
    "metadata": {
      "_key": "6bf2d2e6-33b3-6916-ca6c-737d32c005bf",
      "collection": "encounters"
    }
  },
  {
    "content": "Encounter for problem",
    "metadata": {
      "_key": "bd440abe-0a58-fede-e26a-314a29407b90",
      "collection": "encounters"
    }
  }
][0m[32;1m[1;3m
Invoking: `graph_traversal` with `{'aql_query': 'FOR enc IN encounters\n  FILTER enc._key IN ["430b6eab-89af-0188-780c-feeb6c1fa985", "6bf2d2e6-33b3-6916-ca6c-737d32c005bf", "bd440abe-0a58-fede-e26a-314a29407b90"]\n  FOR cond IN 1..1 OUTBOUND enc encounters_to_conditions\n    FOR prov IN 1.

In [91]:
# Show only the final answer text from the executor's response.
print(response['output'])

The provider who treated patients with a condition diagnosed during a routine pediatric checkup is:

- **Provider Name:** Danelle47 Ondricka197
- **Condition Diagnosed:** Full-time employment (finding)

This information was retrieved by first identifying encounters related to routine pediatric checkups and then traversing the graph to find the associated conditions and providers.
