In [16]:
import neo4j
from neo4j import GraphDatabase
import json
from dotenv import load_dotenv
import os

load_dotenv()

URI = os.getenv('NEO4J_URI')
USERNAME = os.getenv('NEO4J_USERNAME')
PASSWORD = os.getenv('NEO4J_PASSWORD')

QUESTION = "How many times did Epstein meet with Bill Clinton?"

driver = GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD))
try:
    driver.verify_connectivity()
    print("✅ Successfully connected to Neo4j!")
except Exception as e:
    print(f"❌ Connection failed: {e}")

✅ Successfully connected to Neo4j!


In [3]:
from openai import OpenAI
from typing import List, Dict, Any

client = OpenAI()

def get_embedding(text, model="text-embedding-3-small"):
    text = text.replace("\n", " ")
    return client.embeddings.create(input = [text], model=model).data[0].embedding

In [4]:
def vector_search(query: str, TOP_K = 5) -> List[Dict]:

    query_vector = get_embedding(query)

    cypher = """
    CALL db.index.vector.queryNodes(
        'event_embedding',
        $topK,
        $embedding
    )
    YIELD node, score

    MATCH (actor:Entity)-[:PARTICIPATED_IN {role:'actor'}]->(node)
    MATCH (target:Entity)-[:PARTICIPATED_IN {role:'target'}]->(node)
    OPTIONAL MATCH (node)-[:LOCATED_IN]->(l:Location)
    OPTIONAL MATCH (node)-[:HAS_TAG]->(t:Tag)

    RETURN
        node.id AS event_id,
        actor.name AS actor,
        node.action AS action,
        target.name AS target,
        l.name AS location,
        node.explicit_topic AS explicit_topic,
        collect(DISTINCT t.name) AS tags,
        score
    ORDER BY score DESC
    """

    with driver.session() as session:
        results = session.run(
            cypher,
            embedding=query_vector,
            topK=TOP_K
        )

        return [dict(record) for record in results]

In [5]:
vector_search(QUESTION)

[{'event_id': 9825,
  'actor': 'Jeffrey Epstein',
  'action': 'maintained relationship with',
  'target': 'multiple billionaires and world leaders',
  'location': 'Manhattan',
  'explicit_topic': 'continued cultivation of high-level relationships',
  'tags': ['Travel Logistics', 'Victim Advocacy', 'Media Strategy'],
  'score': 0.8232795596122742},
 {'event_id': 15868,
  'actor': 'Jeffrey Epstein',
  'action': 'identified targets in to',
  'target': 'unknown person A (HOUSE_OVERSIGHT_027281) (HOUSE_OVERSIGHT_027281)',
  'location': None,
  'explicit_topic': 'noting Landowski, Cohen, and Weisselberg are mentioned in complaint',
  'tags': ['Media Coverage', 'Legal Representation'],
  'score': 0.8230377435684204},
 {'event_id': 16039,
  'actor': 'Jeffrey Epstein',
  'action': 'clarified political target to',
  'target': 'unknown person A (HOUSE_OVERSIGHT_027346) (HOUSE_OVERSIGHT_027346)',
  'location': None,
  'explicit_topic': 'focus on Charles Koch as primary target rather than David Koc

In [6]:
def hybrid_search(query: str, TOP_K = 5) -> List[Dict]:

    query_vector = get_embedding(query)

    HYBRID_QUERY = """
    CALL {
        // Vector index
        CALL db.index.vector.queryNodes('event_embedding', $k, $embedding)
        YIELD node, score
        WITH collect({node: node, score: score}) AS nodes, max(score) AS max
        UNWIND nodes AS n
        RETURN n.node AS node, (n.score / max) AS score

        UNION

        // Fulltext index
        CALL db.index.fulltext.queryNodes('event_text', $q, {limit: $k})
        YIELD node, score
        WITH collect({node: node, score: score}) AS nodes, max(score) AS max
        UNWIND nodes AS n
        RETURN n.node AS node, (n.score / max) AS score
    }

    // Deduplicate + rerank
    WITH node, max(score) AS score
    ORDER BY score DESC
    LIMIT $k

    RETURN
        node.id AS id,
        node.text AS text,
        score
    """
    
    with driver.session() as session:
        results = session.run(
            HYBRID_QUERY,
            q=query,
            embedding=query_vector,
            k=TOP_K,
        )

        return [dict(r) for r in results]

In [7]:
hybrid_search(QUESTION)

[{'id': 9825,
  'text': 'Jeffrey Epstein maintained relationship with multiple billionaires and world leaders. Location: Manhattan. Topic: continued cultivation of high-level relationships. Context: persistence_of_influence_despite_criminal_conviction.',
  'score': 1.0},
 {'id': 48834,
  'text': 'Don Marquis said A hypocrite is a person who— but who isn’t?. Topic: philosophical quotation on hypocrisy. Context: providing reflective commentary.',
  'score': 1.0},
 {'id': 15868,
  'text': 'Jeffrey Epstein identified targets in to unknown person A (HOUSE_OVERSIGHT_027281) (HOUSE_OVERSIGHT_027281). Topic: noting Landowski, Cohen, and Weisselberg are mentioned in complaint. Context: tracking who faces legal exposure from Trump Foundation litigation.',
  'score': 0.9997062771193206},
 {'id': 16039,
  'text': 'Jeffrey Epstein clarified political target to unknown person A (HOUSE_OVERSIGHT_027346) (HOUSE_OVERSIGHT_027346). Topic: focus on Charles Koch as primary target rather than David Koch. C

In [8]:
stepback_system_message = f""" 
You are an expert at world knowledge. Your task is to step back
and paraphrase a question to a more generic step-back question, which
is easier to answer. Here are a few examples
 
"input": "Could the members of The Police perform lawful arrests?"
"output": "what can the members of The Police do?"

"input": "Jefferey Epstein’s was born in what country?"
"output": "what is Jefferey Epstein’s personal history?"
"""

In [9]:
def generate_stepback(question: str):
    user_message = f"""{question}"""
    step_back_question = client.chat.completions.create(
        model="gpt-5-mini",
        input=[
            {"role": "system", "content": stepback_system_message},
            {"role": "user", "content": user_message},
        ]
    ).output_text
    return step_back_question

In [10]:
def get_structured_schema(driver: neo4j.Driver) -> Dict[str, Any]:

    NODE_PROPERTIES_QUERY = """
    CALL apoc.meta.data()
    YIELD label, other, elementType, type, property
    WHERE NOT type = "RELATIONSHIP" AND elementType = "node"
    WITH label AS nodeLabels, collect({property:property, type:type}) AS properties
    RETURN {labels: nodeLabels, properties: properties} AS output
    """
    REL_PROPERTIES_QUERY = """
    CALL apoc.meta.data()
    YIELD label, other, elementType, type, property
    WHERE NOT type = "RELATIONSHIP" AND elementType = "relationship"
    WITH label AS relType, collect({property:property, type:type}) AS properties
    RETURN {type: relType, properties: properties} AS output
    """
    REL_QUERY = """
    CALL apoc.meta.data()
    YIELD label, other, elementType, type, property
    WHERE type = "RELATIONSHIP" AND elementType = "node"
    UNWIND other AS other_node
    RETURN {start: label, type: property, end: toString(other_node)} AS output
    """

    def _run_query(session, q):
        return [rec.get("output") for rec in session.run(q)]

    with driver.session() as session:
        node_properties = _run_query(session, NODE_PROPERTIES_QUERY)
        rel_properties = _run_query(session, REL_PROPERTIES_QUERY)
        relationships = _run_query(session, REL_QUERY)

    node_props = {el["labels"]: el["properties"] for el in node_properties if el}
    rel_props = {el["type"]: el["properties"] for el in rel_properties if el}

    return {
        "node_props": node_props,
        "rel_props": rel_props,
        "relationships": relationships,
    }

In [11]:
def get_schema(structured_schema: Dict[str, Any]) -> str:
    def _format_props(props: List[Dict[str, Any]]) -> str:
        return ", ".join(f"{prop['property']}: {prop['type']}" for prop in props)

    formatted_node_props = [
        f"{label} {{{_format_props(props)}}}"
        for label, props in structured_schema.get("node_props", {}).items()
    ]

    formatted_rel_props = [
        f"{rel_type} {{{_format_props(props)}}}"
        for rel_type, props in structured_schema.get("rel_props", {}).items()
    ]

    formatted_rels = [
        f"(:{element['start']})-[:{element['type']}]->(:{element['end']})"
        for element in structured_schema.get("relationships", [])
    ]

    return "\n".join(
        [
            "Node labels and properties:",
            "\n".join(formatted_node_props) if formatted_node_props else "  (none)",
            "Relationship types and properties:",
            "\n".join(formatted_rel_props) if formatted_rel_props else "  (none)",
            "The relationships:",
            "\n".join(formatted_rels) if formatted_rels else "  (none)",
        ]
    )

In [12]:
get_schema(get_structured_schema(driver))

'Node labels and properties:\nEntity {name: STRING}\nEvent {timestamp: STRING, embedding: LIST, text: STRING, created_at: DATE_TIME, id: INTEGER, implicit_topic: STRING, explicit_topic: STRING, action: STRING, sequence_order: INTEGER}\nDocument {doc_id: STRING}\nTag {name: STRING}\nLocation {name: STRING}\nRelationship types and properties:\nPARTICIPATED_IN {role: STRING}\nThe relationships:\n(:Entity)-[:PARTICIPATED_IN]->(:Event)\n(:Event)-[:HAS_TAG]->(:Tag)\n(:Event)-[:FROM_DOCUMENT]->(:Document)\n(:Event)-[:LOCATED_IN]->(:Location)'

In [13]:
prompt_template = """
Instructions:
Generate Cypher statement to query a graph database to get the data to answer 
the following user question.
Graph database schema:
Use only the provided relationship types and properties in the schema.
Do not use any other relationship types or properties that are not provided in 
the schema.
{schema}
Terminology mapping:
This section is helpful to map terminology between the user question and the 
graph database schema.
{terminology}
Examples:
The following examples provide useful patterns for querying the graph database.
{examples}
Format instructions:
Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than for you to
construct a Cypher statement.
Do not include any text except the generated Cypher statement.
ONLY RESPOND WITH CYPHER—NO CODE BLOCKS.
User question: {question}
"""

In [14]:
structured_schema = get_structured_schema(driver)
schema_string = get_schema(structured_schema)

terminology_string = """
Entities:  When a user asks about a person by trade like politician, celebrity, millionaire, or criminal, they are referring to a node with the label 'Entity'.
Location: When a user asks about places.
"""

examples = [
    (
        "Who are the two people interated in most event together?",
        "MATCH (p1:Entity)-[:PARTICIPATED_IN]->(e:Event)<-[:PARTICIPATED_IN]-(p2:Entity) "
        "WHERE p1 <> p2 RETURN p1.name, p2.name, COUNT(e) AS EVENT_COUNT "
        "ORDER BY EVENT_COUNT DESC LIMIT 1"
    )
]

examples_text = "\n".join(f"Question: {q}\nCypher: {c}" for q, c in examples)

full_prompt = prompt_template.format(
    question=QUESTION,
    schema=schema_string,
    terminology=terminology_string,
    examples=examples_text,
)

print(full_prompt)


Instructions:
Generate Cypher statement to query a graph database to get the data to answer 
the following user question.
Graph database schema:
Use only the provided relationship types and properties in the schema.
Do not use any other relationship types or properties that are not provided in 
the schema.
Node labels and properties:
Entity {name: STRING}
Event {timestamp: STRING, id: INTEGER, embedding: LIST, text: STRING, implicit_topic: STRING, explicit_topic: STRING, sequence_order: INTEGER, created_at: DATE_TIME, action: STRING}
Document {doc_id: STRING}
Tag {name: STRING}
Location {name: STRING}
Relationship types and properties:
PARTICIPATED_IN {role: STRING}
The relationships:
(:Entity)-[:PARTICIPATED_IN]->(:Event)
(:Event)-[:HAS_TAG]->(:Tag)
(:Event)-[:FROM_DOCUMENT]->(:Document)
(:Event)-[:LOCATED_IN]->(:Location)
Terminology mapping:
This section is helpful to map terminology between the user question and the 
graph database schema.

Entities:  When a user asks about a pers

In [15]:
def generate_cypher(prompt: str):
    cypher = client.chat.completions.create(
        model="gpt-5-mini",
        messages=[
            {"role": "system", "content": terminology_string},
            {"role": "user", "content": prompt},
        ]
    ).choices[0].message.content
    return cypher

generate_cypher(full_prompt)

"MATCH (ep:Entity {name: 'Epstein'})-[:PARTICIPATED_IN]->(e:Event)<-[r:PARTICIPATED_IN]-(t:Entity)\nWHERE r.role IN ['target','victim'] AND t <> ep\nRETURN t.name AS name, COUNT(e) AS event_count\nORDER BY event_count DESC\nLIMIT 1"

##  Retriever agents

In [34]:
text2cypher_description = {
    "type": "function",
    "function": {
        "name": "text2cypher",
        "description": (
            "Generate a Cypher statement from a user question, execute it "
            "against the Neo4j database, and return the query results. "
            "Use this as a fallback when other tools don't fit."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                "question": {
                    "type": "string",
                    "description": "The user question to find the answer for",
                }
            },
            "required": ["question"],
        },
    },
}

def text2cypher(question: str):
    cypher = generate_cypher(question)
    records, _, _ = driver.execute_query(cypher)
    return [record.data() for record in records]

event_by_person_description = {
    "type": "function",
    "function": {
        "name": "event_by_person",
        "description": "Get events in which a person participated by providing (part of) the person's name",
        "parameters": {
            "type": "object",
            "properties": {
                "person": {
                    "type": "string",
                    "description": "The person name (partial match, case insensitive)",
                }
            },
            "required": ["person"],
        },
    },
}

def event_by_person(person: str):
    """Return events for a person (partial, case-insensitive match)."""
    query = """
    MATCH (p:Entity)-[:PARTICIPATED_IN]->(e:Event)
    WHERE toLower(p.name) CONTAINS $person
    OPTIONAL MATCH (e)-[:LOCATED_IN]->(l:Location)
    OPTIONAL MATCH (e)-[:HAS_TAG]->(t:Tag)
    OPTIONAL MATCH (other:Entity)-[:PARTICIPATED_IN]->(e)
    RETURN e AS event,
           p.name AS matched_person,
           collect(DISTINCT other.name) AS participants,
           l.name AS location,
           collect(DISTINCT t.name) AS tags,
           e.timestamp AS timestamp
    ORDER BY e.timestamp DESC
    LIMIT 50
    """
    with driver.session() as session:
        results = session.run(query, person=person.lower())
        return [record.data() for record in results]


person_by_event_description = {
    "type": "function",
    "function": {
        "name": "person_by_event",
        "description": "Get people involved in an event by providing part of the event text or topic",
        "parameters": {
            "type": "object",
            "properties": {
                "event": {
                    "type": "string",
                    "description": "A substring of the event text or topic to match (case insensitive)",
                }
            },
            "required": ["event"],
        },
    },
}

def person_by_event(event: str):
    """Return people participating in events that match the given event text (partial, case-insensitive)."""
    query = """
    MATCH (e:Event)
    WHERE toLower(e.text) CONTAINS $event OR toLower(e.explicit_topic) CONTAINS $event OR toLower(e.implicit_topic) CONTAINS $event
    OPTIONAL MATCH (p:Entity)-[:PARTICIPATED_IN]->(e)
    RETURN e AS event,
           collect(DISTINCT p.name) AS participants,
           e.timestamp AS timestamp,
           e.explicit_topic AS explicit_topic,
           e.implicit_topic AS implicit_topic
    ORDER BY e.timestamp DESC
    LIMIT 50
    """
    with driver.session() as session:
        results = session.run(query, event=event.lower())
        return [record.data() for record in results]
    
answer_given_description = {
    "type": "function",
    "function": {
        "name": "answer_given",
        "description": "If a complete answer to the question is already provided in the conversation, use this tool to extract it.",
        "parameters": {
            "type": "object",
            "properties": {
                "answer": {
                    "type": "string",
                    "description": "The answer to the question",
                }
            },
            "required": ["answer"],
        },
    },
}

def answer_given(answer: str):
    """Return the provided answer text as-is."""
    return answer

In [35]:
def handle_tool_calls(tools: dict[str, any], llm_tool_calls: list[dict[str, any]]):
    output = []
    if llm_tool_calls:
        for tool_call in llm_tool_calls:
            function_to_call = tools[tool_call.function.name]["function"]
            function_args = json.loads(tool_call.function.arguments)
            res = function_to_call(**function_args)
            output.append(res)
    return output

In [36]:
query_update_prompt = """
 You are an expert at updating questions to make them more atomic, 
specific, and easier to find the answer to.
 You do this by filling in missing information in the question, with the 
extra information provided to you in previous answers.
 You respond with the updated question that has all information in it.
 Only edit the question if needed. If the original question already is 
atomic, specific, and easy to answer, you keep the original.
 Do not ask for more information than the original question. Only rephrase 
the question to make it more complete.
 JSON template to use:
 {
 "question": "question1"
 }
"""

In [37]:
def query_update(input: str, answers: List[Any]) -> str:
    messages = [{"role": "system", "content": query_update_prompt}]

    for ans in answers:
        if isinstance(ans, dict) and "role" in ans and "content" in ans:
            messages.append(ans)
        else:
            messages.append({"role": "assistant", "content": str(ans)})

    messages.append({"role": "user", "content": f"The user question to rewrite: '{input}'"})

    try:
        resp = client.chat.completions.create(model="gpt-5-mini", messages=messages)
        output_text = resp.choices[0].message.content
        parsed = json.loads(output_text)
        return parsed.get("question", input)
    except json.JSONDecodeError:
        print("Error decoding JSON from model response")
        return input
    except Exception as e:
        print(f"Error calling model: {e}")
        return input

In [38]:
tools = {
    "event_by_person": {
        "description": event_by_person_description,
        "function": event_by_person,
    },
    "person_by_event": {
        "description": person_by_event_description,
        "function": person_by_event,
    },
    "text2cypher": {
        "description": text2cypher_description,
        "function": text2cypher,
    },
    "answer_given": {
        "description": answer_given_description,
        "function": answer_given,
    },
}

In [39]:
tool_picker_prompt = """
 Your job is to choose the right tool needed to respond to the user 
question.
 The available tools are provided to you in the request.
 Make sure to pass the right and complete arguments to the chosen tool.
"""

In [40]:
def tool_choice(messages: List[Dict[str, Any]], temperature: float = 0.0, model: str = "gpt-4o", tools: Dict[str, Any] = None, config={}) -> list[dict[str, any]]:
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        tools=tools or None,
        **config    
    )
    return response.choices[0].message.tool_calls

In [41]:
def route_question(question: str, tools: dict[str, any], answers: list[dict[str, str]]):
    llm_tool_calls = tool_choice(
                    [
                        {
                            "role": "system",
                            "content": tool_picker_prompt,
                        },
                        *answers,
                        {
                            "role": "user",
                            "content": f"The user question to find a tool to answer: '{question}'",
                        },
                    ],
                    model = "gpt-4o",
                    tools=[tool["description"] for tool in tools.values()],
                    )
    return handle_tool_calls(tools, llm_tool_calls)

In [42]:
route_question(QUESTION, tools, [])

CypherSyntaxError: {neo4j_code: Neo.ClientError.Statement.SyntaxError} {message: Invalid input 'There': expected
  "ALTER"
  "CALL"
  "CREATE"
  "DEALLOCATE"
  "DELETE"
  "DENY"
  "DETACH"
  "DROP"
  "DRYRUN"
  "ENABLE"
  "FOREACH"
  "GRANT"
  "LOAD"
  "MATCH"
  "MERGE"
  "NODETACH"
  "OPTIONAL"
  "REALLOCATE"
  "REMOVE"
  "RENAME"
  "RETURN"
  "REVOKE"
  "SET"
  "SHOW"
  "START"
  "STOP"
  "TERMINATE"
  "UNWIND"
  "USE"
  "USING"
  "WITH" (line 1, column 1 (offset: 0))
"There’s no single, publicly verified count of how many times Jeffrey Epstein and Bill Clinton met in person."
 ^} {gql_status: 50N42} {gql_status_description: error: general processing exception - unexpected error. Invalid input 'There': expected
  "ALTER"
  "CALL"
  "CREATE"
  "DEALLOCATE"
  "DELETE"
  "DENY"
  "DETACH"
  "DROP"
  "DRYRUN"
  "ENABLE"
  "FOREACH"
  "GRANT"
  "LOAD"
  "MATCH"
  "MERGE"
  "NODETACH"
  "OPTIONAL"
  "REALLOCATE"
  "REMOVE"
  "RENAME"
  "RETURN"
  "REVOKE"
  "SET"
  "SHOW"
  "START"
  "STOP"
  "TERMINATE"
  "UNWIND"
  "USE"
  "USING"
  "WITH" (line 1, column 1 (offset: 0))
"There’s no single, publicly verified count of how many times Jeffrey Epstein and Bill Clinton met in person."
 ^}

In [70]:
def handle_user_input(user_input: str, answers: List[Dict[str, str]] = None) -> List[Dict[str, str]]:
    if answers is None:
        answers = []
    print(f"Original question: {user_input}")
    updated_question = query_update(user_input, answers)
    print(f"Updated question: {updated_question}")
    response = route_question(updated_question, tools, answers)
    print(f"Tool response: {response}")
    answers.append({
        "role": "assistant",
        "content": json.dumps({
            "question": updated_question,
            "response": response
        }, ensure_ascii=False)
    })

    return answers

In [56]:
answer_critique_prompt = """
 You are an expert at identifying if questions have been fully answered or 
if there is an opportunity to enrich the answer.
 The user will provide a question, and you will scan through the provided 
information to see if the question is answered.
 If anything is missing from the answer, you will provide a set of new 
questions that can be asked to gather the missing information.
 All new questions must be complete, atomic, and specific.
 However, if the provided information is enough to answer the original 
question, you will respond with an empty list.
 JSON template to use for finding missing information:
 {
 "questions": ["question1", "question2"]
 }
"""

In [57]:
def critique_answers(question: str, answers: List[Dict[str, str]]) -> List[str]:
    messages = [
        {"role": "system", "content": answer_critique_prompt},
        *answers,
        {"role": "user", "content": f"The original user question to answer: '{question}'"},
    ]

    try:
        resp = client.chat.completions.create(model="gpt-5-mini", messages=messages)
        output_text = resp.choices[0].message.content
        parsed = json.loads(output_text)
        return parsed.get("questions", [])
    except json.JSONDecodeError:
        print("Error decoding JSON from model response")
        return []
    except Exception as e:
        print(f"Error calling model: {e}")
        return []


In [58]:
main_prompt = """
 Your job is to help the user with their questions.
 You will receive user questions and information needed to answer the 
questions
 If the information is missing to answer part of or the whole question, 
you will say that the information
 is missing. You will only use the information provided to you in the 
prompt to answer the questions.
 You are not allowed to make anything up or use external information.
"""

In [60]:
def chat(messages: List[Dict[str, Any]], temperature: float = 0.0, model: str = "gpt-4o", tools: Dict[str, Any] = None, config={}) -> list[dict[str, any]]:
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        **config    
    )
    return response.choices[0].message.content

In [67]:
def main(input: str):
    answers = handle_user_input(input)
    critique = critique_answers(input, answers)
    if critique:
        answers = handle_user_input(" ".join(critique), answers)
    llm_response = chat(
        [
            {"role": "system", "content": main_prompt},
            *answers,
            {"role": "user", "content": f"The user question to answer: {input}"},
        ],
        model="gpt-4o",
    )  
    print(llm_response)

In [71]:
main("Who go with Epstein to the island?")

Original question: Who go with Epstein to the island?
Updated question: Who went with Jeffrey Epstein to his private island?
LLM tool calls: None
Tool response: []
Original question: Which island do you mean — Jeffrey Epstein's Little Saint James (U.S. Virgin Islands) or a different property? Are you asking about everyone who ever visited the island, or about who specifically travelled there with Epstein on a particular trip? If you mean a particular trip, what date or year (or approximate time period) are you asking about? Do you want a comprehensive list of all documented visitors, or only notable/public figures? Should the answer include only names verified by public records (flight logs, court filings, witness statements) or also include unverified allegations and media reports? Do you want each person's role or relationship to Epstein noted (for example: guest, staff, pilot, bodyguard, associate)? Do you require source citations for each name (links or references to the specific r