In [2]:
import neo4j
from neo4j import GraphDatabase
import json
from dotenv import load_dotenv
import os

load_dotenv()

URI = os.getenv('NEO4J_URI')
USERNAME = os.getenv('NEO4J_USERNAME')
PASSWORD = os.getenv('NEO4J_PASSWORD')

QUESTION = "Who is Epstein's main target?"

driver = GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD))
try:
    driver.verify_connectivity()
    print("✅ Successfully connected to Neo4j!")
except Exception as e:
    print(f"❌ Connection failed: {e}")

✅ Successfully connected to Neo4j!


## Vector Search

In [3]:
from openai import OpenAI
from typing import List, Dict, Any

client = OpenAI()

def get_embedding(text, model="text-embedding-3-small"):
    text = text.replace("\n", " ")
    return client.embeddings.create(input = [text], model=model).data[0].embedding

In [4]:
def vector_search(query: str, TOP_K = 5) -> List[Dict]:

    query_vector = get_embedding(query)

    cypher = """
    CALL db.index.vector.queryNodes(
        'event_embedding',
        $topK,
        $embedding
    )
    YIELD node, score

    MATCH (actor:Entity)-[:PARTICIPATED_IN {role:'actor'}]->(node)
    MATCH (target:Entity)-[:PARTICIPATED_IN {role:'target'}]->(node)
    OPTIONAL MATCH (node)-[:LOCATED_IN]->(l:Location)
    OPTIONAL MATCH (node)-[:HAS_TAG]->(t:Tag)

    RETURN
        node.id AS event_id,
        actor.name AS actor,
        node.action AS action,
        target.name AS target,
        l.name AS location,
        node.explicit_topic AS explicit_topic,
        collect(DISTINCT t.name) AS tags,
        score
    ORDER BY score DESC
    """

    with driver.session() as session:
        results = session.run(
            cypher,
            embedding=query_vector,
            topK=TOP_K
        )

        return [dict(record) for record in results]

In [5]:
vector_search(QUESTION)

[{'event_id': 9825,
  'actor': 'Jeffrey Epstein',
  'action': 'maintained relationship with',
  'target': 'multiple billionaires and world leaders',
  'location': 'Manhattan',
  'explicit_topic': 'continued cultivation of high-level relationships',
  'tags': ['Travel Logistics', 'Victim Advocacy', 'Media Strategy'],
  'score': 0.8232795596122742},
 {'event_id': 15868,
  'actor': 'Jeffrey Epstein',
  'action': 'identified targets in to',
  'target': 'unknown person A (HOUSE_OVERSIGHT_027281) (HOUSE_OVERSIGHT_027281)',
  'location': None,
  'explicit_topic': 'noting Landowski, Cohen, and Weisselberg are mentioned in complaint',
  'tags': ['Media Coverage', 'Legal Representation'],
  'score': 0.8230377435684204},
 {'event_id': 16039,
  'actor': 'Jeffrey Epstein',
  'action': 'clarified political target to',
  'target': 'unknown person A (HOUSE_OVERSIGHT_027346) (HOUSE_OVERSIGHT_027346)',
  'location': None,
  'explicit_topic': 'focus on Charles Koch as primary target rather than David Koc

In [29]:
from typing import List, Dict


def format_events_for_llm(events: List[Dict]) -> str:
    
    context_blocks = [e['text'] + '\n' for e in events]
    
    return "\n---\n".join(context_blocks)

In [30]:
def generate_answer(question: str, events: List[Dict]) -> str:
    context = format_events_for_llm(events)

    system_message = """You're an Epstein expert, but can only use the 
    provided documents to respond to the questions."""

    user_message = f"""
            Use the following documents to answer the question that will follow:
            {context}
            ---
            The question to answer using information only from the above documents:
            {question}"""

    stream = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message}
        ],
        stream=True,
    )

    for chunk in stream:
        print(chunk.choices[0].delta.content or "", end="")

In [34]:
def answer_question(question: str) -> str:
    events = vector_search(question)
    answer = generate_answer(question, events)
    return answer

In [None]:
answer_question(
    QUESTION
)

Jeffrey Epstein owned properties in several locations. These include a ranch in New Mexico, an apartment in Paris, a Caribbean island, a house in Palm Beach, and a property in Manhattan, New York.

## Hybrid search

In [14]:
def hybrid_search(query: str, TOP_K = 5) -> List[Dict]:

    query_vector = get_embedding(query)

    HYBRID_QUERY = """
    CALL {
        // Vector index
        CALL db.index.vector.queryNodes('event_embedding', $k, $embedding)
        YIELD node, score
        WITH collect({node: node, score: score}) AS nodes, max(score) AS max
        UNWIND nodes AS n
        RETURN n.node AS node, (n.score / max) AS score

        UNION

        // Fulltext index
        CALL db.index.fulltext.queryNodes('event_text', $q, {limit: $k})
        YIELD node, score
        WITH collect({node: node, score: score}) AS nodes, max(score) AS max
        UNWIND nodes AS n
        RETURN n.node AS node, (n.score / max) AS score
    }

    // Deduplicate + rerank
    WITH node, max(score) AS score
    ORDER BY score DESC
    LIMIT $k

    RETURN
        node.id AS id,
        node.text AS text,
        score
    """
    
    with driver.session() as session:
        results = session.run(
            HYBRID_QUERY,
            q=query,
            embedding=query_vector,
            k=TOP_K,
        )

        return [dict(r) for r in results]

In [19]:
hybrid_search(QUESTION)

[{'id': 9825,
  'text': 'Jeffrey Epstein maintained relationship with multiple billionaires and world leaders. Location: Manhattan. Topic: continued cultivation of high-level relationships. Context: persistence_of_influence_despite_criminal_conviction.',
  'score': 1.0},
 {'id': 48834,
  'text': 'Don Marquis said A hypocrite is a person who— but who isn’t?. Topic: philosophical quotation on hypocrisy. Context: providing reflective commentary.',
  'score': 1.0},
 {'id': 15868,
  'text': 'Jeffrey Epstein identified targets in to unknown person A (HOUSE_OVERSIGHT_027281) (HOUSE_OVERSIGHT_027281). Topic: noting Landowski, Cohen, and Weisselberg are mentioned in complaint. Context: tracking who faces legal exposure from Trump Foundation litigation.',
  'score': 0.9997062771193206},
 {'id': 16039,
  'text': 'Jeffrey Epstein clarified political target to unknown person A (HOUSE_OVERSIGHT_027346) (HOUSE_OVERSIGHT_027346). Topic: focus on Charles Koch as primary target rather than David Koch. C

## Step-back prompting

In [20]:
stepback_system_message = f""" 
You are an expert at world knowledge. Your task is to step back
and paraphrase a question to a more generic step-back question, which
is easier to answer. Here are a few examples
 
"input": "Could the members of The Police perform lawful arrests?"
"output": "what can the members of The Police do?"

"input": "Jefferey Epstein’s was born in what country?"
"output": "what is Jefferey Epstein’s personal history?"
"""

In [21]:
def generate_stepback(question: str):
    user_message = f"""{question}"""
    step_back_question = client.responses.create(
        model="gpt-5-mini",
        input=[
            {"role": "system", "content": stepback_system_message},
            {"role": "user", "content": user_message},
        ]
    ).output_text
    return step_back_question

In [22]:
step_back_question = generate_stepback(QUESTION)
print(f"Stepback results: {step_back_question}")

Stepback results: what kinds of people did Epstein target?


In [33]:
def rag_pipeline(question: str) -> str:
    stepback_prompt = generate_stepback(question)
    print(stepback_prompt)
    events = hybrid_search(stepback_prompt)
    print(events)
    answer = generate_answer(question, events)
    return answer

In [34]:
rag_pipeline(QUESTION)

what types of people did Jeffrey Epstein target?
[{'id': 3608, 'text': 'Jeffrey Epstein deliberately targeted disadvantaged girls from single-parent families. Topic: predatory targeting of economically vulnerable youth. Context: exploitation of social inequality and poverty.', 'score': 1.0}, {'id': 55920, 'text': 'M. Hoffman coauthored Cooperate without looking: Why we care what people think and not just what they do. Topic: study of cooperation based on perceived thoughts. Context: exploration of social influence on behavior.', 'score': 1.0}, {'id': 33285, 'text': 'Jeffrey Epstein exploited through connections and shared with adult male peers including royalty, politicians, academicians, businessmen. Topic: trafficking victim to high-level powerful individuals. Context: leveraging victim for networking and influence cultivation.', 'score': 0.9752890527647559}, {'id': 48044, 'text': 'Jeffrey Epstein enjoys social life with young women. Topic: documented preference for socializing with 

## Text to Cypher

In [37]:
def get_structured_schema(driver: neo4j.Driver) -> Dict[str, Any]:

    NODE_PROPERTIES_QUERY = """
    CALL apoc.meta.data()
    YIELD label, other, elementType, type, property
    WHERE NOT type = "RELATIONSHIP" AND elementType = "node"
    WITH label AS nodeLabels, collect({property:property, type:type}) AS properties
    RETURN {labels: nodeLabels, properties: properties} AS output
    """
    REL_PROPERTIES_QUERY = """
    CALL apoc.meta.data()
    YIELD label, other, elementType, type, property
    WHERE NOT type = "RELATIONSHIP" AND elementType = "relationship"
    WITH label AS relType, collect({property:property, type:type}) AS properties
    RETURN {type: relType, properties: properties} AS output
    """
    REL_QUERY = """
    CALL apoc.meta.data()
    YIELD label, other, elementType, type, property
    WHERE type = "RELATIONSHIP" AND elementType = "node"
    UNWIND other AS other_node
    RETURN {start: label, type: property, end: toString(other_node)} AS output
    """

    def _run_query(session, q):
        return [rec.get("output") for rec in session.run(q)]

    with driver.session() as session:
        node_properties = _run_query(session, NODE_PROPERTIES_QUERY)
        rel_properties = _run_query(session, REL_PROPERTIES_QUERY)
        relationships = _run_query(session, REL_QUERY)

    node_props = {el["labels"]: el["properties"] for el in node_properties if el}
    rel_props = {el["type"]: el["properties"] for el in rel_properties if el}

    return {
        "node_props": node_props,
        "rel_props": rel_props,
        "relationships": relationships,
    }

In [38]:
def get_schema(structured_schema: Dict[str, Any]) -> str:
    def _format_props(props: List[Dict[str, Any]]) -> str:
        return ", ".join(f"{prop['property']}: {prop['type']}" for prop in props)

    formatted_node_props = [
        f"{label} {{{_format_props(props)}}}"
        for label, props in structured_schema.get("node_props", {}).items()
    ]

    formatted_rel_props = [
        f"{rel_type} {{{_format_props(props)}}}"
        for rel_type, props in structured_schema.get("rel_props", {}).items()
    ]

    formatted_rels = [
        f"(:{element['start']})-[:{element['type']}]->(:{element['end']})"
        for element in structured_schema.get("relationships", [])
    ]

    return "\n".join(
        [
            "Node labels and properties:",
            "\n".join(formatted_node_props) if formatted_node_props else "  (none)",
            "Relationship types and properties:",
            "\n".join(formatted_rel_props) if formatted_rel_props else "  (none)",
            "The relationships:",
            "\n".join(formatted_rels) if formatted_rels else "  (none)",
        ]
    )

In [39]:
get_schema(get_structured_schema(driver))

'Node labels and properties:\nEntity {name: STRING}\nEvent {timestamp: STRING, text: STRING, created_at: DATE_TIME, embedding: LIST, sequence_order: INTEGER, explicit_topic: STRING, implicit_topic: STRING, id: INTEGER, action: STRING}\nDocument {doc_id: STRING}\nTag {name: STRING}\nLocation {name: STRING}\nRelationship types and properties:\nPARTICIPATED_IN {role: STRING}\nThe relationships:\n(:Entity)-[:PARTICIPATED_IN]->(:Event)\n(:Event)-[:FROM_DOCUMENT]->(:Document)\n(:Event)-[:HAS_TAG]->(:Tag)\n(:Event)-[:LOCATED_IN]->(:Location)'

In [42]:
prompt_template = """
Instructions:
Generate Cypher statement to query a graph database to get the data to answer 
the following user question.
Graph database schema:
Use only the provided relationship types and properties in the schema.
Do not use any other relationship types or properties that are not provided in 
the schema.
{schema}
Terminology mapping:
This section is helpful to map terminology between the user question and the 
graph database schema.
{terminology}
Examples:
The following examples provide useful patterns for querying the graph database.
{examples}
Format instructions:
Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than for you to
construct a Cypher statement.
Do not include any text except the generated Cypher statement.
ONLY RESPOND WITH CYPHER—NO CODE BLOCKS.
User question: {question}
"""

In [44]:
structured_schema = get_structured_schema(driver)
schema_string = get_schema(structured_schema)

terminology_string = """
Entities:  When a user asks about a person by trade like politician, celebrity, millionaire, or criminal, they are referring to a node with the label 'Entity'.
Location: When a user asks about places.
"""

examples = [
    (
        "Who are the two people interated in most event together?",
        "MATCH (p1:Entity)-[:PARTICIPATED_IN]->(e:Event)<-[:PARTICIPATED_IN]-(p2:Entity) "
        "WHERE p1 <> p2 RETURN p1.name, p2.name, COUNT(e) AS EVENT_COUNT "
        "ORDER BY EVENT_COUNT DESC LIMIT 1"
    )
]

examples_text = "\n".join(f"Question: {q}\nCypher: {c}" for q, c in examples)

full_prompt = prompt_template.format(
    question=QUESTION,
    schema=schema_string,
    terminology=terminology_string,
    examples=examples_text,
)

print(full_prompt)


Instructions:
Generate Cypher statement to query a graph database to get the data to answer 
the following user question.
Graph database schema:
Use only the provided relationship types and properties in the schema.
Do not use any other relationship types or properties that are not provided in 
the schema.
Node labels and properties:
Entity {name: STRING}
Event {timestamp: STRING, embedding: LIST, text: STRING, created_at: DATE_TIME, id: INTEGER, implicit_topic: STRING, explicit_topic: STRING, action: STRING, sequence_order: INTEGER}
Document {doc_id: STRING}
Tag {name: STRING}
Location {name: STRING}
Relationship types and properties:
PARTICIPATED_IN {role: STRING}
The relationships:
(:Entity)-[:PARTICIPATED_IN]->(:Event)
(:Event)-[:FROM_DOCUMENT]->(:Document)
(:Event)-[:HAS_TAG]->(:Tag)
(:Event)-[:LOCATED_IN]->(:Location)
Terminology mapping:
This section is helpful to map terminology between the user question and the 
graph database schema.

Entities:  When a user asks about a pers

In [45]:
def generate_cypher(prompt: str):
    step_back_question = client.responses.create(
        model="gpt-5-mini",
        input=[
            {"role": "system", "content": stepback_system_message},
            {"role": "user", "content": prompt},
        ]
    ).output_text
    return step_back_question
generate_cypher(full_prompt)

'MATCH (ep:Entity)-[r1:PARTICIPATED_IN]->(ev:Event)<-[r2:PARTICIPATED_IN]-(t:Entity)\nWHERE toLower(ep.name) CONTAINS "epstein" AND r2.role = "target" AND t <> ep\nRETURN t.name AS target_name, COUNT(ev) AS event_count\nORDER BY event_count DESC\nLIMIT 1'