# Neo4j Hello World (Notebook) - SEC Use Case

This notebook connects to a local Neo4j **Community** instance (via Docker), creates a tiny graph, and queries it.

**Assumes** 
 
 
- Neo4j service is running at `bolt://localhost:${URI_PORT}` with the user and password set in the `.env` file. **Run `docker compose up -d`**.
- Ollama service is up on `http://localhost:11434` (ollama default). **Run `ollama serve` and pull the models: `ollama pull nomic-embed-text` (embeddings) and `ollama pull qwen3:8b` (chat model used by the QA chains)** (if not pulled yet).

In [None]:

# Dependencies

import os
from dotenv import load_dotenv  
import yaml
from pathlib import Path
from pprint import pprint
from termcolor import cprint
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_neo4j import Neo4jGraph

from helper_neo4j import vectorize_property
from helper_neo4j import neo4j_KGRAG_search


In [None]:
# Environment variables

load_dotenv()  # Load local environment variables from the `.env` file

# Fail early with a clear message: concatenating a missing port (None) to the
# URI prefix would otherwise raise an opaque TypeError.
uri_port = os.environ.get("URI_PORT")
if not uri_port:
    raise RuntimeError("URI_PORT is not set; define it in the .env file")

URI = "bolt://localhost:" + uri_port
NEO4J_USER = os.environ.get("NEO4J_USER")
NEO4J_PWD = os.environ.get("NEO4J_PASSWORD")
NEO4J_DB = os.getenv("NEO4J_DATABASE", "neo4j")    # 👈 choose DB here

# The password is deliberately not echoed: notebook outputs are stored in the
# file and travel with it when the notebook is shared or committed.
cprint(f"Connecting to Neo4j at {URI} with user {NEO4J_USER}", "green")

In [None]:
# Load the named Cypher queries used throughout the notebook

queries_path = Path("queries_SEC.yaml")
queries = yaml.safe_load(queries_path.read_text())
queries.keys()  # last expression of the cell: shows the available query names

In [None]:
# Neo4j LangChain wrapper instance, built from the connection settings read
# from the environment; `kg.query` runs every Cypher statement in later cells.

kg = Neo4jGraph(url=URI, username=NEO4J_USER, password=NEO4J_PWD, database=NEO4J_DB)

## 1+2. Create data with rich text (chunks)

In [None]:
# Source document: the Form 10-K filing of NetApp, stored as JSON
file_name = "./data/form10k/0000950170-23-027948.json"

# LangChain text splitter for the chunking step: chunk size 2000 with an
# overlap of 200, both measured with `len`; separators are not regexes.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)

def split_form10k_data_from_file(file, items=('item1', 'item1a', 'item7', 'item7a'),
                                 splitter=None):
    """Split selected sections of a Form 10-K JSON file into chunk records.

    Args:
        file: Path of the JSON file. The file name without directory and
            extension is used as the form id.
        items: Keys of the JSON document whose text is chunked, in order.
            Defaults to the four sections used by this notebook.
        splitter: Object with a ``split_text(text)`` method returning a list
            of strings. Defaults to the notebook-level ``text_splitter``.

    Returns:
        A list of dicts, one per chunk, holding the chunk text, its section
        (``f10kItem``), its position in the section (``chunkSeqId``), the
        constructed ``formId`` / ``uuid`` and the file-level metadata
        (``names``, ``cik``, ``cusip6``, ``source``).

    Raises:
        KeyError: If a requested item or a metadata key is missing in the file.
    """
    if splitter is None:
        splitter = text_splitter  # notebook-level default splitter

    # `with` guarantees the handle is closed once the JSON is parsed
    with open(file, encoding='utf-8') as json_file:
        data = json.load(json_file)

    # The form id only depends on the file name, so derive it once; `Path.stem`
    # also copes with paths that contain no directory separator.
    form_id = Path(file).stem

    chunks_with_metadata = []  # accumulate chunk records
    for item in items:

        print(f'Processing {item} from {file}')

        item_text_chunks = splitter.split_text(data[item])  # split the text into chunks

        # every chunk of the section is kept; there is no cap on their number
        for chunk_seq_id, chunk in enumerate(item_text_chunks):
            chunks_with_metadata.append({
                'text': chunk,
                'f10kItem': item,
                'chunkSeqId': chunk_seq_id,
                # constructed metadata...
                'formId': f'{form_id}',  # pulled from the filename
                'uuid': f'{form_id}-{item}-chunk{chunk_seq_id:04d}',
                # metadata from file...
                'names': data['names'],
                'cik': data['cik'],
                'cusip6': data['cusip6'],
                'source': data['source'],
            })

        print(f'\t{item} splitted into {len(item_text_chunks)} chunks')

    return chunks_with_metadata


# Chunk the selected sections of the filing; each record carries the chunk
# text together with its metadata.
chunks_dicts = split_form10k_data_from_file(file_name)

In [None]:
# Populate graph: check the connection, prepare the schema, wipe old data and
# create one `:Chunk` node per chunk record.

dbinfo = kg.query("CALL db.info()")
cprint(f"\nConnected to Neo4j database: {dbinfo[0]['name']}", "green")

cprint("\nCreating constraints (if not exist)", "green")
for constraint_query in queries["constraints"]:
    kg.query(constraint_query)

cprint("\nInit Cleanup.", "green")
for cleanup_query in queries["delete_all"]:
    kg.query(cleanup_query)

cprint("\nCreate data", "green")
node_count = 0  # stays 0 when there is nothing to insert
for node_count, chunk_dict in enumerate(chunks_dicts, start=1):
    print(f"Creating `:Chunk` node for chunk ID {chunk_dict['uuid']}")
    kg.query(queries["create_chunks"], params={'chunkParamDict': chunk_dict})

print(f"Created {node_count} nodes")

In [None]:
# Create connections and form nodes

# Create a node to represent the entire Form 10-K
# Get form metadata from any chunk

form_info_list = kg.query(queries["match_form_data"])

if not form_info_list:
    print("No chunks found in the database")
else:
    form_record = form_info_list[0]
    print("Form info retrieved:")
    pprint(form_record)

    # Create the Form node. The statement that runs is the `create_form_node`
    # entry of the YAML file; it receives the five values as individual
    # parameters instead of one nested dictionary.
    # NOTE(review): the statement is expected to look like
    #   MERGE (f:Form {formId: $formId})
    #     ON CREATE SET f.names = $names, f.source = $source,
    #                   f.cik = $cik, f.cusip6 = $cusip6
    # -- confirm against queries_SEC.yaml.
    form_fields = ('formId', 'names', 'source', 'cik', 'cusip6')
    kg.query(queries["create_form_node"],
             {field: form_record[field] for field in form_fields})

    # Verify the Form node was created: show the created Form node details
    for q in queries["match_form"]:
        result = kg.query(q)
        for r in result:
            pprint(dict(r))

    # Create a linked list of Chunk nodes for each section
    for form10kItemName in ['item1', 'item1a', 'item7', 'item7a']:
        kg.query(queries["link_chunks_to_sections"],
                 {'formId': form_record['formId'], 'f10kItem': form10kItemName})

    # Connect chunks to their parent form with a PART_OF relationship
    kg.query(queries["link_chunks_to_form"])

    # Create a SECTION relationship on first chunk of each section
    kg.query(queries["link_section_chunk_to_form"])
    

In [None]:
# Example cypher queries

# First chunk of the Item 1 section of the form
result = kg.query(
    queries["match_first_chunk_of_section"],
    {'formId': form_record['formId'], 'f10kItem': 'item1'},
)
first_chunk_info = dict(result[0])
print(first_chunk_info)

# Second chunk of the Item 1 section, reached from the first one
result = kg.query(queries["match_second_chunk"], {'uuid': first_chunk_info['uuid']})
next_chunk_info = dict(result[0])
print(next_chunk_info)

# Relationships between the form node and the first and second chunks of each
# section. Try it out in the browser!!
result = kg.query(queries["match_several_relations"], {'uuid': first_chunk_info['uuid']})
for r in result:
    print(r)

# Windows of three chunks, anchored on the second and then on the first chunk
result = kg.query(queries["match_window_1"], {'uuid': next_chunk_info['uuid']})
for r in result:
    print(r)

result = kg.query(queries["match_window_2"], {'uuid': first_chunk_info['uuid']})
for r in result:
    print(r)
  


## 3. Create property embeddings (first step into RAG) 

In [None]:
# Create the vector indexes declared in the YAML file
for q in queries["create_vector_indexes"]:
    kg.query(q)

# List the vector indexes now present in the database
results = kg.query("SHOW VECTOR INDEXES")
idx = results
cprint(f"\nFound {len(results)} vector index entries.", "green")
separator = "-" * 20
for r in results:
    cprint(separator, "green")
    pprint(r)


In [None]:
# Create embeddings for the `text` property of every `Chunk` node

vectorize_property(
    runner=kg.query,
    element="node",
    node_label="Chunk",
    source_property="text",
)

## 4. Search 

In [None]:
# KG RAG search over the chunk nodes

result = neo4j_KGRAG_search(
    runner=kg.query,
    element="node",
    query='In a single sentence, tell me about Netapp.',
    index='chunks_node_idx',
    source_property="text",
    main_property="uuid",
    top_k=10,
)

pprint(result, width=200, sort_dicts=False, indent=2)

# Save the combined context (empty string when the key is absent) to a text file
file = "SEC_context.txt"
combined_context = result.get("combined_context", "")
with open(file, 'w', encoding='utf-8') as f:
    f.write(combined_context)


**Create connections:**

Chunks belong to Forms, each Chunk is followed by the next Chunk of its section, and the first Chunk of each section is linked to the Form as the head of that section.

<p align="center">
  <img src="media/KGRAG_SEC_example.png">
</p>

<p align="center">
  <img src="media/KGRAG_SEC_example_2.png">
</p>



In [None]:
# Add investment information about what firms have invested in which companies (NetApp in our case is what matters)

import csv

# newline='' leaves line-ending handling to the csv module, as its
# documentation requires, so quoted fields containing line breaks parse correctly.
with open('./data/form13.csv', mode='r', newline='') as csv_file:
    # each row becomes a dictionary keyed by the CSV header
    all_form13s = list(csv.DictReader(csv_file))

len(all_form13s)

In [None]:
# Create the Company node for the first investment record
# (Manager nodes are created in later cells).

first_form13 = all_form13s[0]
cypher = """
MERGE (com:Company {cusip6: $cusip6})
    ON CREATE
        SET com.companyName = $companyName,
            com.cusip = $cusip
"""

company_fields = ('cusip6', 'companyName', 'cusip')
kg.query(cypher, {field: first_form13[field] for field in company_fields})

In [None]:
# Only one company exists so far (NetApp); fetch it and display it
cypher = """
MATCH (com:Company)
RETURN com LIMIT 1
"""

company_rows = kg.query(cypher)
pprint(company_rows[0])

In [None]:
# Compare the Form 13 company name with the names on the Form 10-K, then copy
# the Form's names onto the Company node with the same cusip6.

cypher = """
MATCH (com:Company), (form:Form)
WHERE com.cusip6 = form.cusip6
RETURN com.companyName, form.names
"""

name_rows = kg.query(cypher)
pprint(name_rows[0])

cypher = """
  MATCH (com:Company), (form:Form)
    WHERE com.cusip6 = form.cusip6
  SET com.names = form.names
"""

update_rows = kg.query(cypher)
print(update_rows)

In [None]:
# Link the company to its Form 10-K node with a FILED relationship
# (the two are paired by their shared cusip6)
cypher = """
  MATCH (com:Company), (form:Form)
    WHERE com.cusip6 = form.cusip6
  MERGE (com)-[:FILED]->(form)
"""

filed_rows = kg.query(cypher)
print(filed_rows)

In [None]:
# Create manager nodes for companies that have filed a Form 13 to report their investment in NetApp.
# Start with the single manager who filed the first Form 13 in the list.

cypher = """
MERGE (mgr:Manager {uuid: $uuid})
ON CREATE
    SET mgr.managerName = $managerName,
        mgr.managerAddress = $managerAddress
"""

# The manager's CIK serves as the node's uuid
params = dict(
    uuid=first_form13["managerCik"],
    managerName=first_form13["managerName"],
    managerAddress=first_form13["managerAddress"],
)

print(kg.query(cypher, params))


In [None]:
# Read back one Manager node to check that it exists in the graph
cypher = """
  MATCH (mgr:Manager)
  RETURN mgr LIMIT 1
"""


pprint(kg.query(cypher)[0])

In [None]:
# Create a uniqueness constraint on Manager.uuid to avoid duplicate managers
# (IF NOT EXISTS keeps the cell safe to re-run)
cypher ="""
  CREATE CONSTRAINT manager_unique 
  IF NOT EXISTS
  FOR (n:Manager) 
  REQUIRE n.uuid IS UNIQUE
"""

print(kg.query(cypher))

In [None]:
# Create a fulltext index of manager names to enable text search
# (IF NOT EXISTS keeps the cell safe to re-run)

cypher ="""
CREATE FULLTEXT INDEX fullTextManagerNames
  IF NOT EXISTS
  FOR (mgr:Manager) 
  ON EACH [mgr.managerName]
"""

print(kg.query(cypher))

In [None]:
# Search the fullTextManagerNames index for "royal bank" and print each
# matching manager name with its score
cypher ="""
CALL db.index.fulltext.queryNodes("fullTextManagerNames", 
      "royal bank") YIELD node, score
  RETURN node.managerName, score
"""

print(kg.query(cypher))

In [None]:
# Create a Manager node for every manager that filed a Form 13
cypher = """
  MERGE (mgr:Manager {uuid: $uuid})
    ON CREATE
        SET mgr.managerName = $managerName,
            mgr.managerAddress = $managerAddress
"""
for form13 in all_form13s:
    params = {
        "uuid": form13["managerCik"],
        "managerName": form13["managerName"],
        "managerAddress": form13["managerAddress"]
    }
    # Run the MERGE once per row. With the call outside the loop, only the
    # parameters of the last row would ever reach the database.
    kg.query(cypher, params)

print(f"Processed {len(all_form13s)} Form 13 rows")


# cypher = """
#   MERGE (mgr:Manager {uuid: $managerParam.managerCik})
#     ON CREATE
#         SET mgr.managerName = $managerParam.managerName,
#             mgr.managerAddress = $managerParam.managerAddress
# """
# # loop through all Form 13s
# for form13 in all_form13s:
#   kg.query(cypher, params={'managerParam': form13 })

In [None]:
# Count the Manager nodes currently in the graph
cypher = """
    MATCH (mgr:Manager) 
    RETURN count(mgr)
"""


pprint(kg.query(cypher)[0])

In [None]:
# Preview the manager/company pair matched by the first Form 13 row.
# This query only reads: it returns the names and the row itself, and creates
# no relationship yet.
cypher = """
  MATCH (mgr:Manager {uuid: $investmentParam.managerCik}), 
        (com:Company {cusip6: $investmentParam.cusip6})
  RETURN mgr.managerName, com.companyName, $investmentParam as investment
"""

kg.query(cypher, params={ 
    'investmentParam': first_form13 
})

In [None]:
# Create the OWNS_STOCK_IN relationship for the first Form 13 row. The MERGE is
# keyed on the reporting quarter; value and shares are converted to numbers and
# set only when the relationship is first created.
cypher = """
MATCH (mgr:Manager {uuid: $ownsParam.managerCik}), 
        (com:Company {cusip6: $ownsParam.cusip6})
MERGE (mgr)-[owns:OWNS_STOCK_IN { 
    reportCalendarOrQuarter: $ownsParam.reportCalendarOrQuarter
}]->(com)
ON CREATE
    SET owns.value  = toFloat($ownsParam.value), 
        owns.shares = toInteger($ownsParam.shares)
RETURN mgr.managerName, owns.reportCalendarOrQuarter, com.companyName
"""

kg.query(cypher, params={ 'ownsParam': first_form13 })

In [None]:
# Read back the shares and value stored on the OWNS_STOCK_IN relationship
# between the first Form 13 row's manager and company
kg.query("""
MATCH (mgr:Manager {uuid: $ownsParam.managerCik})
-[owns:OWNS_STOCK_IN]->
        (com:Company {cusip6: $ownsParam.cusip6})
RETURN owns { .shares, .value }
""", params={ 'ownsParam': first_form13 })

In [None]:
# Create the OWNS_STOCK_IN relationship for every Form 13 row: one
# relationship per manager, company and reporting quarter
cypher = """
MATCH (mgr:Manager {uuid: $ownsParam.managerCik}), 
        (com:Company {cusip6: $ownsParam.cusip6})
MERGE (mgr)-[owns:OWNS_STOCK_IN { 
    reportCalendarOrQuarter: $ownsParam.reportCalendarOrQuarter 
    }]->(com)
  ON CREATE
    SET owns.value  = toFloat($ownsParam.value), 
        owns.shares = toInteger($ownsParam.shares)
"""

# each row is passed whole; the query picks the fields it needs
for form13 in all_form13s:
    kg.query(cypher, params={'ownsParam': form13})

In [None]:
# Count the OWNS_STOCK_IN relationships between managers and companies
cypher = """
  MATCH (:Manager)-[owns:OWNS_STOCK_IN]->(:Company)
  RETURN count(owns) as investments
"""

kg.query(cypher)

In [None]:
# Refresh the cached graph schema and display it. Only the schema is printed:
# an extra positional argument to print() would be appended to the output.
kg.refresh_schema()
print(kg.schema)

In [None]:
# Pick the uuid of one Chunk node; it is the starting point (`ref_chunk_id`)
# for the path queries that lead to the number of investors
cypher = """
    MATCH (chunk:Chunk)
    RETURN chunk.uuid as uuid LIMIT 1
    """

chunk_rows = kg.query(cypher)
print(chunk_rows)
chunk_first_row = chunk_rows[0]
print(chunk_first_row)
ref_chunk_id = chunk_first_row['uuid']
print(ref_chunk_id)

In [None]:
# Build up path from Form 10-K chunk to companies and managers
# First hop: from the reference chunk to the Form it is PART_OF
cypher = """
    MATCH (:Chunk {uuid: $chunkIdParam})-[:PART_OF]->(f:Form)
    RETURN f.source
    """

kg.query(cypher, params={'chunkIdParam': ref_chunk_id})

In [None]:
# Second hop: from the chunk's Form to the Company that FILED it
cypher = """
MATCH (:Chunk {uuid: $chunkIdParam})-[:PART_OF]->(f:Form),
    (com:Company)-[:FILED]->(f)
RETURN com.companyName as name
"""

kg.query(cypher, params={'chunkIdParam': ref_chunk_id})

In [None]:
# Use queries to build additional context for LLM
# Third hop: count the managers that own stock in the company behind the chunk
cypher = """
MATCH (:Chunk {uuid: $chunkIdParam})-[:PART_OF]->(f:Form),
        (com:Company)-[:FILED]->(f),
        (mgr:Manager)-[:OWNS_STOCK_IN]->(com)
RETURN com.companyName, 
        count(mgr.managerName) as numberOfinvestors 
LIMIT 1
"""

kg.query(cypher, params={
    'chunkIdParam': ref_chunk_id
})

In [None]:
# Build one plain-text sentence per investor (at most 10 rows) for the company
# behind the reference chunk. The query calls apoc.number.format, so the APOC
# plugin must be available on the server.
# The next cell runs this same `cypher` string again.
cypher = """
    MATCH (:Chunk {uuid: $chunkIdParam})-[:PART_OF]->(f:Form),
        (com:Company)-[:FILED]->(f),
        (mgr:Manager)-[owns:OWNS_STOCK_IN]->(com)
    RETURN mgr.managerName + " owns " + owns.shares + 
        " shares of " + com.companyName + 
        " at a value of $" + 
        apoc.number.format(toInteger(owns.value)) AS text
    LIMIT 10
    """
kg.query(cypher, params={
    'chunkIdParam': ref_chunk_id
})

In [None]:
# Run the investor-sentence query held in `cypher` and show the first sentence.
# Only the text is printed: an extra positional argument to print() would be
# appended to the output.
results = kg.query(cypher, params={
    'chunkIdParam': ref_chunk_id
})
print(results[0]['text'])

In [None]:
# Create a plain Question Answer chain: similarity search only, no augmentation by Cypher query

from langchain_community.vectorstores import Neo4jVector
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_ollama import ChatOllama
from langchain_ollama import OllamaEmbeddings


vector_store = Neo4jVector.from_existing_graph(
    embedding=OllamaEmbeddings(model='nomic-embed-text'),
    url=URI,
    username=NEO4J_USER,
    password=NEO4J_PWD,
    # Query the same database as `kg`; without this argument the store uses
    # the library's default database, which may differ from NEO4J_DB.
    database=NEO4J_DB,
    index_name="chunks_node_idx",
    node_label="Chunk",
    text_node_properties=["text"],
    embedding_node_property="embedding",
)
# Create a retriever from the vector store
retriever = vector_store.as_retriever()

# Create a chatbot Question & Answer chain from the retriever; the "stuff"
# chain type is selected
plain_chain = RetrievalQAWithSourcesChain.from_chain_type(
    ChatOllama(model='qwen3:8b', temperature=0, reasoning=False),
    chain_type="stuff",
    retriever=retriever
)

In [None]:
# Create a second QA chain: Augment similarity search using sentences found by the investment query above
# The retrieval query below refers to `node` and `score` without defining them;
# presumably the vector store supplies them from the similarity search -- confirm.
# It prepends investor sentences to the chunk text and calls the
# apoc.number.format / apoc.text.join functions.
# NOTE(review): `ORDER BY ... LIMIT 10` sits in a WITH that still carries `node`,
# so the limit looks like it applies to all (node, manager) rows together rather
# than per retrieved chunk -- confirm this is intended.
investment_retrieval_query = """
MATCH (node)-[:PART_OF]->(f:Form),
    (f)<-[:FILED]-(com:Company),
    (com)<-[owns:OWNS_STOCK_IN]-(mgr:Manager)
WITH node, score, mgr, owns, com 
    ORDER BY owns.shares DESC LIMIT 10
WITH collect (
    mgr.managerName + 
    " owns " + owns.shares + 
    " shares in " + com.companyName + 
    " at a value of $" + 
    apoc.number.format(toInteger(owns.value)) + "." 
) AS investment_statements, node, score
RETURN apoc.text.join(investment_statements, "\n") + 
    "\n" + node.text AS text,
    score,
    { 
      source: node.source
    } as metadata
"""

# Reuse the existing `chunks_node_idx` index, with the custom retrieval query
vector_store_with_investment = Neo4jVector.from_existing_index(
    OllamaEmbeddings(model='nomic-embed-text'),
    url=URI,
    username=NEO4J_USER,
    password=NEO4J_PWD,
    database=NEO4J_DB,
    index_name="chunks_node_idx",
    text_node_property="text",
    retrieval_query=investment_retrieval_query,
)

# Create a retriever from the vector store
retriever_with_investments = vector_store_with_investment.as_retriever()

# Create a chatbot Question & Answer chain from the retriever
investment_chain = RetrievalQAWithSourcesChain.from_chain_type(
    ChatOllama(model='qwen3:8b',temperature=0, reasoning = False), 
    chain_type="stuff", 
    retriever=retriever_with_investments
)

In [None]:
# Ask the plain chain (similarity search only)
question = "In a single sentence, tell me about Netapp."
plain_chain(
    {"question": question},
    return_only_outputs=True,
)

In [None]:
# Ask the same `question` to the chain augmented with investment statements
investment_chain(
    {"question": question},
    return_only_outputs=True,
)

# Writing Cypher with an LLM

Use few-shot learning to teach an LLM to write Cypher
- You'll use the `qwen3:8b` model served by Ollama (through `ChatOllama`)
- You'll also use a new Neo4j integration within LangChain called **GraphCypherQAChain**

In [None]:
from langchain.prompts.prompt import PromptTemplate
from langchain.chains import GraphCypherQAChain

# Prompt template V1: instructions plus one few-shot example.
# `{schema}` and `{question}` are the two placeholders of each template.
# NOTE(review): the examples in V1-V3 use :Address nodes, LOCATED_AT
# relationships and a `fullTextCompanyNames` index; no cell of this notebook
# creates them -- confirm they exist in the target graph before relying on them.
CYPHER_GENERATION_TEMPLATE_V1 = """Task:Generate Cypher statement to 
query a graph database.
Instructions:
Use only the provided relationship types and properties in the 
schema. Do not use any other relationship types or properties that 
are not provided.
Schema:
{schema}
Note: Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than 
for you to construct a Cypher statement.
Do not include any text except the generated Cypher statement.
Examples: Here are a few examples of generated Cypher 
statements for particular questions:

# What investment firms are in San Francisco?
MATCH (mgr:Manager)-[:LOCATED_AT]->(mgrAddress:Address)
    WHERE mgrAddress.city = 'San Francisco'
RETURN mgr.managerName
The question is:
{question}"""



# Prompt template V2: V1's instructions plus a second example (distance search).
CYPHER_GENERATION_TEMPLATE_V2 = """Task:Generate Cypher statement to query a graph database.
Instructions:
Use only the provided relationship types and properties in the schema.
Do not use any other relationship types or properties that are not provided.
Schema:
{schema}
Note: Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
Do not include any text except the generated Cypher statement.
Examples: Here are a few examples of generated Cypher statements for particular questions:

# What investment firms are in San Francisco?
MATCH (mgr:Manager)-[:LOCATED_AT]->(mgrAddress:Address)
    WHERE mgrAddress.city = 'San Francisco'
RETURN mgr.managerName

# What investment firms are near Santa Clara?
  MATCH (address:Address)
    WHERE address.city = "Santa Clara"
  MATCH (mgr:Manager)-[:LOCATED_AT]->(managerAddress:Address)
    WHERE point.distance(address.location, 
        managerAddress.location) < 10000
  RETURN mgr.managerName, mgr.managerAddress

The question is:
{question}"""


# Prompt template V3: V2 plus a third example (full-text lookup of a company
# followed by the text of its Item 1 section).
CYPHER_GENERATION_TEMPLATE_V3 = """Task:Generate Cypher statement to query a graph database.
Instructions:
Use only the provided relationship types and properties in the schema.
Do not use any other relationship types or properties that are not provided.
Schema:
{schema}
Note: Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
Do not include any text except the generated Cypher statement.
Examples: Here are a few examples of generated Cypher statements for particular questions:

# What investment firms are in San Francisco?
MATCH (mgr:Manager)-[:LOCATED_AT]->(mgrAddress:Address)
    WHERE mgrAddress.city = 'San Francisco'
RETURN mgr.managerName

# What investment firms are near Santa Clara?
  MATCH (address:Address)
    WHERE address.city = "Santa Clara"
  MATCH (mgr:Manager)-[:LOCATED_AT]->(managerAddress:Address)
    WHERE point.distance(address.location, 
        managerAddress.location) < 10000
  RETURN mgr.managerName, mgr.managerAddress

# What does Palo Alto Networks do?
  CALL db.index.fulltext.queryNodes(
         "fullTextCompanyNames", 
         "Palo Alto Networks"
         ) YIELD node, score
  WITH node as com
  MATCH (com)-[:FILED]->(f:Form),
    (f)-[s:SECTION]->(c:Chunk)
  WHERE s.f10kItem = "item1"
RETURN c.text

The question is:
{question}"""

# Prompt handed to the chain. Only the V1 template is wired in here; V2 and V3
# are defined but not used by this cell.
CYPHER_GENERATION_PROMPT = PromptTemplate(
    input_variables=["schema", "question"], 
    template=CYPHER_GENERATION_TEMPLATE_V1
)

# Chain that asks the LLM for a Cypher statement and runs it against `kg`;
# verbose=True is set to log the chain's intermediate steps.
# NOTE(review): some LangChain releases reportedly refuse to build this chain
# unless `allow_dangerous_requests=True` is passed -- confirm for the installed version.
cypherChain = GraphCypherQAChain.from_llm(
    ChatOllama(model='qwen3:8b',temperature=0,reasoning=False),
    graph=kg,
    verbose=True,
    cypher_prompt=CYPHER_GENERATION_PROMPT,
)

def prettyCypherChain(question: str) -> None:
    """Run *question* through ``cypherChain`` and print the answer wrapped at 60 columns.

    Args:
        question: Natural-language question for the chain.

    Returns:
        None; the answer is only printed.
    """
    import textwrap  # local import: the notebook's import cell does not bring it in

    response = cypherChain.run(question)
    # 60 is the wrap width; passed straight to print() it would be printed
    # after the answer instead of formatting it.
    print(textwrap.fill(response, 60))
    
# Try the chain on a question about the investment data
prettyCypherChain("What investment firms have invested more money?")

In [None]:
# Call the method: without parentheses the expression only references the
# bound method and the schema is never refreshed.
kg.refresh_schema()

In [None]:
# Refresh the cached graph schema and display it. Only the schema is printed:
# an extra positional argument to print() would be appended to the output.
kg.refresh_schema()
print(kg.schema)