# Neo4j Hello World (Notebook) - SEC Use Case

This notebook connects to a local Neo4j **Community** instance (via Docker), creates a tiny graph, and queries it.

**Assumes** 
 
 
- Neo4j service is running at `bolt://localhost:${URI_PORT}` with the user and password set in the `.env` file. **Run `docker compose up -d`**.
- Ollama service is up on `http://localhost:11434` (ollama default). **Run `ollama serve` and pull the model `ollama pull nomic-embed-text`** (if not pulled yet).

In [None]:

# Dependencies

import os
from dotenv import load_dotenv  
import yaml
from pathlib import Path
from pprint import pprint
from termcolor import cprint
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter
from neo4j import GraphDatabase

from helper_neo4j import vectorize_property
from helper_neo4j import neo4j_KGRAG_search


In [2]:
# Environment variables

load_dotenv()  # Load local environment variables

URI = "bolt://localhost:" + os.environ.get("URI_PORT")
NEO4J_USER = os.environ.get("NEO4J_USER")
NEO4J_PWD = os.environ.get("NEO4J_PASSWORD")
NEO4J_DB = os.getenv("NEO4J_DATABASE", "neo4j")    # 👈 choose DB here

cprint(f"Connecting to Neo4j at {URI} with user {NEO4J_USER} and password {NEO4J_PWD}", "green")

[32mConnecting to Neo4j at bolt://localhost:7687 with user neo4j and password test1234[0m


In [3]:
# Load cypher queries

queries = yaml.safe_load(Path("queries_SEC.yaml").read_text())
queries.keys()  # list available queries

dict_keys(['constraints', 'create_chunks', 'create_vector_indexes', 'delete_all'])

In [None]:
# Neo4j Driver instance

driver = GraphDatabase.driver(uri=URI, auth=(NEO4J_USER, NEO4J_PWD))

## 1+2. Create data with rich text (chunks)

In [5]:
# Load data from file

file_name = "./data/form10k/0000950170-23-027948.json" # form10k for the Netapp company

# LangChain Text splitter for chunking process
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000,
    chunk_overlap  = 200,
    length_function = len,
    is_separator_regex = False,
)

def split_form10k_data_from_file(file):
    
    chunks_with_metadata = [] # accumlate chunk records
    
    data = json.load(open(file)) # open the json file
    for item in ['item1','item1a','item7','item7a']: # pull these keys from the json
        
        print(f'Processing {item} from {file}') 
        
        item_text_chunks = text_splitter.split_text(data[item]) # split the text into chunks
        
        chunk_seq_id = 0
        for chunk in item_text_chunks: # only take the first 20 chunks
            
            form_id = file[file.rindex('/') + 1:file.rindex('.')] # extract form id from file name
            
            # finally, construct a record with metadata and the chunk text
            chunks_with_metadata.append({
                'text': chunk, 
                'f10kItem': item,
                'chunkSeqId': chunk_seq_id,
                # constructed metadata...
                'formId': f'{form_id}', # pulled from the filename
                'uuid': f'{form_id}-{item}-chunk{chunk_seq_id:04d}',
                # metadata from file...
                'names': data['names'],
                'cik': data['cik'],
                'cusip6': data['cusip6'],
                'source': data['source'],
            })
            
            chunk_seq_id += 1
            
        print(f'\t{item} splitted into {chunk_seq_id} chunks')
        
    return chunks_with_metadata


chunks_dicts = split_form10k_data_from_file(file_name)

Processing item1 from ./data/form10k/0000950170-23-027948.json
	item1 splitted into 254 chunks
Processing item1a from ./data/form10k/0000950170-23-027948.json
	item1a splitted into 1 chunks
Processing item7 from ./data/form10k/0000950170-23-027948.json
	item7 splitted into 1 chunks
Processing item7a from ./data/form10k/0000950170-23-027948.json
	item7a splitted into 1 chunks


In [6]:
# Populate graph

with driver.session(database=NEO4J_DB) as session:
   
    dbinfo = session.run("CALL db.info()").single()
    cprint(f"\nConnected to Neo4j database: {dbinfo['name']}", "green")
    
    cprint("\nCreating constraints (if not exist)", "green")
    for q in queries["constraints"]:
        session.run(q)
    
    cprint("\nInit Cleanup.", "green")
    for q in queries["delete_all"]:
        session.run(q)
    
    cprint("\nCreate data", "green")
    node_count = 0
    for chunk_dict in chunks_dicts:
        print(f"Creating `:Chunk` node for chunk ID {chunk_dict['uuid']}")
        session.run(queries["create_chunks"], 
        parameters={
            'chunkParamDict': chunk_dict
            }
        )
        node_count += 1
        
    print(f"Created {node_count} nodes")

[32m
Connected to Neo4j database: neo4j[0m
[32m
Creating constraints (if not exist)[0m
[32m
Init Cleanup.[0m
[32m
Create data[0m
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0000
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0001
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0002
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0003
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0004
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0005
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0006
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0007
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0008
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0009
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0010
Creating `:Chunk` node for chunk ID 0000950170-23-027948-ite

In [None]:
# Create conections and form nodes

with driver.session(database=NEO4J_DB) as session:
    
    # Create a node to represent the entire Form 10-K
    # Get form metadata from any chunk

    result = session.run(queries["match_form_data"])
    form_info_list = list(result)
    
    if not form_info_list:
        print("No chunks found in the database")
    else:
        form_record = form_info_list[0]
        print("Form info retrieved:")
        pprint(dict(form_record))
        
        # Create the Form node with individual parameters
        cypher = """
            MERGE (f:Form {formId: $formId})
              ON CREATE 
                SET f.names = $names,
                    f.source = $source,
                    f.cik = $cik,
                    f.cusip6 = $cusip6
        """
        
        # Pass individual parameters instead of nested dictionary
        session.run(queries["create_form_node"], {
            'formId': form_record['formId'],
            'names': form_record['names'],
            'source': form_record['source'],
            'cik': form_record['cik'],
            'cusip6': form_record['cusip6']
        })
        
            
        # Verify the Form node was created, Show the created Form node details
        for q in queries["match_form"]:
            result = session.run(q)
            for r in result:
                pprint(dict(r))
            
        # Create a linked list of Chunk nodes for each section
        for form10kItemName in ['item1', 'item1a', 'item7', 'item7a']:
            session.run(queries["link_chunks_to_sections"], {'formId': form_record['formId'],
                                        'f10kItem': form10kItemName})
            
        # Connect chunks to their parent form with a PART_OF relationship
        session.run(queries["link_chunks_to_form"])
        
        # Create a SECTION relationship on first chunk of each section
        session.run(queries["link_section_chunk_to_form"])
    

Form info retrieved:
{'cik': '1002047',
 'cusip6': '64110D',
 'formId': '0000950170-23-027948',
 'names': ['Netapp Inc', 'NETAPP INC'],
 'source': 'https://www.sec.gov/Archives/edgar/data/1002047/000095017023027948/0000950170-23-027948-index.htm'}
Forms created: {'formCount': 1}
Created Form node:
{'cik': '1002047',
 'cusip6': '64110D',
 'formId': '0000950170-23-027948',
 'names': ['Netapp Inc', 'NETAPP INC'],
 'source': 'https://www.sec.gov/Archives/edgar/data/1002047/000095017023027948/0000950170-23-027948-index.htm'}


In [None]:
# Example cypher queries

with driver.session(database=NEO4J_DB) as session:

  # Return the first chunk of the Item 1 section
  result = session.run(queries["match_first_chunk_of_section"], 
                       {'formId': form_record['formId'],
                        'f10kItem': 'item1'})

  first_chunk_info = dict(list(result)[0])
  print(first_chunk_info)

  # Get the second chunk of the Item 1 section
  result = session.run(queries["match_second_chunk"], 
                       {'chunkId': first_chunk_info['uuid']})

  next_chunk_info = dict(list(result)[0])
  
  print(next_chunk_info)
  
  # See relationships between form node and the first and second chunks of each section. Try it out in browser!!
  result = session.run(queries["match_several_relations"], 
                       {'chunkId': first_chunk_info['uuid']})
  for r in result:
    print(dict(r))
    
  # Return a window of three chunks

  result = session.run(queries["match_window_1"], {'chunkId': next_chunk_info['uuid']})

  for r in result:
      print(dict(r))
      
  result = session.run(queries["match_window_2"], {'chunkId': first_chunk_info['uuid']})

  for r in result:
      print(dict(r))
  


{'uuid': '0000950170-23-027948-item1-chunk0000', 'text': '>Item 1.  \nBusiness\n\n\nOverview\n\n\nNetApp, Inc. (NetApp, we, us or the Company) is a global cloud-led, data-centric software company. We were incorporated in 1992 and are headquartered in San Jose, California. Building on more than three decades of innovation, we give customers the freedom to manage applications and data across hybrid multicloud environments. Our portfolio of cloud services, and storage infrastructure, powered by intelligent data management software, enables applications to run faster, more reliably, and more securely, all at a lower cost.\n\n\nOur opportunity is defined by the durable megatrends of data-driven digital and cloud transformations. NetApp helps organizations meet the complexities created by rapid data and cloud growth, multi-cloud management, and the adoption of next-generation technologies, such as AI, Kubernetes, and modern databases. Our modern approach to hybrid, multicloud infrastructure 

## 3. Create property embeddings (first step into RAG) 

In [12]:
# Create vector indexes

with driver.session(database=NEO4J_DB) as session:
    
    # Create vector index
    for q in queries["create_vector_indexes"]:
        session.run(q)
    
    # Show created vector indexes
    results = session.run("SHOW VECTOR INDEXES")
    idx = list(results)
    cprint(f"\nFound {len(idx)} vector index entries.", "green")
    for r in idx:
        cprint("-"*20,"green")
        pprint(dict(r))


[32m
Found 4 vector index entries.[0m
[32m--------------------[0m
{'entityType': 'NODE',
 'id': 5,
 'indexProvider': 'vector-2.0',
 'labelsOrTypes': ['Chunk'],
 'lastRead': neo4j.time.DateTime(2025, 9, 23, 12, 39, 1, 589000000, tzinfo=<UTC>),
 'name': 'chunks_node_text_idx',
 'owningConstraint': None,
 'populationPercent': 100.0,
 'properties': ['embedding'],
 'readCount': 1,
 'state': 'ONLINE',
 'type': 'VECTOR'}
[32m--------------------[0m
{'entityType': 'NODE',
 'id': 18,
 'indexProvider': 'vector-2.0',
 'labelsOrTypes': ['Company'],
 'lastRead': neo4j.time.DateTime(2025, 9, 23, 12, 45, 32, 345000000, tzinfo=<UTC>),
 'name': 'company_node_text_idx',
 'owningConstraint': None,
 'populationPercent': 100.0,
 'properties': ['embedding'],
 'readCount': 1,
 'state': 'ONLINE',
 'type': 'VECTOR'}
[32m--------------------[0m
{'entityType': 'RELATIONSHIP',
 'id': 19,
 'indexProvider': 'vector-2.0',
 'labelsOrTypes': ['KNOWS'],
 'lastRead': neo4j.time.DateTime(2025, 9, 23, 12, 45, 32, 

In [None]:
# Create property embeddings 
    
with driver.session(database=NEO4J_DB) as session:
    
    vectorize_property(runner = session.run,
                       element = "node",
                       node_label = "Chunk",
                       source_property = "text"
                       )

[32m
Generating embeddings for (n:Chunk) on n.text[0m
[32m
Generating embeddings[0m
  input text: 'We generally enter into confidentiality agreements'...
  emb vec: [0.028159238, 0.03939935, -0.15910606, 0.02096453, 0.015713783, 0.017258478, -0.02857189, 0.022183841, 0.023328492, 0.018569319]

[32m
Generating embeddings[0m
  input text: 'We voluntarily measure, monitor, and publicly repo'...
  emb vec: [0.044200547, 0.04976897, -0.1890341, 0.018815977, 0.05476959, 0.037146453, 0.005691045, 0.042080145, 0.004473105, -0.005582745]

[32m
Generating embeddings[0m
  input text: 'We are subject to international, federal, state, a'...
  emb vec: [0.036685962, 0.040971372, -0.17384344, 0.026266122, 0.072202004, 0.05600262, 0.0013872886, 0.03984249, 0.018242436, 0.0077151014]

[32m
Generating embeddings[0m
  input text: 'Human Capital


We take pride in, and believe our '...
  emb vec: [0.023966804, 0.085904166, -0.17040329, -0.014763556, 0.071859345, 0.058000352, 0.034313552, -0.0029

## 4. Search 

In [None]:
# KG RAG Search

with driver.session(database=NEO4J_DB) as session:
  
  # Query Nodes
  result = neo4j_KGRAG_search(runner = session.run,
                              element = "node",
                              query = 'In a single sentence, tell me about Netapp.',
                              index = 'chunks_node_text_idx',
                              source_property = "text",
                              top_k = 10
                              )

  pprint(result, width = 200, sort_dicts=False, indent=2)



[32m
Generating embeddings[0m
  input text: 'In a single sentence, tell me about Netapp.'...
  emb vec: [0.023942923, 0.06676347, -0.123865075, -0.024302177, 0.07153227, -0.02668256, 0.007319313, -0.033634715, -0.017225634, -0.0583832]

[32m
Running vector search query[0m
{'combined_context': '\n'
                     '\n'
                     ' •\n'
                     'NetApp Keystone is our pay-as-you-grow, '
                     'storage-as-a-service (STaaS) offering that delivers a '
                     'seamless hybrid cloud experience for those preferring '
                     'operating expense consumption models to upfront capital '
                     'expense or leasing. With a unified management console '
                     'and monthly bill for both on-premises and cloud data '
                     'storage services, Keystone lets organizations provision '
                     'and monitor, and even move storage spend across their '
                     'hybrid c

**Create conections:**

Chunks belong to Forms, Chunks follow other Chunks and some of them are the head of section of the Form.

<p align="center">
  <img src="media/KGRAG_SEC_example.png">
</p>

<p align="center">
  <img src="media/KGRAG_SEC_example_2.png">
</p>

