In [84]:
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from dotenv import load_dotenv  
import yaml
from pathlib import Path
from pprint import pprint
from termcolor import cprint
import ollama
import requests
from typing import Literal

from neo4j import GraphDatabase


In [85]:
load_dotenv()  # Load local environment variables

URI = "bolt://localhost:" + os.environ.get("URI_PORT")
NEO4J_USER = os.environ.get("NEO4J_USER")
NEO4J_PWD = os.environ.get("NEO4J_PASSWORD")
NEO4J_DB = os.getenv("NEO4J_DATABASE", "neo4j")    # 👈 choose DB here
EMBED_MODEL = "nomic-embed-text:latest"

cprint(f"Connecting to Neo4j at {URI} with user {NEO4J_USER} and password {NEO4J_PWD}", "green")

[32mConnecting to Neo4j at bolt://localhost:7687 with user neo4j and password test1234[0m


In [86]:
first_file_name = "./data/form10k/0000950170-23-027948.json"

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000,
    chunk_overlap  = 200,
    length_function = len,
    is_separator_regex = False,
)

def split_form10k_data_from_file(file):
    
    chunks_with_metadata = [] # accumlate chunk records
    
    data = json.load(open(file)) # open the json file
    for item in ['item1','item1a','item7','item7a']: # pull these keys from the json
        
        print(f'Processing {item} from {file}') 
        
        item_text_chunks = text_splitter.split_text(data[item]) # split the text into chunks
        
        chunk_seq_id = 0
        for chunk in item_text_chunks: # only take the first 20 chunks
            
            form_id = file[file.rindex('/') + 1:file.rindex('.')] # extract form id from file name
            
            # finally, construct a record with metadata and the chunk text
            chunks_with_metadata.append({
                'text': chunk, 
                # metadata from looping...
                'f10kItem': item,
                'chunkSeqId': chunk_seq_id,
                # constructed metadata...
                'formId': f'{form_id}', # pulled from the filename
                'chunkId': f'{form_id}-{item}-chunk{chunk_seq_id:04d}',
                # metadata from file...
                'names': data['names'],
                'cik': data['cik'],
                'cusip6': data['cusip6'],
                'source': data['source'],
            })
            
            chunk_seq_id += 1
            
        print(f'\t{item} splitted into {chunk_seq_id} chunks')
        
    return chunks_with_metadata


chunks_dicts = split_form10k_data_from_file(first_file_name)

Processing item1 from ./data/form10k/0000950170-23-027948.json
	item1 splitted into 254 chunks
Processing item1a from ./data/form10k/0000950170-23-027948.json
	item1a splitted into 1 chunks
Processing item7 from ./data/form10k/0000950170-23-027948.json
	item7 splitted into 1 chunks
Processing item7a from ./data/form10k/0000950170-23-027948.json
	item7a splitted into 1 chunks


In [87]:
driver = GraphDatabase.driver(uri=URI, auth=(NEO4J_USER, NEO4J_PWD))


In [88]:
wipe_at_init = True # delete everything at the start 

with driver.session(database=NEO4J_DB) as session:
    dbinfo = session.run("CALL db.info()").single()
    cprint(f"\n== Connected to Neo4j database: {dbinfo['name']}", "green")
    session.run("""
    CREATE CONSTRAINT unique_chunk IF NOT EXISTS 
        FOR (c:Chunk) REQUIRE c.chunkId IS UNIQUE
    """)
    node_count = 0

    for chunk_dict in chunks_dicts[:20]:
        print(f"Creating `:Chunk` node for chunk ID {chunk_dict['chunkId']}")
        session.run("""MERGE(c:Chunk {chunkId: $chunkParamDict.chunkId})
            ON CREATE SET 
                c.names = $chunkParamDict.names,
                c.formId = $chunkParamDict.formId, 
                c.cik = $chunkParamDict.cik, 
                c.cusip6 = $chunkParamDict.cusip6, 
                c.source = $chunkParamDict.source, 
                c.f10kItem = $chunkParamDict.f10kItem, 
                c.chunkSeqId = $chunkParamDict.chunkSeqId, 
                c.text = $chunkParamDict.text
        RETURN c""", 
        parameters={
            'chunkParamDict': chunk_dict
            }
        )
        node_count += 1
print(f"Created {node_count} nodes")

[32m
== Connected to Neo4j database: neo4j[0m
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0000
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0001
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0002
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0003
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0004
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0005
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0006
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0007
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0008
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0009
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0010
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0011
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0012
Cre

In [89]:
with driver.session(database=NEO4J_DB) as session:
    
    # Create vector index
    session.run("""
         CREATE VECTOR INDEX `form_10k_chunks` IF NOT EXISTS
         FOR (c:Chunk) ON (c.text_emb) 
         OPTIONS { indexConfig: { 
         `vector.dimensions`: 768, `vector.similarity_function`: 'cosine' 
         } }
         """)
    
    # Show created vector indexes
    results = session.run("SHOW VECTOR INDEXES")
    idx = list(results)
    cprint(f"\nFound {len(idx)} vector index entries.", "green")
    for r in idx:
        cprint("-"*20,"green")
        pprint(dict(r))
        
    cprint(f"\nGenerating embeddings for (n:Chunk) on n.text", "green")
    records = list(session.run(f"""
        MATCH (n:Chunk)
        WHERE n.text IS NOT NULL AND n.text <> ''
        AND n.text_emb IS NULL
        RETURN n.chunkId AS chunkId, n.text AS txt
        """))
    for r in records:
        vec = ollama.embed(model="nomic-embed-text", input=r["txt"])["embeddings"][0]
        session.run(
            f"""
            MATCH (n:Chunk {{chunkId: $chunkId}})
            SET n.text_emb = $vec
            """,
            chunkId=r["chunkId"], vec=vec
        )
        print(f"  text: {r['txt']}\n  vec: {vec[:3]}")

[32m
Found 1 vector index entries.[0m
[32m--------------------[0m
{'entityType': 'NODE',
 'id': 5,
 'indexProvider': 'vector-2.0',
 'labelsOrTypes': ['Chunk'],
 'lastRead': None,
 'name': 'form_10k_chunks',
 'owningConstraint': None,
 'populationPercent': 0.0,
 'properties': ['text_emb'],
 'readCount': None,
 'state': 'POPULATING',
 'type': 'VECTOR'}
[32m
Generating embeddings for (n:Chunk) on n.text[0m


In [90]:
# From user query/question to question embedding
def create_question_embedding(question:str):
    cprint(f"\nGenerating embeddings for question '{question}'", "green")
    vec = ollama.embed(model="nomic-embed-text", input=question)["embeddings"][0] 
    print(f"  text: {question}\n  vec: {vec[:10]}\n")
    return vec
  
# From query/question to cypher query language (cql) TODO
def create_question_cql(question:str):
    cql_query = ""
    #cql_query = "MATCH (person)-[:KNOWS]-(:Person {name:'Cristina'})" 
    return cql_query

In [None]:

  
def neo4j_node_vector_search(question, index_name):
  """Search for similar nodes using the Neo4j vector index"""
  
  with driver.session(database=NEO4J_DB) as session:
      
      top_k = 10
      vector_search_query = f"""
      CALL db.index.vector.queryNodes($index_name, $top_k, $question_embedding) 
      YIELD node, score
      {create_question_cql(question)}
      RETURN score, node.text AS text
      """
  
      res = session.run(
        vector_search_query, 
        {"index_name": index_name, 
         "top_k": top_k,
         "question_embedding": create_question_embedding(question),}
        )
      result = list(res)
      
  return result


result = neo4j_node_vector_search(
    'In a single sentence, tell me about Netapp.',
    'form_10k_chunks'
)

for r in result:
    print(dict(r))



[32m
Generating embeddings for question 'In a single sentence, tell me about Netapp.'[0m
  text: In a single sentence, tell me about Netapp.
  vec: [0.023942923, 0.06676347, -0.123865075, -0.024302177, 0.07153227, -0.02668256, 0.007319313, -0.033634715, -0.017225634, -0.0583832]

<Record score=0.8422031402587891 text='•\nNetApp Keystone is our pay-as-you-grow, storage-as-a-service (STaaS) offering that delivers a seamless hybrid cloud experience for those preferring operating expense consumption models to upfront capital expense or leasing. With a unified management console and monthly bill for both on-premises and cloud data storage services, Keystone lets organizations provision and monitor, and even move storage spend across their hybrid cloud environment for financial and operational flexibility. \n\n\n•\nNetApp Global Support supplies systems, processes, and people wherever needed to provide continuous operation in complex and critical environments, with an emphasis on proactive