In [1]:
from dotenv import load_dotenv
import os

# Langchain
from langchain_community.graphs import Neo4jGraph
import chunking
# Warning control
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load configuration from the environment; values in `.env` override any
# variables already set in the process (override=True).
load_dotenv('.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE') or 'neo4j'  # fall back to 'neo4j' when unset or empty
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# os.getenv returns None for a missing variable, and `None + str` raises an
# opaque TypeError, so fail early with a message that names the variable.
openai_base_url = os.getenv('OPENAI_BASE_URL')
if not openai_base_url:
    raise RuntimeError(
        "OPENAI_BASE_URL is not set; define it in .env or in the environment"
    )
# Strip any trailing slash so the endpoint never contains '//embeddings'.
OPENAI_ENDPOINT = openai_base_url.rstrip('/') + '/embeddings'


# Graph connection object that the rest of the notebook queries through
kg = Neo4jGraph(
    url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD, database=NEO4J_DATABASE
)

# 1. Splitting the Chunks

In [3]:
# Relative path of the JSON source document that the next cell splits into chunks
file = "../data/json/Napoleon.json"

In [4]:
# Split the document into chunks with the project-local `chunking` module.
# NOTE(review): later cells index each chunk with 'chunkId' and pass it as a
# Cypher parameter map (text, source, formItem, chunkSeqId) — confirm the
# exact structure against chunking.split_data_from_file.
file_chunks = chunking.split_data_from_file(file)

['General Information', 'Career', 'Death', 'Source']
Processing General Information from ../data/json/Napoleon.json
	Split into 18 chunks
Processing Career from ../data/json/Napoleon.json
	Split into 34 chunks
Processing Death from ../data/json/Napoleon.json
	Split into 8 chunks
Processing Source from ../data/json/Napoleon.json
	Split into 1 chunks


# 2. Create Chunk node in KnowledgeGraph with properties extracted from file

In [5]:
# Cypher that upserts one Napoleon_Chunk node keyed on chunkId.
# MERGE matches an existing node with that chunkId or creates a new one; the
# other properties are written only when the node is created (ON CREATE SET),
# so running the query again for an existing chunkId leaves its properties as-is.
merge_chunk_node_query = """
MERGE(mergedChunk:Napoleon_Chunk {chunkId: $chunkParam.chunkId})
    ON CREATE SET
        mergedChunk.text = $chunkParam.text, 
        mergedChunk.source = $chunkParam.source, 
        mergedChunk.formItem = $chunkParam.formItem, 
        mergedChunk.chunkSeqId = $chunkParam.chunkSeqId
RETURN mergedChunk
"""
# Run the query for the first chunk only; the returned node is the cell output.
kg.query(merge_chunk_node_query, 
         params={'chunkParam':file_chunks[0]})

[{'mergedChunk': {'formItem': 'General Information',
   'text': "Napoleon Bonaparte (born Napoleone di Buonaparte;[b] 15 August 1769 – 5 May 1821), later known by his regnal name Napoleon\xa0I, was a French military and political leader who rose to prominence during the French Revolution and led a series of successful campaigns across Europe during the Revolutionary Wars and Napoleonic Wars from 1796 to 1815. He was the leader of the French Republic as First Consul from 1799 to 1804, then of the French Empire as Emperor of the French from 1804 to 1814, and briefly again in 1815.\nBorn on the island of Corsica to a family of Italian origin, Napoleon moved to mainland France in 1779 and was commissioned as an officer in the French Army in 1785. He supported the French Revolution in 1789, and promoted its cause in Corsica. He rose rapidly in the ranks after breaking the siege of Toulon in 1793 and firing on royalist insurgents in Paris on 13 Vendémiaire in 1795. In 1796, Napoleon commande

# 3. Create a uniqueness constraint to avoid duplicate chunks

In [6]:
# Uniqueness constraint: no two Napoleon_Chunk nodes may share a chunkId.
# IF NOT EXISTS means the statement does not fail when the constraint is
# already present, so the cell can be re-run.
avoid_duplicate_chunks = """
CREATE CONSTRAINT unique_chunk IF NOT EXISTS 
    FOR (nc:Napoleon_Chunk) REQUIRE nc.chunkId IS UNIQUE
"""

kg.query(avoid_duplicate_chunks)


[]

# 4. Adding all chunk data to the knowledge graph

In [7]:
# Run the merge query once per chunk and count how many queries were issued.
# The query uses MERGE, so chunks whose chunkId already exists are matched
# rather than created again.
node_count = 0
for chunk in file_chunks:
    print(f"Creating `:Chunk` node for chunk ID {chunk['chunkId']}")
    chunk_params = {'chunkParam': chunk}
    kg.query(merge_chunk_node_query, params=chunk_params)
    node_count = node_count + 1
print(f"Created {node_count} nodes")

Creating `:Chunk` node for chunk ID Napoleon-General Information-chunk0000
Creating `:Chunk` node for chunk ID Napoleon-General Information-chunk0001
Creating `:Chunk` node for chunk ID Napoleon-General Information-chunk0002
Creating `:Chunk` node for chunk ID Napoleon-General Information-chunk0003
Creating `:Chunk` node for chunk ID Napoleon-General Information-chunk0004
Creating `:Chunk` node for chunk ID Napoleon-General Information-chunk0005
Creating `:Chunk` node for chunk ID Napoleon-General Information-chunk0006
Creating `:Chunk` node for chunk ID Napoleon-General Information-chunk0007
Creating `:Chunk` node for chunk ID Napoleon-General Information-chunk0008
Creating `:Chunk` node for chunk ID Napoleon-General Information-chunk0009
Creating `:Chunk` node for chunk ID Napoleon-General Information-chunk0010
Creating `:Chunk` node for chunk ID Napoleon-General Information-chunk0011
Creating `:Chunk` node for chunk ID Napoleon-General Information-chunk0012
Creating `:Chunk` node fo

# 5. Create a VectorIndex for the Napoleon_Chunk

In [8]:
# Vector index named `NapoleonOpenAI` over the textEmbeddingOpenAI property of
# Napoleon_Chunk nodes, using cosine similarity. IF NOT EXISTS keeps the cell
# re-runnable.
# NOTE(review): 1536 dimensions presumably matches the embedding model that
# populates textEmbeddingOpenAI — confirm against the model actually used.
VectorIndex = """
         CREATE VECTOR INDEX `NapoleonOpenAI` IF NOT EXISTS
          FOR (nc:Napoleon_Chunk) ON (nc.textEmbeddingOpenAI) 
          OPTIONS { indexConfig: {
            `vector.dimensions`: 1536,
            `vector.similarity_function`: 'cosine'    
         }}
"""

kg.query(VectorIndex)


[]

# 6. Embedding the text data using a provider and add it to the Napoleon_Chunk node as a property named textEmbeddingOpenAI

In [9]:
# Embed the text of every Napoleon_Chunk that has no textEmbeddingOpenAI yet.
# The embedding is computed inside the database by genai.vector.encode with the
# "OpenAI" provider and stored via db.create.setNodeVectorProperty; the IS NULL
# filter means already-embedded chunks are skipped on a re-run.
embedding_query = """
    MATCH (Napoleon_Chunk:Napoleon_Chunk) WHERE Napoleon_Chunk.textEmbeddingOpenAI IS NULL
    WITH Napoleon_Chunk, genai.vector.encode(
      Napoleon_Chunk.text, 
      "OpenAI", 
      {
        token: $openAiApiKey, 
        endpoint: $openAiEndpoint
      }) AS vector
    CALL db.create.setNodeVectorProperty(Napoleon_Chunk, "textEmbeddingOpenAI", vector)
    """
embedding_params = {
    "openAiApiKey": OPENAI_API_KEY,
    "openAiEndpoint": OPENAI_ENDPOINT,
}
kg.query(embedding_query, params=embedding_params)

[]

# Optional: 6.1. Use this part instead if you want to embed the data with a HuggingFace model

In [10]:
# # This example uses the T5 base model; you can add more models to the LLM file yourself
# embedd_model = T5.t5_model()
# chunks = kg.query("MATCH (Napoleon_Chunk:Napoleon_Chunk) WHERE Napoleon_Chunk.textEmbedding IS NULL RETURN id(Napoleon_Chunk) as id, Napoleon_Chunk.text as text")


# for chunk in chunks:
#     text = chunk["text"]
#     print(f"Embedding Chunk {chunk['id']}")
#     vector = embedd_model.embed([text])

#     vector_list = vector.tolist()
#     print(vector_list)

#     kg.query(
#         """
#         MATCH (Napoleon_Chunk:Napoleon_Chunk) WHERE id(Napoleon_Chunk) = $id
#         SET Napoleon_Chunk.textEmbedding = $vector
#         """,
#         params={"id": chunk["id"], "vector": vector_list}
#     )

# 7. Extracting form information from Napoleon_Chunk: each chunk contains the information we need to create the Form node.

In [10]:
# Read the source and formItem properties from a single Napoleon_Chunk node.
# LIMIT 1 is applied without any ordering, so which chunk is returned is
# whatever the database yields first.
cypher = """
  MATCH (anyChunk:Napoleon_Chunk) 
  WITH anyChunk LIMIT 1
  RETURN anyChunk {.source, .formItem } as formInfo
"""
form_info_list = kg.query(cypher)

# Bare expression: display the query result as the cell output
form_info_list


[{'formInfo': {'source': 'Napoleon History',
   'formItem': 'General Information'}}]

In [12]:
# Collect the distinct formItem and source values found in the query result.
# A missing key yields None via .get and is kept as a value.
form_infos = [record['formInfo'] for record in form_info_list]
form_item = list({info.get('formItem') for info in form_infos})
source = list({info.get('source') for info in form_infos})

# 8. Create the Napoleon_Death, Napoleon_Career and Napoleon_General nodes

In [13]:
# # Creating a form node using source
# cypher_template = """
#     MERGE (nf:{node_name})
#       ON CREATE 
#         SET nf.formItem = $formItem
#         RETURN COUNT(nf)
# """
# node_name = {'Napoleon_General': 'General Information', 'Napoleon_Career':'Career', 'Napoleon_Death':'Death'}
# for name, form_item in node_name.items():
#     cypher = cypher_template.format(node_name=name)
#     result = kg.query(cypher, params={'formItem': form_item})
#     print(f"{name} node creted {result}")

Napoleon_General node creted [{'COUNT(nf)': 1}]
Napoleon_Career node creted [{'COUNT(nf)': 1}]
Napoleon_Death node creted [{'COUNT(nf)': 1}]


# 9. Finding all Chunks from the same formItem

In [12]:
# Each chunk is a small part of the document. Chunks that belong together share
# the same formItem, so that is the property we filter on.
# The query orders a section's chunks by chunkSeqId and returns only the first
# one (LIMIT 1), i.e. a preview of where each section starts.
# The WHERE clause previously repeated the same formItem predicate twice; the
# redundant copy is removed, which does not change the result.
cypher = """
  MATCH (from_same_chunk_item:Napoleon_Chunk)
  WHERE from_same_chunk_item.formItem = $NapoleonParam
  RETURN from_same_chunk_item {.text, .formItem, .chunkId, .chunkSeqId } as chunkItemInfo
    ORDER BY from_same_chunk_item.chunkSeqId ASC
    LIMIT 1
"""

# Sections to preview; 'Source' is not included here
items = ['General Information', 'Career', 'Death']
for item in items:
    result = kg.query(cypher, params={'NapoleonParam': item})
    print(result)
  
 

[{'chunkItemInfo': {'text': "Napoleon Bonaparte (born Napoleone di Buonaparte;[b] 15 August 1769 – 5 May 1821), later known by his regnal name Napoleon\xa0I, was a French military and political leader who rose to prominence during the French Revolution and led a series of successful campaigns across Europe during the Revolutionary Wars and Napoleonic Wars from 1796 to 1815. He was the leader of the French Republic as First Consul from 1799 to 1804, then of the French Empire as Emperor of the French from 1804 to 1814, and briefly again in 1815.\nBorn on the island of Corsica to a family of Italian origin, Napoleon moved to mainland France in 1779 and was commissioned as an officer in the French Army in 1785. He supported the French Revolution in 1789, and promoted its cause in Corsica. He rose rapidly in the ranks after breaking the siege of Toulon in 1793 and firing on royalist insurgents in Paris on 13 Vendémiaire in 1795. In 1796, Napoleon commanded a military campaign against the Au

# 10. Adding NEXT relationships

In [15]:
# Link the chunks of each section into a chain of NEXT relationships.
# For one formItem the query collects its chunks ordered by chunkSeqId and
# passes the ordered list to apoc.nodes.link, which connects consecutive nodes;
# avoidDuplicates: true keeps a re-run from adding a second NEXT between the
# same pair. The query returns the number of chunks in the section.
# The WHERE clause previously repeated the same formItem predicate twice; the
# redundant copy is removed, which does not change the result.
cypher = """
  MATCH (from_same_chunk_item:Napoleon_Chunk)
  WHERE from_same_chunk_item.formItem = $NapoleonParam
  WITH from_same_chunk_item
    ORDER BY from_same_chunk_item.chunkSeqId ASC
  WITH collect(from_same_chunk_item) as section_chunk_list
    CALL apoc.nodes.link(
        section_chunk_list, 
        "NEXT", 
        {avoidDuplicates: true}
    )
  RETURN size(section_chunk_list)
"""
# Sections to link; 'Source' is not included here
items = ['General Information', 'Career', 'Death']
for item in items:
    result = kg.query(cypher, params={'NapoleonParam': item})
    print(f"for {item}: {result}" )


for General Information: [{'size(section_chunk_list)': 18}]
for Career: [{'size(section_chunk_list)': 34}]
for Death: [{'size(section_chunk_list)': 8}]


In [21]:
# Refresh the schema held by the graph wrapper, then print it to inspect the
# node labels and properties currently present in the database.
kg.refresh_schema()
print(kg.schema)

Node properties are the following:
Person {name: STRING},Napoleon_Chunk {chunkId: STRING, formItem: STRING, chunkSeqId: INTEGER, text: STRING, source: STRING, textEmbeddingOpenAI: LIST},Talleyrand_Chunk {chunkId: STRING, formItem: STRING, chunkSeqId: INTEGER, text: STRING, source: STRING, textEmbeddingOpenAI: LIST},Waterloo_Chunk {chunkId: STRING, formItem: STRING, chunkSeqId: INTEGER, text: STRING, source: STRING, textEmbeddingOpenAI: LIST},Event {name: STRING},Career {period: STRING, position: STRING, chunk_info: STRING},Death {date: STRING, location: STRING, chunk_info: STRING},General_info {chunk_info: STRING, knownFor: STRING, nationality: STRING, deathDate: STRING, birthDate: STRING, location: STRING, battleDate: STRING, outcome: STRING, commander: STRING},Reason {chunk_info: STRING, cause: STRING, strategicMistake: STRING, politicalImpact: STRING},Combatant {chunk_info: STRING, frenchCommander: STRING, alliedCommander: STRING, prussianCommander: STRING, mainForces: STRING},Conse