In [1]:
from dotenv import load_dotenv
import os

# Langchain
from langchain_community.graphs import Neo4jGraph
import chunking
# Warning control
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load from environment
load_dotenv('.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE') or 'neo4j'
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
OPENAI_ENDPOINT = os.getenv('OPENAI_BASE_URL') + '/embeddings'


kg = Neo4jGraph(
    url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD, database=NEO4J_DATABASE
)

# 1. Splitting the Chunks

In [3]:
#  Load JSON file
file = "../data/json/Battle of Waterloo.json"

In [4]:
file_chunks = chunking.split_data_from_file(file)

['General information', 'Reason', 'Combatant', 'Consequence', 'Source']
Processing General information from ../data/json/Battle of Waterloo.json
	Split into 5 chunks
Processing Reason from ../data/json/Battle of Waterloo.json
	Split into 5 chunks
Processing Combatant from ../data/json/Battle of Waterloo.json
	Split into 23 chunks
Processing Consequence from ../data/json/Battle of Waterloo.json
	Split into 28 chunks
Processing Source from ../data/json/Battle of Waterloo.json
	Split into 1 chunks


# 2. Create Chunk node in KnowledgeGraph with properties extracted from file

In [5]:
# Create Napoleon_Chunk node and its properties
merge_chunk_node_query = """
MERGE(mergedChunk:Waterloo_Chunk {chunkId: $chunkParam.chunkId})
    ON CREATE SET
        mergedChunk.text = $chunkParam.text, 
        mergedChunk.source = $chunkParam.source, 
        mergedChunk.formItem = $chunkParam.formItem, 
        mergedChunk.chunkSeqId = $chunkParam.chunkSeqId
RETURN mergedChunk
"""
kg.query(merge_chunk_node_query, 
         params={'chunkParam':file_chunks[0]})

[{'mergedChunk': {'formItem': 'General information',
   'text': 'The Battle of Waterloo  was fought on Sunday 18 June 1815, near Waterloo (at that time in the United Kingdom of the Netherlands, now in Belgium), marking  the end of the Napoleonic Wars. A French army under the command of Napoleon was defeated by two armies of the Seventh Coalition. One of these was a British-led force with units from the United Kingdom, the Netherlands, Hanover, Brunswick, and Nassau, under the command of the Duke of Wellington (often referred to as the Anglo-allied army or Wellington\'s army). The other comprised three corps (the 1st, 2nd and 4th corps) of the Prussian army under Field Marshal Blücher; a fourth corps (the 3rd) of this army fought at the Battle of Wavre on the same day. The battle was known contemporarily as the Battle of Mont Saint-Jean in France (after the hamlet of Mont-Saint-Jean) and La Belle Alliance in Prussia ("the Beautiful Alliance"; after the inn of La Belle Alliance).\nUpon N

# 3. Create a uniqueness constraint to avoid duplicate chunks

In [7]:
#  Create a uniqueness constraint to avoid duplicate chunks
avoid_duplicate_chunks = """
CREATE CONSTRAINT unique_chunk IF NOT EXISTS 
    FOR (nc:Waterloo_Chunk) REQUIRE nc.chunkId IS UNIQUE
"""

kg.query(avoid_duplicate_chunks)


[]

# 4. Adding all chunks data to knowledegeGraph

In [8]:
node_count = 0
for chunk in file_chunks:
    print(f"Creating `:Chunk` node for chunk ID {chunk['chunkId']}")
    kg.query(merge_chunk_node_query, 
            params={
                'chunkParam': chunk
            })
    node_count += 1
print(f"Created {node_count} nodes")

Creating `:Chunk` node for chunk ID Battle of Waterloo-General information-chunk0000
Creating `:Chunk` node for chunk ID Battle of Waterloo-General information-chunk0001
Creating `:Chunk` node for chunk ID Battle of Waterloo-General information-chunk0002
Creating `:Chunk` node for chunk ID Battle of Waterloo-General information-chunk0003
Creating `:Chunk` node for chunk ID Battle of Waterloo-General information-chunk0004
Creating `:Chunk` node for chunk ID Battle of Waterloo-Reason-chunk0000
Creating `:Chunk` node for chunk ID Battle of Waterloo-Reason-chunk0001
Creating `:Chunk` node for chunk ID Battle of Waterloo-Reason-chunk0002
Creating `:Chunk` node for chunk ID Battle of Waterloo-Reason-chunk0003
Creating `:Chunk` node for chunk ID Battle of Waterloo-Reason-chunk0004
Creating `:Chunk` node for chunk ID Battle of Waterloo-Combatant-chunk0000
Creating `:Chunk` node for chunk ID Battle of Waterloo-Combatant-chunk0001
Creating `:Chunk` node for chunk ID Battle of Waterloo-Combatant-

# 5. Create a VectorIndex for the Napoleon_Chunk

In [9]:
VectorIndex = """
         CREATE VECTOR INDEX `WaterlooOpenAI` IF NOT EXISTS
          FOR (nc:Napoleon_Chunk) ON (nc.textEmbeddingOpenAI) 
          OPTIONS { indexConfig: {
            `vector.dimensions`: 1536,
            `vector.similarity_function`: 'cosine'    
         }}
"""

kg.query(VectorIndex)


[]

# 6. Embedding the text data using a provider and add it to the Napoleon_Chunk node as a property named textEmbeddingOpenAI

In [10]:
kg.query("""
    MATCH (Waterloo_Chunk:Waterloo_Chunk) WHERE Waterloo_Chunk.textEmbeddingOpenAI IS NULL
    WITH Waterloo_Chunk, genai.vector.encode(
      Waterloo_Chunk.text, 
      "OpenAI", 
      {
        token: $openAiApiKey, 
        endpoint: $openAiEndpoint
      }) AS vector
    CALL db.create.setNodeVectorProperty(Waterloo_Chunk, "textEmbeddingOpenAI", vector)
    """, 
    params={"openAiApiKey":OPENAI_API_KEY, "openAiEndpoint": OPENAI_ENDPOINT} )

[]

# Optional: 6.1. If you want to Embed data using HuggingFace use this part

In [10]:
# # I am using T5 base but you can add more model in LLM file for yourself
# embedd_model = T5.t5_model()
# chunks = kg.query("MATCH (Napoleon_Chunk:Napoleon_Chunk) WHERE Napoleon_Chunk.textEmbedding IS NULL RETURN id(Napoleon_Chunk) as id, Napoleon_Chunk.text as text")


# for chunk in chunks:
#     text = chunk["text"]
#     print(f"Embedding Chunk {chunk['id']}")
#     vector = embedd_model.embed([text])

#     vector_list = vector.tolist()
#     print(vector_list)

#     kg.query(
#         """
#         MATCH (Napoleon_Chunk:Napoleon_Chunk) WHERE id(Napoleon_Chunk) = $id
#         SET Napoleon_Chunk.textEmbedding = $vector
#         """,
#         params={"id": chunk["id"], "vector": vector_list}
#     )

# 9. Finding all Chunks from the same formItem

In [12]:
# Each chunk is a small part of the document. To do this, first we need to find all chunks that belong together
cypher = """
   MATCH (from_same_chunk_item:Waterloo_Chunk)
    WHERE from_same_chunk_item.formItem = $WaterlooParam
    AND from_same_chunk_item.formItem = $WaterlooParam
  RETURN from_same_chunk_item {.text, .formItem, .chunkId, .chunkSeqId } as chunkItemInfo
    ORDER BY from_same_chunk_item.chunkSeqId ASC
    LIMIT 1
"""

items = ['Reason', 'Consequence', 'Combatant', 'General Information']
for item in items:
  result = kg.query(cypher, params={'WaterlooParam':item})
  print(f"{result}")

[{'chunkItemInfo': {'text': "On 13 March 1815, six days before Napoleon reached Paris, the powers at the Congress of Vienna declared him an outlaw. Four days later, the United Kingdom, Russia, Austria, and Prussia mobilised armies to defeat Napoleon. Critically outnumbered, Napoleon knew that once his attempts at dissuading one or more members of the Seventh Coalition from invading France had failed, his only chance of remaining in power was to attack before the coalition mobilised.\nHad Napoleon succeeded in destroying the existing coalition forces south of Brussels before they were reinforced, he might have been able to drive the British back to the sea and knock the Prussians out of the war. Crucially, this would have bought him time to recruit and train more men before turning his armies against the Austrians and Russians.\nAn additional consideration for Napoleon was that a French victory might cause French-speaking sympathisers in Belgium to launch a friendly revolution. Also, co

# 10. Adding NEXT relationship between chunks

In [13]:
# ordering based on chunkSeqId
cypher = """
  MATCH (n:Waterloo_Chunk)
  WHERE n.formItem = $WaterlooParam
  WITH n
    ORDER BY n.chunkSeqId ASC
  WITH collect(n) as section_chunk_list
    CALL apoc.nodes.link(
        section_chunk_list, 
        "NEXT", 
        {avoidDuplicates: true}
    )
  RETURN size(section_chunk_list)
"""

items = ['Reason', 'Consequence', 'Combatant', 'General Information']
for item in items:
  result = kg.query(cypher, params={'WaterlooParam':item})
  print(f"for {item}: {result}" )

for Reason: [{'size(section_chunk_list)': 5}]
for Consequence: [{'size(section_chunk_list)': 28}]
for Combatant: [{'size(section_chunk_list)': 23}]
for General Information: [{'size(section_chunk_list)': 0}]
