In [1]:
from dotenv import load_dotenv
import os

# Langchain
from langchain_community.graphs import Neo4jGraph
import chunking
# Warning control
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load connection settings and API credentials from the local .env file.
# override=True lets values in .env replace variables already set in the process.
load_dotenv('.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
# Fall back to the default database name when NEO4J_DATABASE is unset or empty.
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE') or 'neo4j'
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Build the embeddings endpoint from the base URL. Fail early with a readable
# message instead of the opaque "NoneType + str" TypeError when the variable is
# missing, and strip a trailing slash so the result never contains "//embeddings".
_openai_base_url = os.getenv('OPENAI_BASE_URL')
if not _openai_base_url:
    raise ValueError(
        "OPENAI_BASE_URL is not set; define it in .env or in the environment"
    )
OPENAI_ENDPOINT = _openai_base_url.rstrip('/') + '/embeddings'


# Shared Neo4j connection used by every query in this notebook.
kg = Neo4jGraph(
    url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD, database=NEO4J_DATABASE
)

### 1. Splitting the Chunks

In [3]:
# Path to the JSON source document; it is handed to the chunking helper in the
# next cell. The path is relative, so it depends on the kernel's working directory.
file = "../data/json/Talleyrand.json"

In [4]:
# Split the JSON document into chunk records. Later cells read the keys
# chunkId, text, source, formItem and chunkSeqId from each record.
file_chunks = chunking.split_data_from_file(file)

['General Information', 'Career', 'Death', 'Source']
Processing General Information from ../data/json/Talleyrand.json
	Split into 12 chunks
Processing Career from ../data/json/Talleyrand.json
	Split into 13 chunks
Processing Death from ../data/json/Talleyrand.json
	Split into 1 chunks
Processing Source from ../data/json/Talleyrand.json
	Split into 1 chunks


### 2. Create Chunk node in KnowledgeGraph with properties extracted from file

In [5]:
# Cypher template that upserts a single Talleyrand_Chunk node keyed by chunkId.
# The other properties are written only when the node is first created
# (ON CREATE SET), so merging an already-existing chunkId leaves them untouched.
merge_chunk_node_query = """
MERGE(mergedChunk:Talleyrand_Chunk {chunkId: $chunkParam.chunkId})
    ON CREATE SET
        mergedChunk.text = $chunkParam.text, 
        mergedChunk.source = $chunkParam.source, 
        mergedChunk.formItem = $chunkParam.formItem, 
        mergedChunk.chunkSeqId = $chunkParam.chunkSeqId
RETURN mergedChunk
"""

# Try the template on the first chunk only; the merged node is the cell output.
kg.query(
    merge_chunk_node_query,
    params={"chunkParam": file_chunks[0]},
)

[{'mergedChunk': {'formItem': 'General Information',
   'text': 'Charles-Maurice de Talleyrand-Périgord (/ˌtælɪrænd ˈpɛrɪɡɔːr/, French: [ʃaʁl mɔʁis də tal(ɛ)ʁɑ̃ peʁiɡɔʁ, – moʁ-]; 2 February 1754 – 17 May 1838), 1st Prince of Benevento, then Prince of Talleyrand, was a French secularized clergyman, statesman, and leading diplomat. After studying theology, he became Agent-General of the Clergy in 1780. In 1789, just before the French Revolution, he became Bishop of Autun. He worked at the highest levels of successive French governments, most commonly as foreign minister or in some other diplomatic capacity. His career spanned the regimes of Louis XVI, the years of the French Revolution, Napoleon, Louis XVIII, and Louis Philippe I. Those Talleyrand served often distrusted him but, like Napoleon, found him extremely useful. The name "Talleyrand" has become a byword for crafty and cynical diplomacy.\nHe was Napoleon\'s chief diplomat during the years when French military victories brought o

### 3. Create a uniqueness constraint to avoid duplicate chunks

In [6]:
# Create a uniqueness constraint on Talleyrand_Chunk.chunkId to avoid duplicate chunks.
# IF NOT EXISTS makes the statement safe to re-run.
# NOTE(review): the constraint name `unique_chunk` is generic. If another notebook
# using the same database already created a constraint with this name, this
# statement is presumably a silent no-op - confirm with SHOW CONSTRAINTS.
avoid_duplicate_chunks = """
CREATE CONSTRAINT unique_chunk IF NOT EXISTS 
    FOR (tc:Talleyrand_Chunk) REQUIRE tc.chunkId IS UNIQUE
"""

kg.query(avoid_duplicate_chunks)


[]

### 4. Adding all chunk data to the knowledge graph

In [7]:
# Push every chunk through the shared MERGE template, logging each chunk ID.
# enumerate(start=1) keeps the running tally of processed chunks; the initial 0
# covers the case of an empty chunk list.
node_count = 0
for node_count, chunk in enumerate(file_chunks, start=1):
    print(f"Creating `:Chunk` node for chunk ID {chunk['chunkId']}")
    kg.query(merge_chunk_node_query, params={'chunkParam': chunk})
print(f"Created {node_count} nodes")

Creating `:Chunk` node for chunk ID Talleyrand-General Information-chunk0000
Creating `:Chunk` node for chunk ID Talleyrand-General Information-chunk0001
Creating `:Chunk` node for chunk ID Talleyrand-General Information-chunk0002
Creating `:Chunk` node for chunk ID Talleyrand-General Information-chunk0003
Creating `:Chunk` node for chunk ID Talleyrand-General Information-chunk0004
Creating `:Chunk` node for chunk ID Talleyrand-General Information-chunk0005
Creating `:Chunk` node for chunk ID Talleyrand-General Information-chunk0006
Creating `:Chunk` node for chunk ID Talleyrand-General Information-chunk0007
Creating `:Chunk` node for chunk ID Talleyrand-General Information-chunk0008
Creating `:Chunk` node for chunk ID Talleyrand-General Information-chunk0009
Creating `:Chunk` node for chunk ID Talleyrand-General Information-chunk0010
Creating `:Chunk` node for chunk ID Talleyrand-General Information-chunk0011
Creating `:Chunk` node for chunk ID Talleyrand-Career-chunk0000
Creating `:C

### 5. Create a VectorIndex for the Talleyrand_Chunk

In [8]:
# Vector index over the OpenAI embeddings stored on Talleyrand_Chunk nodes.
# The label has to match the nodes created in this notebook (Talleyrand_Chunk);
# the previous Napoleon_Chunk label meant none of these chunks were indexed.
# vector.dimensions must equal the length of the vectors written to
# textEmbeddingOpenAI.
# NOTE: because of IF NOT EXISTS, an index named TalleyrandOpenAI that was already
# created with the old label is left as-is. Drop it once before re-running:
#   DROP INDEX TalleyrandOpenAI IF EXISTS
VectorIndex = """
         CREATE VECTOR INDEX `TalleyrandOpenAI` IF NOT EXISTS
          FOR (tc:Talleyrand_Chunk) ON (tc.textEmbeddingOpenAI)
          OPTIONS { indexConfig: {
            `vector.dimensions`: 1536,
            `vector.similarity_function`: 'cosine'
         }}
"""

kg.query(VectorIndex)


[]

### 6. Embedding the text data using a provider and add it to the Talleyrand_Chunk node as a property named textEmbeddingOpenAI

In [9]:
# Compute an embedding for every Talleyrand_Chunk that does not have one yet
# (textEmbeddingOpenAI IS NULL) and store it on the node as a vector property.
# The API key and endpoint are passed as query parameters rather than being
# written into the Cypher text.
kg.query(
    """
    MATCH (Talleyrand_Chunk:Talleyrand_Chunk) WHERE Talleyrand_Chunk.textEmbeddingOpenAI IS NULL
    WITH Talleyrand_Chunk, genai.vector.encode(
      Talleyrand_Chunk.text, 
      "OpenAI", 
      {
        token: $openAiApiKey, 
        endpoint: $openAiEndpoint
      }) AS vector
    CALL db.create.setNodeVectorProperty(Talleyrand_Chunk, "textEmbeddingOpenAI", vector) 
    """,
    params={
        "openAiApiKey": OPENAI_API_KEY,
        "openAiEndpoint": OPENAI_ENDPOINT,
    },
)

[]

### Optional: 6.1. If you want to Embed data using HuggingFace use this part

In [10]:
# # I am using T5 base, but you can add more models in the LLM file yourself
# embedd_model = T5.t5_model()
# chunks = kg.query("MATCH (Talleyrand_Chunk:Talleyrand_Chunk) WHERE Talleyrand_Chunk.textEmbedding IS NULL RETURN id(Talleyrand_Chunk) as id, Talleyrand_Chunk.text as text")


# for chunk in chunks:
#     text = chunk["text"]
#     print(f"Embedding Chunk {chunk['id']}")
#     vector = embedd_model.embed([text])

#     vector_list = vector.tolist()
#     print(vector_list)

#     kg.query(
#         """
#         MATCH (Talleyrand_Chunk:Talleyrand_Chunk) WHERE id(Talleyrand_Chunk) = $id
#         SET Talleyrand_Chunk.textEmbedding = $vector
#         """,
#         params={"id": chunk["id"], "vector": vector_list}
#     )

### 9. Finding all Chunks from the same formItem

In [11]:
# Each chunk is a small part of the document. Chunks that belong together share
# the same formItem and are ordered within it by chunkSeqId.
# Because of LIMIT 1, only the first chunk of each formItem is returned here.
cypher = """
  MATCH (from_same_chunk_item:Talleyrand_Chunk)
  WHERE from_same_chunk_item.formItem = $TalleyrandParam
  RETURN from_same_chunk_item {.text, .formItem, .chunkId, .chunkSeqId } as chunkItemInfo
    ORDER BY from_same_chunk_item.chunkSeqId ASC
    LIMIT 1
"""

items = ['General Information', 'Career', 'Death']
for item in items:
    # Run the query once per formItem; the earlier version executed it twice
    # and discarded the first result.
    first_chunk = kg.query(cypher, params={'TalleyrandParam': item})
    print(f"{item} : {first_chunk}")

General Information : [{'chunkItemInfo': {'text': 'Charles-Maurice de Talleyrand-Périgord (/ˌtælɪrænd ˈpɛrɪɡɔːr/, French: [ʃaʁl mɔʁis də tal(ɛ)ʁɑ̃ peʁiɡɔʁ, – moʁ-]; 2 February 1754 – 17 May 1838), 1st Prince of Benevento, then Prince of Talleyrand, was a French secularized clergyman, statesman, and leading diplomat. After studying theology, he became Agent-General of the Clergy in 1780. In 1789, just before the French Revolution, he became Bishop of Autun. He worked at the highest levels of successive French governments, most commonly as foreign minister or in some other diplomatic capacity. His career spanned the regimes of Louis XVI, the years of the French Revolution, Napoleon, Louis XVIII, and Louis Philippe I. Those Talleyrand served often distrusted him but, like Napoleon, found him extremely useful. The name "Talleyrand" has become a byword for crafty and cynical diplomacy.\nHe was Napoleon\'s chief diplomat during the years when French military victories brought one European st

### 10. Adding NEXT relationship

In [13]:
# Link the chunks of each formItem into a chain of NEXT relationships.
# The chunks are collected in ascending chunkSeqId order and handed to
# apoc.nodes.link; avoidDuplicates: true prevents a second NEXT relationship
# between the same pair of nodes when the cell is re-run.
# The query returns how many chunks were collected for the formItem.
cypher = """
  MATCH (from_same_chunk_item:Talleyrand_Chunk)
  WHERE from_same_chunk_item.formItem = $TalleyrandParam
  WITH from_same_chunk_item
    ORDER BY from_same_chunk_item.chunkSeqId ASC
  WITH collect(from_same_chunk_item) as section_chunk_list
    CALL apoc.nodes.link(
        section_chunk_list,
        "NEXT",
        {avoidDuplicates: true}
    )
  RETURN size(section_chunk_list)
"""

items = ['General Information', 'Career', 'Death']
for item in items:
    # This is a write query, so run it exactly once per formItem; the earlier
    # version executed it twice and discarded the first result.
    link_result = kg.query(cypher, params={'TalleyrandParam': item})
    print(f"for {item}: {link_result}")

for General Information: [{'size(section_chunk_list)': 12}]
for Career: [{'size(section_chunk_list)': 13}]
for Death: [{'size(section_chunk_list)': 1}]
