# Lesson 4: Constructing a Knowledge Graph from Text Documents

<p style="background-color:#fd4a6180; padding:15px; margin-left:20px"> <b>Note:</b> This notebook takes about 30 seconds to be ready to use. Please wait until the "Kernel starting, please wait..." message clears from the top of the notebook before running any cells. You may start the video while you wait.</p>

### Import packages and set up Neo4j

In [1]:
from dotenv import load_dotenv
import os

# Common data processing
import json
import textwrap

# Langchain
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_openai import ChatOpenAI
from langchain_core.documents import Document

from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title

from tqdm import tqdm

# Warning control
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load from environment
load_dotenv('.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE') or 'neo4j'

#OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
#OPENAI_ENDPOINT = os.getenv('OPENAI_BASE_URL') + '/embeddings'

# Global constants
VECTOR_INDEX_NAME = 'pdf_chunks'
VECTOR_NODE_LABEL = 'Chunk'
VECTOR_SOURCE_PROPERTY = 'text'
VECTOR_EMBEDDING_PROPERTY = 'embedding'

In [30]:
pdf_file = 'data/EU AI ACT.pdf'
pdf_elements = partition_pdf(pdf_file)

In [31]:
for element in pdf_elements[:10]:
    print(f"{element.category.upper()}: {element.metadata.to_dict()}: {element.text}")

TITLE: {'coordinates': {'points': ((70.90000153, 77.57300124999995), (70.90000153, 109.08000099000003), (202.14800153, 109.08000099000003), (202.14800153, 77.57300124999995)), 'system': 'PixelSpace', 'layout_width': 595.29998779, 'layout_height': 841.90002441}, 'file_directory': 'data', 'filename': 'EU AI ACT.pdf', 'languages': ['eng'], 'last_modified': '2024-04-30T21:54:08', 'page_number': 1, 'filetype': 'application/pdf'}: European Parliament 2019-2024
TITLE: {'coordinates': {'points': ((251.50699616, 139.94899961999988), (251.50699616, 150.94899961999988), (343.78599616, 150.94899961999988), (343.78599616, 139.94899961999988)), 'system': 'PixelSpace', 'layout_width': 595.29998779, 'layout_height': 841.90002441}, 'file_directory': 'data', 'filename': 'EU AI ACT.pdf', 'languages': ['eng'], 'last_modified': '2024-04-30T21:54:08', 'page_number': 1, 'filetype': 'application/pdf'}: TEXTS ADOPTED
UNCATEGORIZEDTEXT: {'coordinates': {'points': ((70.90000153, 187.74199851000003), (70.90000153

In [4]:
pdf_file = 'data/EU AI ACT.pdf'

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000,
    chunk_overlap  = 200,
    length_function = len,
    is_separator_regex = False,
)

def split_pdf_data_from_file(pdf_file):
    chunks_with_metadata = [] # accumlate chunk records

    pdf_elements = partition_pdf(pdf_file)
    elements = chunk_by_title(pdf_elements)

    chunk_seq_id = 0
    for element in tqdm(elements):
        if len(element.text) < 5:
            continue

        chunks = text_splitter.split_text(element.text)

        for chunk in chunks:
            chunks_with_metadata.append({
                'text': chunk,
                'category': element.category,
                'chunk_id': chunk_seq_id,
                'page_number': element.metadata.to_dict()['page_number']
            })
            chunk_seq_id += 1


    return chunks_with_metadata


chunks = split_pdf_data_from_file(pdf_file)
len(chunks)

100%|██████████| 1526/1526 [00:01<00:00, 936.00it/s] 


1526

In [5]:
chunks[0]

{'text': 'European Parliament 2019-2024\n\nTEXTS ADOPTED\n\nP9_TA(2024)0138\n\nArtificial Intelligence Act\n\nEuropean Parliament legislative resolution of 13 March 2024 on the proposal for a regulation of the European Parliament and of the Council on laying down harmonised rules on Artificial Intelligence (Artificial Intelligence Act) and amending certain Union Legislative Acts (COM(2021)0206 – C9-0146/2021 – 2021/0106(COD))',
 'category': 'CompositeElement',
 'chunk_id': 0,
 'page_number': 1}

### Create graph nodes using text chunks

In [6]:
kg = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD, database=NEO4J_DATABASE)

In [7]:
merge_chunk_node_query = """
MERGE(mergedChunk:Chunk {chunkId: $chunkParam.chunk_id})
    ON CREATE SET 
        mergedChunk.text = $chunkParam.text,
        mergedChunk.category = $chunkParam.category,
        mergedChunk.page_number = $chunkParam.page_number
RETURN mergedChunk
"""

kg.query(merge_chunk_node_query, params={'chunkParam':chunks[0]})

[{'mergedChunk': {'page_number': 1,
   'text': 'European Parliament 2019-2024\n\nTEXTS ADOPTED\n\nP9_TA(2024)0138\n\nArtificial Intelligence Act\n\nEuropean Parliament legislative resolution of 13 March 2024 on the proposal for a regulation of the European Parliament and of the Council on laying down harmonised rules on Artificial Intelligence (Artificial Intelligence Act) and amending certain Union Legislative Acts (COM(2021)0206 – C9-0146/2021 – 2021/0106(COD))',
   'category': 'CompositeElement',
   'chunkId': 0}}]

In [8]:
kg.query("""
CREATE CONSTRAINT unique_chunk IF NOT EXISTS 
    FOR (c:Chunk) REQUIRE c.chunk_id IS UNIQUE
""")

kg.query("SHOW INDEXES")

[{'id': 1,
  'name': 'index_343aff4e',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'NODE',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None},
 {'id': 2,
  'name': 'index_f7700477',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'RELATIONSHIP',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None},
 {'id': 3,
  'name': 'unique_chunk',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'RANGE',
  'entityType': 'NODE',
  'labelsOrTypes': ['Chunk'],
  'properties': ['chunk_id'],
  'indexProvider': 'range-1.0',
  'owningConstraint': 'unique_chunk'}]

In [None]:
node_count = 0
for chunk in tqdm(chunks):
    # print(f"Creating `:Chunk` node for chunk ID {chunk['chunk_id']}")
    kg.query(merge_chunk_node_query, 
            params={
                'chunkParam': chunk
            })
    node_count += 1
print(f"Created {node_count} nodes")

In [11]:
kg.query("""
         MATCH (n)
         RETURN count(n) as nodeCount
         """)

[{'nodeCount': 1526}]

### Create a vector index

In [30]:
kg.query("""
    CREATE VECTOR INDEX `pdf_chunks` IF NOT EXISTS
    FOR (c:Chunk) ON (c.embedding) 
    OPTIONS { indexConfig: {
      `vector.dimensions`: 1536,
      `vector.similarity_function`: 'cosine'    
    }}
""")

ValueError: Generated Cypher Statement is not valid
{code: Neo.ClientError.Statement.SyntaxError} {message: Invalid input 'VECTOR': expected "(", "allShortestPaths" or "shortestPath" (line 2, column 8 (offset: 12))
"    CREATE VECTOR INDEX `pdf_chunks` IF NOT EXISTS"
            ^}

In [None]:
kg.query("SHOW INDEXES")

### Calculate embedding vectors for chunks and populate index
- This query calculates the embedding vector and stores it as a property called `textEmbedding` on each `Chunk` node.

In [None]:
kg.query("""
    MATCH (chunk:Chunk) WHERE chunk.embedding IS NULL
    WITH chunk, genai.vector.encode(
      chunk.text, 
      "OpenAI", 
      {
        token: $openAiApiKey, 
        endpoint: $openAiEndpoint
      }) AS vector
    CALL db.create.setNodeVectorProperty(chunk, "textEmbedding", vector)
    """, 
    params={"openAiApiKey":OPENAI_API_KEY, "openAiEndpoint": OPENAI_ENDPOINT} )

In [None]:
kg.refresh_schema()
print(kg.schema)

### Use similarity search to find relevant chunks

- Setup a help function to perform similarity search using the vector index

In [None]:
def neo4j_vector_search(question):
  """Search for similar nodes using the Neo4j vector index"""
  vector_search_query = """
    WITH genai.vector.encode(
      $question, 
      "OpenAI", 
      {
        token: $openAiApiKey,
        endpoint: $openAiEndpoint
      }) AS question_embedding
    CALL db.index.vector.queryNodes($index_name, $top_k, question_embedding) yield node, score
    RETURN score, node.text AS text
  """
  similar = kg.query(vector_search_query, 
                     params={
                      'question': question, 
                      'openAiApiKey':OPENAI_API_KEY,
                      'openAiEndpoint': OPENAI_ENDPOINT,
                      'index_name':VECTOR_INDEX_NAME, 
                      'top_k': 10})
  return similar

- Ask a question!

In [None]:
search_results = neo4j_vector_search(
    'In a single sentence, tell me about EU AI ACT.'
)

In [None]:
search_results[0]

### Set up a LangChain RAG workflow to chat with the form

In [None]:
neo4j_vector_store = Neo4jVector.from_existing_graph(
    embedding=OpenAIEmbeddings(),
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name=VECTOR_INDEX_NAME,
    node_label=VECTOR_NODE_LABEL,
    text_node_properties=[VECTOR_SOURCE_PROPERTY],
    embedding_node_property=VECTOR_EMBEDDING_PROPERTY,
)


In [None]:
retriever = neo4j_vector_store.as_retriever()

In [None]:
chain = RetrievalQAWithSourcesChain.from_chain_type(
    ChatOpenAI(temperature=0), 
    chain_type="stuff", 
    retriever=retriever
)

In [None]:
def prettychain(question: str) -> str:
    """Pretty print the chain's response to a question"""
    response = chain({"question": question},
        return_only_outputs=True,)
    print(textwrap.fill(response['answer'], 60))

In [None]:
prettychain("""
    Tell me about EU AI ACT. 
    Limit your answer to a single sentence.
""")