# Lesson 4: Constructing a Knowledge Graph from Text Documents

<p style="background-color:#fd4a6180; padding:15px; margin-left:20px"> <b>Note:</b> This notebook takes about 30 seconds to be ready to use. Please wait until the "Kernel starting, please wait..." message clears from the top of the notebook before running any cells. You may start the video while you wait.</p>

### Import packages and set up Neo4j

In [1]:
from dotenv import load_dotenv
import os

# Common data processing
import json
import textwrap

# Langchain
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_openai import ChatOpenAI
from langchain_core.documents import Document

from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title

from tqdm import tqdm

# Warning control
import warnings
warnings.filterwarnings("ignore")

In [33]:
# Load from environment
load_dotenv('.env', override=True)

NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE') or 'neo4j'

GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')

VECTOR_INDEX_NAME = 'pdf_chunks'
VECTOR_NODE_LABEL = 'Chunk'
VECTOR_SOURCE_PROPERTY = 'text'
VECTOR_EMBEDDING_PROPERTY = 'embedding'

In [30]:
pdf_file = 'data/EU AI ACT.pdf'
pdf_elements = partition_pdf(pdf_file)

In [31]:
for element in pdf_elements[:10]:
    print(f"{element.category.upper()}: {element.metadata.to_dict()}: {element.text}")

TITLE: {'coordinates': {'points': ((70.90000153, 77.57300124999995), (70.90000153, 109.08000099000003), (202.14800153, 109.08000099000003), (202.14800153, 77.57300124999995)), 'system': 'PixelSpace', 'layout_width': 595.29998779, 'layout_height': 841.90002441}, 'file_directory': 'data', 'filename': 'EU AI ACT.pdf', 'languages': ['eng'], 'last_modified': '2024-04-30T21:54:08', 'page_number': 1, 'filetype': 'application/pdf'}: European Parliament 2019-2024
TITLE: {'coordinates': {'points': ((251.50699616, 139.94899961999988), (251.50699616, 150.94899961999988), (343.78599616, 150.94899961999988), (343.78599616, 139.94899961999988)), 'system': 'PixelSpace', 'layout_width': 595.29998779, 'layout_height': 841.90002441}, 'file_directory': 'data', 'filename': 'EU AI ACT.pdf', 'languages': ['eng'], 'last_modified': '2024-04-30T21:54:08', 'page_number': 1, 'filetype': 'application/pdf'}: TEXTS ADOPTED
UNCATEGORIZEDTEXT: {'coordinates': {'points': ((70.90000153, 187.74199851000003), (70.90000153

In [3]:
pdf_file = 'data/EU AI ACT.pdf'

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000,
    chunk_overlap  = 200,
    length_function = len,
    is_separator_regex = False,
)

def split_pdf_data_from_file(pdf_file):
    chunks_with_metadata = [] # accumlate chunk records

    pdf_elements = partition_pdf(pdf_file)
    elements = chunk_by_title(pdf_elements)

    chunk_seq_id = 0
    for element in tqdm(elements):
        if len(element.text) < 5:
            continue

        chunks = text_splitter.split_text(element.text)

        for chunk in chunks:
            chunks_with_metadata.append({
                'text': chunk,
                'source': pdf_file,
                'category': element.category,
                'chunk_id': chunk_seq_id,
                'page_number': element.metadata.to_dict()['page_number']
            })
            chunk_seq_id += 1


    return chunks_with_metadata


chunks = split_pdf_data_from_file(pdf_file)
len(chunks)

100%|██████████| 1526/1526 [00:01<00:00, 1152.85it/s]


1526

In [4]:
chunks[0]

{'text': 'European Parliament 2019-2024\n\nTEXTS ADOPTED\n\nP9_TA(2024)0138\n\nArtificial Intelligence Act\n\nEuropean Parliament legislative resolution of 13 March 2024 on the proposal for a regulation of the European Parliament and of the Council on laying down harmonised rules on Artificial Intelligence (Artificial Intelligence Act) and amending certain Union Legislative Acts (COM(2021)0206 – C9-0146/2021 – 2021/0106(COD))',
 'category': 'CompositeElement',
 'chunk_id': 0,
 'page_number': 1}

### Create graph nodes using text chunks

In [14]:
kg = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD, database=NEO4J_DATABASE)

In [7]:
merge_chunk_node_query = """
MERGE(mergedChunk:Chunk {chunkId: $chunkParam.chunk_id})
    ON CREATE SET 
        mergedChunk.text = $chunkParam.text,
        mergedChunk.category = $chunkParam.category,
        mergedChunk.page_number = $chunkParam.page_number
RETURN mergedChunk
"""

kg.query(merge_chunk_node_query, params={'chunkParam':chunks[0]})

[{'mergedChunk': {'page_number': 1,
   'text': 'European Parliament 2019-2024\n\nTEXTS ADOPTED\n\nP9_TA(2024)0138\n\nArtificial Intelligence Act\n\nEuropean Parliament legislative resolution of 13 March 2024 on the proposal for a regulation of the European Parliament and of the Council on laying down harmonised rules on Artificial Intelligence (Artificial Intelligence Act) and amending certain Union Legislative Acts (COM(2021)0206 – C9-0146/2021 – 2021/0106(COD))',
   'category': 'CompositeElement',
   'chunkId': 0}}]

In [8]:
kg.query("""
CREATE CONSTRAINT unique_chunk IF NOT EXISTS 
    FOR (c:Chunk) REQUIRE c.chunk_id IS UNIQUE
""")

kg.query("SHOW INDEXES")

[{'id': 1,
  'name': 'index_343aff4e',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'NODE',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None},
 {'id': 2,
  'name': 'index_f7700477',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'RELATIONSHIP',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None},
 {'id': 3,
  'name': 'unique_chunk',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'RANGE',
  'entityType': 'NODE',
  'labelsOrTypes': ['Chunk'],
  'properties': ['chunk_id'],
  'indexProvider': 'range-1.0',
  'owningConstraint': 'unique_chunk'}]

In [None]:
node_count = 0
for chunk in tqdm(chunks):
    # print(f"Creating `:Chunk` node for chunk ID {chunk['chunk_id']}")
    kg.query(merge_chunk_node_query, 
            params={
                'chunkParam': chunk
            })
    node_count += 1
print(f"Created {node_count} nodes")

In [15]:
kg.query("""
         MATCH (n)
         RETURN count(n) as nodeCount
         """)

[{'nodeCount': 1526}]

### Create a vector index

In [35]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GEMINI_API_KEY)

In [None]:
# Create a Neo4jVector instance
vector_store = Neo4jVector(
    embedding=embeddings,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
    index_name=VECTOR_INDEX_NAME,
    node_label=VECTOR_NODE_LABEL
)

# Create the vector index
vector_store.create_new_index()

# Add the chunks to the vector index
for i, record in enumerate(tqdm(chunks)):
    properties = {
        'text': record['text'],
        'category': record['category'],
        'chunk_id': record['chunk_id'],
        'page_number': record['page_number']
    }
    vector_store.add_texts(properties)

print(f"Added {len(chunks)} nodes to the vector index")

In [49]:
print(kg.query("SHOW INDEXES"))

[{'id': 1, 'name': 'index_343aff4e', 'state': 'ONLINE', 'populationPercent': 100.0, 'type': 'LOOKUP', 'entityType': 'NODE', 'labelsOrTypes': None, 'properties': None, 'indexProvider': 'token-lookup-1.0', 'owningConstraint': None, 'lastRead': neo4j.time.DateTime(2024, 5, 1, 21, 4, 6, 186000000, tzinfo=<UTC>), 'readCount': 693}, {'id': 2, 'name': 'index_f7700477', 'state': 'ONLINE', 'populationPercent': 100.0, 'type': 'LOOKUP', 'entityType': 'RELATIONSHIP', 'labelsOrTypes': None, 'properties': None, 'indexProvider': 'token-lookup-1.0', 'owningConstraint': None, 'lastRead': None, 'readCount': 0}, {'id': 5, 'name': 'pdf_chunks', 'state': 'ONLINE', 'populationPercent': 100.0, 'type': 'VECTOR', 'entityType': 'NODE', 'labelsOrTypes': ['Chunk'], 'properties': ['embedding'], 'indexProvider': 'vector-1.0', 'owningConstraint': None, 'lastRead': neo4j.time.DateTime(2024, 5, 1, 20, 59, 34, 888000000, tzinfo=<UTC>), 'readCount': 2}, {'id': 3, 'name': 'unique_chunk', 'state': 'ONLINE', 'populationPer

In [50]:
kg.refresh_schema()
print(kg.schema)

Node properties:
Chunk {category: STRING, chunkId: INTEGER, text: STRING, page_number: INTEGER, embedding: LIST, id: STRING}
Relationship properties:

The relationships:



### Set up a LangChain RAG workflow to chat with the form

In [52]:
neo4j_vector_store = Neo4jVector.from_existing_graph(
    #embedding=OpenAIEmbeddings(),
    embedding=embeddings,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name=VECTOR_INDEX_NAME,
    node_label=VECTOR_NODE_LABEL,
    text_node_properties=[VECTOR_SOURCE_PROPERTY],
    embedding_node_property=VECTOR_EMBEDDING_PROPERTY,
)


GoogleGenerativeAIError: Error embedding content: 429 Quota exceeded for quota metric 'Batch Embed Content API requests' and limit 'Batch embed contents request limit per minute for a region' of service 'generativelanguage.googleapis.com' for consumer 'project_number:274678803106'. [reason: "RATE_LIMIT_EXCEEDED"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "generativelanguage.googleapis.com"
}
metadata {
  key: "quota_metric"
  value: "generativelanguage.googleapis.com/batch_embed_contents_requests"
}
metadata {
  key: "quota_location"
  value: "us-east2"
}
metadata {
  key: "quota_limit"
  value: "BatchEmbedContentsRequestsPerMinutePerProjectPerRegion"
}
metadata {
  key: "quota_limit_value"
  value: "150"
}
metadata {
  key: "consumer"
  value: "projects/274678803106"
}
, links {
  description: "Request a higher quota limit."
  url: "https://cloud.google.com/docs/quota#requesting_higher_quota"
}
]

In [42]:
retriever = neo4j_vector_store.as_retriever()

In [43]:
from langchain_google_genai import ChatGoogleGenerativeAI

model = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.7, top_p=0.85, google_api_key=GEMINI_API_KEY)

In [44]:
chain = RetrievalQAWithSourcesChain.from_chain_type(
    #ChatOpenAI(temperature=0), 
    model,
    chain_type="stuff", 
    retriever=retriever
)

In [45]:
def prettychain(question: str) -> str:
    response = chain({"question": question},
        return_only_outputs=True,)
    print(textwrap.fill(response['answer'], 60))

In [46]:
prettychain("""
    Tell me about EU AI ACT. 
    Limit your answer to a single sentence.
""")

ValueError: Document prompt requires documents to have metadata variables: ['source']. Received document with missing metadata: ['source'].