## Install dependent packages. For unstructured.io, only the PDF-related extras are installed for now.

In [None]:

!pip install neo4j
!pip install unstructured
!pip install "unstructured[pdf]"


# Install common language packages from NLTK.

In [None]:
!pip install --user -U nltk
!python -m nltk.downloader popular

# Install **poppler** and **tesseract**, which are required by pdf2image. The following commands are for Linux and macOS only. For Windows and other platforms, please refer to:
- https://pdf2image.readthedocs.io/en/latest/installation.html
- https://tesseract-ocr.github.io/tessdoc/Installation.html

In [None]:
!brew install poppler

In [None]:
!brew install tesseract


In [None]:

!pip install tesseract neo4j "unstructured[pdf]" python-dotenv 

In [17]:
from neo4j import GraphDatabase
import uuid
import hashlib
import os
from dotenv import load_dotenv

# Read settings from the local .env file; values found there take precedence
# over variables already present in the process environment.
load_dotenv('.env', override=True)

# Neo4j connection settings. The database name falls back to 'neo4j' when the
# variable is unset or empty.
NEO4J_URL = os.environ.get('NEO4J_URI')
NEO4J_USER = os.environ.get('NEO4J_USERNAME')
NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD')
NEO4J_DATABASE = os.environ.get('NEO4J_DATABASE') or 'neo4j'

# API key handed to the Gemini embedding client further down.
GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY')

In [10]:

def initialiseNeo4j(vector_dimensions=768):
    """Create the uniqueness constraints and the vector index used by this notebook.

    Args:
        vector_dimensions: Size of the vectors stored in ``Embedding.value``.
            It must match the output size of the embedding model that fills
            that property. The default of 768 matches the Gemini
            ``models/embedding-001`` model used later in this notebook; pass
            1536 to reproduce the previous hard-coded value.

    The constraint statements use ``IF NOT EXISTS`` and can be re-run safely.
    NOTE(review): the vector-index procedure call has no such guard, so
    presumably it fails when the index already exists - confirm before
    re-running this function against an initialised database.
    """
    constraint_statements = [
        "CREATE CONSTRAINT sectionKey IF NOT EXISTS FOR (c:Section) REQUIRE (c.key) IS UNIQUE;",
        "CREATE CONSTRAINT chunkKey IF NOT EXISTS FOR (c:Chunk) REQUIRE (c.key) IS UNIQUE;",
        "CREATE CONSTRAINT documentKey IF NOT EXISTS FOR (c:Document) REQUIRE (c.url_hash) IS UNIQUE;",
        "CREATE CONSTRAINT tableKey IF NOT EXISTS FOR (c:Table) REQUIRE (c.key) IS UNIQUE;",
        "CREATE CONSTRAINT elementKey IF NOT EXISTS FOR (c:Element) REQUIRE (c.key) IS UNIQUE;",
    ]
    # The dimension is passed as a query parameter rather than baked into the text.
    vector_index_statement = (
        "CALL db.index.vector.createNodeIndex('chunkVectorIndex', 'Embedding', 'value', $dimensions, 'COSINE');"
    )

    driver = GraphDatabase.driver(NEO4J_URL, database=NEO4J_DATABASE, auth=(NEO4J_USER, NEO4J_PASSWORD))
    try:
        with driver.session() as session:
            for cypher in constraint_statements:
                session.run(cypher).consume()
            session.run(vector_index_statement, dimensions=int(vector_dimensions)).consume()
    finally:
        # Release the connection pool even when a statement fails.
        driver.close()

def ingestDocumentNeo4j(elements, doc_location):
    """Load partitioned document elements into Neo4j as a document graph.

    One ``Document`` node is merged for ``doc_location`` (keyed by the MD5 hex
    digest of that string). Each element is then merged as a node according to
    its ``category`` and linked to its parent element, or to the Document when
    it has no parent:

    * ``Title`` -> ``Section``
    * ``NarrativeText`` / ``List`` / ``ListItem`` / ``UncategorizedText`` /
      ``Header`` -> ``Chunk``
    * ``Table`` -> ``Table``
    * ``Image`` -> ``Image`` (always linked to the Document)

    Elements of any other category are ignored. A per-type summary is printed
    at the end.

    Args:
        elements: Sequence of objects exposing ``category``, ``id``, ``text``
            and ``metadata`` (with ``page_number``, ``parent_id``,
            ``last_modified`` and, for tables, ``text_as_html``). In this
            notebook it is the result of ``partition_pdf``. An empty sequence
            creates only the Document node.
        doc_location: Location string of the source document; stored as the
            Document ``url`` and hashed to form its key.
    """
    cypher_pool = [
        # 0 - Document
        "MERGE (d:Document {url_hash: $doc_url_hash_val}) ON CREATE SET d.url = $doc_url_val, d.last_modified = $doc_last_modified_val RETURN d;",
        # 1 - Section
        "MERGE (p:Section {key: $element_id_val}) ON CREATE SET p:Element, p.page_idx = $page_idx_val, p.title_hash = $title_hash_val, p.block_idx = $block_idx_val, p.title = $title_val, p.tag = $tag_val RETURN p;",
        # 2 - Link Section with the Document
        "MATCH (d:Document {url_hash: $doc_url_hash_val}) MATCH (s:Section {key: $element_id_val}) MERGE (d)<-[:HAS_DOCUMENT]-(s);",
        # 3 - Link Section with a parent Element
        "MATCH (s1:Section {key: $element_id_val}) MATCH (s2:Element {key: $sec_parent_element_id_val}) MERGE (s2)<-[:UNDER_SECTION]-(s1);",
        # 4 - Chunk
        "MERGE (c:Chunk {key: $element_id_val}) ON CREATE SET c:Element, c.sentences = $sentences_val, c.sentences_hash = $sentences_hash_val, c.block_idx = $block_idx_val, c.page_idx = $page_idx_val, c.tag = $tag_val RETURN c;",
        # 5 - Link Chunk to another element
        "MATCH (c:Chunk {key: $element_id_val}) MATCH (s:Element {key:$chk_parent_element_id_val}) MERGE (s)<-[:HAS_PARENT]-(c);",
        # 6 - Table
        "MERGE (t:Table {key: $element_id_val}) ON CREATE SET t:Element, t.name = $name_val, t.doc_url_hash = $doc_url_hash_val, t.block_idx = $block_idx_val, t.page_idx = $page_idx_val, t.html = $html_val, t.rows = $rows_val RETURN t;",
        # 7 - Link Table to Section
        "MATCH (t:Table {key: $element_id_val}) MATCH (s:Section {key: $tb_parent_element_id_val}) MERGE (s)<-[:HAS_PARENT]-(t);",
        # 8 - Link Table to Document
        "MATCH (t:Table {key: $element_id_val}) MATCH (s:Document {url_hash: $doc_url_hash_val}) MERGE (s)<-[:HAS_PARENT]-(t);",
        # 9 - Image
        "MERGE (t:Image {key: $element_id_val}) ON CREATE SET t:Element, t.name = $name_val, t.doc_url_hash = $doc_url_hash_val, t.block_idx = $block_idx_val, t.page_idx = $page_idx_val RETURN t;",
        # 10 - Link Image to Document
        "MATCH (t:Image {key: $element_id_val}) MATCH (s:Document {url_hash: $doc_url_hash_val}) MERGE (s)<-[:HAS_PARENT]-(t);",
        # 11 - Link top Chunk to Document
        "MATCH (t:Chunk {key: $element_id_val}) MATCH (s:Document {url_hash: $doc_url_hash_val}) MERGE (s)<-[:HAS_PARENT]-(t);"
    ]

    # Element categories that are stored as Chunk nodes.
    chunk_categories = ('NarrativeText', 'List', 'ListItem', 'UncategorizedText', 'Header')

    doc_url_val = doc_location
    doc_url_hash_val = hashlib.md5(doc_url_val.encode("utf-8")).hexdigest()
    # The first element supplies the document timestamp; an empty parse has none.
    doc_last_modified_val = elements[0].metadata.last_modified if elements else None

    count_section = 0
    count_chunk = 0
    count_table = 0
    count_image = 0

    driver = GraphDatabase.driver(NEO4J_URL, database=NEO4J_DATABASE, auth=(NEO4J_USER, NEO4J_PASSWORD))
    try:
        with driver.session() as session:
            # 1 - Create Document node
            session.run(
                cypher_pool[0],
                doc_url_hash_val=doc_url_hash_val,
                doc_url_val=doc_url_val,
                doc_last_modified_val=doc_last_modified_val,
            )

            # The element's position in the list is stored as its block index.
            for block_idx_val, sec in enumerate(elements):
                tag_val = sec.category
                page_idx_val = sec.metadata.page_number
                element_id_val = sec.id
                text_val = sec.text
                text_hash_val = hashlib.md5(text_val.encode("utf-8")).hexdigest()
                parent_id = sec.metadata.parent_id
                has_parent = parent_id is not None
                parent_id_val = str(parent_id)

                if tag_val == 'Title':
                    # 2 - Section node, linked to its parent element or the Document
                    session.run(
                        cypher_pool[1],
                        element_id_val=element_id_val,
                        page_idx_val=page_idx_val,
                        title_hash_val=text_hash_val,
                        block_idx_val=block_idx_val,
                        title_val=text_val,
                        tag_val=tag_val,
                    )
                    if has_parent:
                        session.run(
                            cypher_pool[3],
                            element_id_val=element_id_val,
                            sec_parent_element_id_val=parent_id_val,
                        )
                    else:
                        session.run(
                            cypher_pool[2],
                            doc_url_hash_val=doc_url_hash_val,
                            element_id_val=element_id_val,
                        )
                    count_section += 1

                elif tag_val in chunk_categories:
                    # 3 - Chunk node, linked to its parent element or the Document
                    session.run(
                        cypher_pool[4],
                        element_id_val=element_id_val,
                        sentences_val=text_val,
                        sentences_hash_val=text_hash_val,
                        block_idx_val=block_idx_val,
                        page_idx_val=page_idx_val,
                        tag_val=tag_val,
                    )
                    if has_parent:
                        session.run(
                            cypher_pool[5],
                            element_id_val=element_id_val,
                            chk_parent_element_id_val=parent_id_val,
                        )
                    else:
                        session.run(
                            cypher_pool[11],
                            doc_url_hash_val=doc_url_hash_val,
                            element_id_val=element_id_val,
                        )
                    count_chunk += 1

                elif tag_val == 'Table':
                    # 4 - Table node, linked to its parent Section or the Document
                    html_val = sec.metadata.text_as_html
                    # One closing </tr> per row; a table without HTML has no rows.
                    rows_val = html_val.count('</tr>') if html_val else 0
                    session.run(
                        cypher_pool[6],
                        element_id_val=element_id_val,
                        name_val=text_val,
                        doc_url_hash_val=doc_url_hash_val,
                        block_idx_val=block_idx_val,
                        page_idx_val=page_idx_val,
                        html_val=html_val,
                        rows_val=rows_val,
                    )
                    if has_parent:
                        session.run(
                            cypher_pool[7],
                            element_id_val=element_id_val,
                            tb_parent_element_id_val=parent_id_val,
                        )
                    else:
                        session.run(
                            cypher_pool[8],
                            doc_url_hash_val=doc_url_hash_val,
                            element_id_val=element_id_val,
                        )
                    count_table += 1

                elif tag_val == 'Image':
                    # 5 - Image node; images are always linked to the Document
                    session.run(
                        cypher_pool[9],
                        element_id_val=element_id_val,
                        name_val=text_val,
                        doc_url_hash_val=doc_url_hash_val,
                        block_idx_val=block_idx_val,
                        page_idx_val=page_idx_val,
                    )
                    session.run(
                        cypher_pool[10],
                        element_id_val=element_id_val,
                        doc_url_hash_val=doc_url_hash_val,
                    )
                    count_image += 1

            print(f'\'{doc_url_val}\' Done! Summary: ')
            print('#Sections: ' + str(count_section))
            print('#Chunks: ' + str(count_chunk))
            print('#Tables: ' + str(count_table))
            print('#Images: ' + str(count_image))
    finally:
        # Release the connection pool even when a query fails part-way through.
        driver.close()


# *** def ingestDocumentNeo4j(elements, doc_location):


In [None]:

# create constraints and indexes. only need to execute once.

initialiseNeo4j()

In [12]:
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import convert_to_dict
from unstructured.staging.base import elements_to_json


# Source PDF: directory and file name are kept separate so both can be reused.
doc_location = "data"   # replace this to your document location
doc_file_name = "EU AI ACT.pdf" # replace this to your document file name
doc_url = f"{doc_location}/{doc_file_name}"

# Split the PDF into elements (with table structure inference enabled),
# then load those elements into Neo4j under the same path.
elements = partition_pdf(filename=doc_url, infer_table_structure=True)
ingestDocumentNeo4j(elements, doc_url)

# DONE


This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name
Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSe

'data/EU AI ACT.pdf' Done! Summary: 
#Sections: 397
#Chunks: 2708
#Tables: 0
#Images: 1


In [13]:
# Persist the partitioned elements next to the source PDF as "<pdf name>.json".
# The return value of convert_to_dict is not used here.
convert_to_dict(elements)

filename = f"{doc_location}/{doc_file_name}.json"
elements_to_json(elements, filename=filename)


In [None]:
from tqdm import tqdm
from neo4j import GraphDatabase
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from time import sleep

"""
LoadEmbedding: call embedding API to generate embeddings for each property of node in Neo4j
Version: 1.1
"""

# Shared Gemini embedding client; LoadEmbedding calls e.embed_query() on it.
# NOTE(review): the vector index dimension configured in initialiseNeo4j must
# match the output size of this model - confirm the two agree.
e = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GEMINI_API_KEY)


def LoadEmbedding(label, property):
    """Generate and store an embedding for every `label` node under a Section.

    For each node with the given label that has a ``HAS_PARENT`` relationship
    to a ``Section``, the text ``"<section title> >> <node property>"`` is
    embedded with the module-level client ``e`` and stored on a new
    ``Embedding`` node linked via ``HAS_EMBEDDING``.

    Args:
        label: Node label to process (e.g. "Chunk"). It is interpolated into
            the Cypher text, so it must be a trusted identifier.
        property: Name of the text property to embed. It is interpolated the
            same way, and is also stored as the ``key`` of the Embedding node.
            The parameter name shadows the builtin but is kept for callers.

    Returns:
        The number of nodes processed.

    Each call CREATEs new Embedding nodes, so running it again for the same
    label and property adds a second set rather than updating the first.
    """
    fetch_cypher = f"MATCH (ch:{label}) -[:HAS_PARENT]-> (s:Section) RETURN id(ch) AS id, s.title + ' >> ' + ch.{property} AS text"
    # key property of Embedding node differentiates different embeddings
    store_cypher = (
        "CREATE (e:Embedding) SET e.key=$key, e.value=$embedding"
        " WITH e MATCH (n) WHERE id(n) = $id CREATE (n) -[:HAS_EMBEDDING]-> (e)"
    )

    count = 0
    driver = GraphDatabase.driver(NEO4J_URL, auth=(NEO4J_USER, NEO4J_PASSWORD), database=NEO4J_DATABASE)
    try:
        with driver.session() as session:
            # Read all source rows first: the same session then issues the
            # write queries, and a list gives tqdm a total for its progress bar.
            records = list(session.run(fetch_cypher))

            for record in tqdm(records):
                node_id = record["id"]
                text = record["text"]

                # For better performance, text can be batched
                embedding = e.embed_query(text)
                # Short pause between embedding API calls.
                sleep(0.1)

                session.run(store_cypher, key=property, embedding=embedding, id=node_id).consume()
                count += 1
    finally:
        # Release the connection pool even when embedding or a query fails.
        driver.close()

    print("Processed " + str(count) + " " + label + " nodes for property @" + property + ".")
    return count
    


# Embed the `sentences` text of every Chunk node that sits under a Section.
LoadEmbedding("Chunk", "sentences")

# Embed the `name` text of every Table node that sits under a Section.
LoadEmbedding("Table", "name")