### Write Documents to Neo4j using haystack integration

In [1]:
from pathlib import Path
from haystack.components.converters import TextFileToDocument

converter = TextFileToDocument()

docs = converter.run(sources=[Path("Dinosaur.html")])
print(docs)

  from .autonotebook import tqdm as notebook_tqdm


{'documents': [Document(id=1841e79f27771b7e59691e069d4b42204bbe5ea1093e15aa20aeeb86127557ec, content: '<link
  rel="mw-deduplicated-inline-style"
  href="mw-data:TemplateStyles:r1236090951"
/>
<p class="...', meta: {'file_path': 'Dinosaur.html'})]}


In [2]:
from bs4 import BeautifulSoup
import re

def WikiPageChunks(html_str: str) -> list:
    soup = BeautifulSoup(html_str, 'html.parser')  # Parse the HTML content

    chunks = []

    def clean_text(text):
        cleaned_text = text.replace('\n', ' ').replace('\xa0', ' ')  # Replace newlines and non-breaking spaces with regular spaces
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Replace multiple spaces with a single space
        cleaned_text = re.sub(r'(\d)\s+(\d)', r'\1\2', cleaned_text)  # Remove spaces between digits
        cleaned_text = cleaned_text.strip()  # Remove leading and trailing spaces
        return cleaned_text

    html_soup = soup.body or soup  # Use the body of the HTML if it exists, otherwise use the whole soup
    nested = ['ul', 'ol', 'dl', 'li', 'dt', 'dd']  # Tags that represent nested lists
    for tag in html_soup.find_all(recursive=False):  # Iterate over top-level tags in the HTML
        if tag.name == 'p':
            chunks.append(clean_text(tag.get_text(separator=' ')))  # Clean and add paragraph text to chunks
        elif tag.name == 'link':
            continue  # Skip link tags
        elif tag.name in nested:
            list_items = tag.find_all('li')  # Find all list items
            for li in list_items:
                chunks.append(clean_text(li.get_text(separator=' ')))  # Clean and add each list item text to chunks
        else:
            chunks.append(str(tag))  # Add other tags as strings
    
    return chunks

In [3]:
html_content = docs["documents"][0].content

In [4]:
from typing import List, Dict, Any
import uuid
from haystack import Document
from haystack import component

class Chunk:
    def __init__(self, id: str, next: str = None):
        self.id = id
        self.next = next

class Section:
    def __init__(self, name: str, type: str):
        self.name = name
        self.type = type
        self.chunks = []
        self.sections = []
        self.first_chunk = None

class Page:
    def __init__(self, title: str):
        self.title = title
        self.sections = []
        self.chunks = []
        self.first_chunk = None

@component
class WikiPageChunker:
    """
    A component that splits the content of Wikipedia pages into chunks.
    - The document content is expected to be in HTML format fetched via wikipediaapi and
    which has been run through TextFileToDocument converter.
    - Each chunk is a paragraph, list, or table in the Wikipedia page.
    - Each chunk is stored as a separate document with text in 'content' field
    - Each chunk also stores title, h2, h3 etc in meta field.
    - Custom component also creates a hierarchical structure of the chunks based on title, h2, h3 etc.
    """
    @component.output_types(documents=List[Document], hierarchy=dict)
    def run(self, documents: List[Document]):
        chunks = []
        hierarchy = {}
        
        for doc in documents:
            page_title = doc.meta["file_path"].replace(".html", "")
            page = Page(page_title)
            
            html_content = doc.content
            page_chunks = WikiPageChunks(html_content)
            i = 0
            current_h2 = ""
            current_h3 = ""
            current_h4 = ""
            current_section = None
            current_sub_section = None
            current_sub_sub_section = None

            for chunk in page_chunks:
                if chunk == "":
                    continue
                if chunk.startswith("<h2>"):
                    current_h2 = chunk[4:-5]  # Extract text between <h2> and </h2>
                    current_h3 = ""  # Reset h3 when a new h2 is found
                    current_h4 = ""  # Reset h4 when a new h2 is found
                    current_section = Section(current_h2, "h2")
                    page.sections.append(current_section)
                    current_sub_section = None
                    current_sub_sub_section = None
                elif chunk.startswith("<h3>"):
                    current_h3 = chunk[4:-5]  # Extract text between <h3> and </h3>
                    current_h4 = ""  # Reset h4 when a new h3 is found
                    if current_section:
                        current_sub_section = Section(current_h3, "h3")
                        current_section.sections.append(current_sub_section)
                        current_sub_sub_section = None
                elif chunk.startswith("<h4>"):
                    current_h4 = chunk[4:-5]  # Extract text between <h4> and </h4>
                    if current_sub_section:
                        current_sub_sub_section = Section(current_h4, "h4")
                        current_sub_section.sections.append(current_sub_sub_section)
                else:
                    meta = {
                        "file_path": doc.meta["file_path"],
                        "source_id": doc.id,
                        "split_id": i,
                        "title": "Dinosaurs"
                    }
                    if current_h2:
                        meta["h2"] = current_h2
                    if current_h3:
                        meta["h3"] = current_h3
                    if current_h4:
                        meta["h4"] = current_h4

                    chunk_obj = Chunk(str(uuid.uuid4()))
                    chunks.append(
                        Document(
                            id=chunk_obj.id,
                            content=chunk,
                            meta=meta
                        )
                    )
                    if current_sub_sub_section:
                        current_sub_sub_section.chunks.append(chunk_obj)
                        if current_sub_sub_section.first_chunk is None:
                            current_sub_sub_section.first_chunk = chunk_obj
                    elif current_sub_section:
                        current_sub_section.chunks.append(chunk_obj)
                        if current_sub_section.first_chunk is None:
                            current_sub_section.first_chunk = chunk_obj
                    elif current_section:
                        current_section.chunks.append(chunk_obj)
                        if current_section.first_chunk is None:
                            current_section.first_chunk = chunk_obj
                    else:
                        page.chunks.append(chunk_obj)
                        if page.first_chunk is None:
                            page.first_chunk = chunk_obj
                    i += 1
            
            self.set_next_chunks(page)
            hierarchy[page_title] = self.page_to_dict(page)
        
        return {"documents": chunks, "hierarchy": hierarchy}

    def set_next_chunks(self, page: Page):
        for section in page.sections:
            self.set_next_chunks_in_section(section)
        self.set_next_in_list(page.chunks)

    def set_next_chunks_in_section(self, section: Section):
        for sub_section in section.sections:
            self.set_next_chunks_in_section(sub_section)
        self.set_next_in_list(section.chunks)

    def set_next_in_list(self, chunks: List[Chunk]):
        for i in range(len(chunks) - 1):
            chunks[i].next = chunks[i + 1].id
        if chunks:
            chunks[-1].next = None

    def page_to_dict(self, page: Page) -> Dict[str, Any]:
        return {
            "title": page.title,
            "sections": [self.section_to_dict(section) for section in page.sections],
            "chunks": [self.chunk_to_dict(chunk) for chunk in page.chunks],
            "first_chunk": self.chunk_to_dict(page.first_chunk) if page.first_chunk else None
        }

    def section_to_dict(self, section: Section) -> Dict[str, Any]:
        return {
            "name": section.name,
            "type": section.type,
            "chunks": [self.chunk_to_dict(chunk) for chunk in section.chunks],
            "sections": [self.section_to_dict(sub_section) for sub_section in section.sections],
            "first_chunk": self.chunk_to_dict(section.first_chunk) if section.first_chunk else None
        }

    def chunk_to_dict(self, chunk: Chunk) -> Dict[str, Any]:
        return {
            "id": chunk.id,
            "next": chunk.next
        }

In [5]:
splitter = WikiPageChunker()

chunk_result = splitter.run(docs["documents"])

chunk_result["documents"]

[Document(id=f7fa2921-9f2c-48b0-82d3-b2d5a99bf35f, content: 'Dinosaurs are a diverse group of reptiles of the clade Dinosauria . They first appeared during the T...', meta: {'file_path': 'Dinosaur.html', 'source_id': '1841e79f27771b7e59691e069d4b42204bbe5ea1093e15aa20aeeb86127557ec', 'split_id': 0, 'title': 'Dinosaurs'}),
 Document(id=f059e2c4-f7ba-4766-ae1c-3704d74217d0, content: 'Dinosaurs are varied from taxonomic, morphological and ecological standpoints. Birds, at over 11,000...', meta: {'file_path': 'Dinosaur.html', 'source_id': '1841e79f27771b7e59691e069d4b42204bbe5ea1093e15aa20aeeb86127557ec', 'split_id': 1, 'title': 'Dinosaurs'}),
 Document(id=e8d9734e-3637-402c-9cbc-fe17fe6eaa57, content: 'While dinosaurs were ancestrally bipedal, many extinct groups included quadrupedal species, and some...', meta: {'file_path': 'Dinosaur.html', 'source_id': '1841e79f27771b7e59691e069d4b42204bbe5ea1093e15aa20aeeb86127557ec', 'split_id': 2, 'title': 'Dinosaurs'}),
 Document(id=cb9048a1-5f85-4

In [7]:
hierarchy = chunk_result["hierarchy"]

hierarchy

{'Dinosaur': {'title': 'Dinosaur',
  'sections': [{'name': 'Definition',
    'type': 'h2',
    'chunks': [{'id': '3de69bd0-387d-4564-b970-bc49ff198e5b',
      'next': 'c4fb77ea-688d-4139-b6a7-17f19e975ef8'},
     {'id': 'c4fb77ea-688d-4139-b6a7-17f19e975ef8',
      'next': 'fa3eab62-38be-48e0-b983-4b577efcfa7e'},
     {'id': 'fa3eab62-38be-48e0-b983-4b577efcfa7e', 'next': None}],
    'sections': [{'name': 'General description',
      'type': 'h3',
      'chunks': [{'id': '06ed7687-dbe6-4d97-8a5b-2fe5d070d50d',
        'next': '4aea36f4-d9a3-4d26-9625-4d6fb7d87342'},
       {'id': '4aea36f4-d9a3-4d26-9625-4d6fb7d87342',
        'next': 'be464d1a-3c3f-42a2-915e-8db59805e877'},
       {'id': 'be464d1a-3c3f-42a2-915e-8db59805e877', 'next': None}],
      'sections': [],
      'first_chunk': {'id': '06ed7687-dbe6-4d97-8a5b-2fe5d070d50d',
       'next': '4aea36f4-d9a3-4d26-9625-4d6fb7d87342'}}],
    'first_chunk': {'id': '3de69bd0-387d-4564-b970-bc49ff198e5b',
     'next': 'c4fb77ea-688d-4139

Option 1: use neo4j_haystack driver

In [None]:
from neo4j_haystack.client.neo4j_client import Neo4jClientConfig
from neo4j_haystack.components.neo4j_query_writer import Neo4jQueryWriter

client_config = Neo4jClientConfig("bolt://localhost:7687", database="neo4j", username="neo4j", password="neo4jpass")


writer = Neo4jQueryWriter(client_config=client_config, verify_connectivity=True, runtime_parameters=["hierarchy"])

result = writer.run(
    query=(
        # Create cypher query to create nodes and relationships from hierarchy
    ),
    hierarchy=hierarchy
)

Option 2: custom logic to create graph (seems easier)

In [8]:
from neo4j import GraphDatabase

class Neo4jGraphCreator:
    def __init__(self, uri, user, password):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        self.driver.close()

    def create_graph(self, page_dict):
        with self.driver.session() as session:
            # Create Page node with UUID
            page_uuid = str(uuid.uuid4())
            page_query = """
            MERGE (p:Page {title: $title})
            ON CREATE SET p.uuid = $uuid
            RETURN p
            """
            session.run(page_query, title=page_dict['title'], uuid=page_uuid)

            # Create sections and chunks
            self.create_sections_and_chunks(session, page_uuid, page_dict['sections'])
            self.create_chunks(session, page_uuid, page_dict['chunks'], page_dict.get('first_chunk'))

    def create_sections_and_chunks(self, session, parent_uuid, sections):
        for section in sections:
            section_uuid = str(uuid.uuid4())
            section_labels = f":Section:{section['type']}"
            
            # Query to find either Page or Section as parent
            section_query = f"""
            MATCH (parent {{uuid: $parent_uuid}})
            MERGE (s{section_labels} {{name: $name}})
            ON CREATE SET s.uuid = $uuid
            MERGE (parent)-[:HAS_SECTION]->(s)
            RETURN s
            """
            session.run(section_query, parent_uuid=parent_uuid, name=section['name'], uuid=section_uuid)

            # Recursively create sub-sections and chunks
            self.create_sections_and_chunks(session, section_uuid, section['sections'])
            self.create_chunks(session, section_uuid, section['chunks'], section.get('first_chunk'))

    def create_chunks(self, session, parent_uuid, chunks, first_chunk=None):
        first_chunk_created = False
        
        for i, chunk in enumerate(chunks):
            # Reuse the existing chunk.id as the UUID
            chunk_uuid = chunk['id']
            
            chunk_query = """
            MATCH (parent {uuid: $parent_uuid})
            MERGE (c:Chunk {id: $id})
            ON CREATE SET c.uuid = $uuid
            MERGE (parent)-[:HAS_CHUNK]->(c)
            RETURN c
            """
            session.run(chunk_query, parent_uuid=parent_uuid, id=chunk['id'], uuid=chunk_uuid)

            # Create the HAS_FIRST_CHUNK relationship if first_chunk is specified and not yet created
            if first_chunk and not first_chunk_created and chunk['id'] == first_chunk['id']:
                first_chunk_query = """
                MATCH (parent {uuid: $parent_uuid}), (c:Chunk {id: $chunk_id})
                MERGE (parent)-[:HAS_FIRST_CHUNK]->(c)
                RETURN parent, c
                """
                session.run(first_chunk_query, parent_uuid=parent_uuid, chunk_id=chunk['id'])
                first_chunk_created = True

            # Set the NEXT relationship
            if i < len(chunks) - 1:
                next_chunk_query = """
                MATCH (c1:Chunk {id: $id1}), (c2:Chunk {id: $id2})
                MERGE (c1)-[:NEXT]->(c2)
                RETURN c1, c2
                """
                session.run(next_chunk_query, id1=chunk['id'], id2=chunks[i + 1]['id'])



# # Example usage
# page_dict = {
#     "title": "Dinosaurs",
#     "sections": [
#         {
#             "name": "Introduction",
#             "type": "h2",
#             "chunks": [{"id": "chunk1", "next": "chunk2"}],
#             "sections": [
#                 {
#                     "name": "History",
#                     "type": "h3",
#                     "chunks": [{"id": "chunk2", "next": None}],
#                     "sections": []
#                 }
#             ],
#             "first_chunk": {"id": "chunk1", "next": "chunk2"}
#         }
#     ],
#     "chunks": [],
#     "first_chunk": None
# }

# neo4j_creator = Neo4jGraphCreator("bolt://localhost:7687", "neo4j", "password")
# neo4j_creator.create_graph(page_dict)
# neo4j_creator.close()