In [7]:
from pathlib import Path
from haystack.components.converters import TextFileToDocument

converter = TextFileToDocument()

docs = converter.run(sources=[Path("Dinosaur.html")])
print(docs)

{'documents': [Document(id=93000a3fb02b99d2d115cd4042256d2f5db2a0ff3928927ca14465276534a75e, content: '<link
  rel="mw-deduplicated-inline-style"
  href="mw-data:TemplateStyles:r1236090951"
/>
<p class="...', meta: {'file_path': 'Dinosaur.html'})]}


In [8]:
from bs4 import BeautifulSoup
import re

def WikiPageChunks(html_str: str) -> list:
    soup = BeautifulSoup(html_str, 'html.parser')  # Parse the HTML content

    chunks = []

    def clean_text(text):
        cleaned_text = text.replace('\n', ' ').replace('\xa0', ' ')  # Replace newlines and non-breaking spaces with regular spaces
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Replace multiple spaces with a single space
        cleaned_text = re.sub(r'(\d)\s+(\d)', r'\1\2', cleaned_text)  # Remove spaces between digits
        cleaned_text = cleaned_text.strip()  # Remove leading and trailing spaces
        return cleaned_text

    html_soup = soup.body or soup  # Use the body of the HTML if it exists, otherwise use the whole soup
    nested = ['ul', 'ol', 'dl', 'li', 'dt', 'dd']  # Tags that represent nested lists
    for tag in html_soup.find_all(recursive=False):  # Iterate over top-level tags in the HTML
        if tag.name == 'p':
            chunks.append(clean_text(tag.get_text(separator=' ')))  # Clean and add paragraph text to chunks
        elif tag.name == 'link':
            continue  # Skip link tags
        elif tag.name in nested:
            list_items = tag.find_all('li')  # Find all list items
            for li in list_items:
                chunks.append(clean_text(li.get_text(separator=' ')))  # Clean and add each list item text to chunks
        else:
            chunks.append(str(tag))  # Add other tags as strings
    
    return chunks

***NOTE: List items are now independently chunked. A cursory glance at few Dinosaur wiki pages suggests that list items generally have dense, terse information. Chunking separately may benefit retrieval.***

In [9]:
html_content = docs["documents"][0].content

## Custom splitter which captures hierarchial metadata

In [10]:
from haystack import Document
from haystack import component
from typing import List
import uuid

class Category:
    categories: List[Category]
    pages: List[Page]
    
class Page:
    title: str
    sections: List[Section]
    chunks: List[Chunk]
    first_chunk: Chunk
    
class Section:
    name: str
    chunks: List[Chunk]
    sections: List[Section]
    first_chunk: Chunk
    header: str
    
    
class Chunk:
    id: str
    next: str

@component
class WikiPageChunker:
    """
    A component that splits the content of Wikipedia pages into chunks.
    - The document content is expected to be in HTML format fetched via wikipediaapi and
    which has been run through TextFileToDocument converter.
    - Each chunk is a paragraph, list, or table in the Wikipedia page.
    - Each chunk is stored as a separate document with text in 'content' field
    - Each chunk also stores title, h2, h3 etc in meta field.
    - Custom component also creates a hierarchial structure of the chunks based on title, h2, h3 etc.
    """
    @component.output_types(documents=List[Document], hierarchy=dict)
    def run(self, documents: List[Document]):
        chunks = []
        hierarchy = dict()
        for doc in documents:
            page_title = doc.meta["file_path"].replace(".html", "")
            hierarchy[page_title] = dict()
            html_content = doc.content
            page_chunks = WikiPageChunks(html_content)
            i = 0
            current_h2 = ""
            current_h3 = ""
            current_h4 = ""
            # Add more headers if needed

            for chunk in page_chunks:
                if chunk == "":
                    continue
                if chunk.startswith("<h2>"):
                    current_h2 = chunk[4:-5]  # Extract text between <h2> and </h2>
                    current_h3 = ""  # Reset h3 when a new h2 is found
                    current_h4 = ""  # Reset h4 when a new h2 is found
                elif chunk.startswith("<h3>"):
                    current_h3 = chunk[4:-5]  # Extract text between <h3> and </h3>
                    current_h4 = ""  # Reset h4 when a new h3 is found
                elif chunk.startswith("<h4>"):
                    current_h4 = chunk[4:-5]  # Extract text between <h4> and </h4>
                else:
                    meta = {
                        "file_path": doc.meta["file_path"],
                        "source_id": doc.id,
                        "split_id": i,
                        "title": page_title
                    }
                    if current_h2:
                        meta["h2"] = current_h2
                    if current_h3:
                        meta["h3"] = current_h3
                    if current_h4:
                        meta["h4"] = current_h4

                    chunks.append(
                        Document(
                            id=str(uuid.uuid4()),
                            content=chunk,
                            meta=meta
                        )
                    )
                    i += 1
        
        return {"documents": chunks}

In [11]:
splitter = WikiPageChunker()

chunks = splitter.run(docs["documents"])

chunks

{'documents': [Document(id=9646b6cb-7aed-40ce-b41c-78f053c24b4b, content: 'Dinosaurs are a diverse group of reptiles of the clade Dinosauria . They first appeared during the T...', meta: {'file_path': 'Dinosaur.html', 'source_id': '93000a3fb02b99d2d115cd4042256d2f5db2a0ff3928927ca14465276534a75e', 'split_id': 0, 'title': 'Dinosaur'}),
  Document(id=e12fdf34-815b-46cc-a621-91799d6e8ba0, content: 'Dinosaurs are varied from taxonomic, morphological and ecological standpoints. Birds, at over 11,000...', meta: {'file_path': 'Dinosaur.html', 'source_id': '93000a3fb02b99d2d115cd4042256d2f5db2a0ff3928927ca14465276534a75e', 'split_id': 1, 'title': 'Dinosaur'}),
  Document(id=7b33e45e-711e-41bf-b3be-a15a7d4b9270, content: 'While dinosaurs were ancestrally bipedal, many extinct groups included quadrupedal species, and some...', meta: {'file_path': 'Dinosaur.html', 'source_id': '93000a3fb02b99d2d115cd4042256d2f5db2a0ff3928927ca14465276534a75e', 'split_id': 2, 'title': 'Dinosaur'}),
  Document(id=d

Changes from previous chunking
- Each list item now has its own chunk
- Each document is decorated with hierarchical metadata (title, h2, h3, h4 etc.)

Following chunker also attempts to construct the hierarchy dictionary

- Chunker which returns hierarchy as dict:

In [12]:
from typing import List, Dict, Any
import uuid

class Chunk:
    def __init__(self, id: str, next: str = None):
        self.id = id
        self.next = next

class Section:
    def __init__(self, name: str, type: str):
        self.name = name
        self.type = type
        self.chunks = []
        self.sections = []
        self.first_chunk = None

class Page:
    def __init__(self, title: str):
        self.title = title
        self.sections = []
        self.chunks = []
        self.first_chunk = None

@component
class WikiPageChunker:
    """
    A component that splits the content of Wikipedia pages into chunks.
    - The document content is expected to be in HTML format fetched via wikipediaapi and
    which has been run through TextFileToDocument converter.
    - Each chunk is a paragraph, list, or table in the Wikipedia page.
    - Each chunk is stored as a separate document with text in 'content' field
    - Each chunk also stores title, h2, h3 etc in meta field.
    - Custom component also creates a hierarchical structure of the chunks based on title, h2, h3 etc.
    """
    @component.output_types(documents=List[Document], hierarchy=dict)
    def run(self, documents: List[Document]):
        chunks = []
        hierarchy = {}
        
        for doc in documents:
            page_title = doc.meta["file_path"].replace(".html", "")
            page = Page(page_title)
            
            html_content = doc.content
            page_chunks = WikiPageChunks(html_content)
            i = 0
            current_h2 = ""
            current_h3 = ""
            current_h4 = ""
            current_section = None
            current_sub_section = None
            current_sub_sub_section = None

            for chunk in page_chunks:
                if chunk == "":
                    continue
                if chunk.startswith("<h2>"):
                    current_h2 = chunk[4:-5]  # Extract text between <h2> and </h2>
                    current_h3 = ""  # Reset h3 when a new h2 is found
                    current_h4 = ""  # Reset h4 when a new h2 is found
                    current_section = Section(current_h2, "h2")
                    page.sections.append(current_section)
                    current_sub_section = None
                    current_sub_sub_section = None
                elif chunk.startswith("<h3>"):
                    current_h3 = chunk[4:-5]  # Extract text between <h3> and </h3>
                    current_h4 = ""  # Reset h4 when a new h3 is found
                    if current_section:
                        current_sub_section = Section(current_h3, "h3")
                        current_section.sections.append(current_sub_section)
                        current_sub_sub_section = None
                elif chunk.startswith("<h4>"):
                    current_h4 = chunk[4:-5]  # Extract text between <h4> and </h4>
                    if current_sub_section:
                        current_sub_sub_section = Section(current_h4, "h4")
                        current_sub_section.sections.append(current_sub_sub_section)
                else:
                    meta = {
                        "file_path": doc.meta["file_path"],
                        "source_id": doc.id,
                        "split_id": i,
                        "title": "Dinosaurs"
                    }
                    if current_h2:
                        meta["h2"] = current_h2
                    if current_h3:
                        meta["h3"] = current_h3
                    if current_h4:
                        meta["h4"] = current_h4

                    chunk_obj = Chunk(str(uuid.uuid4()))
                    chunks.append(
                        Document(
                            id=chunk_obj.id,
                            content=chunk,
                            meta=meta
                        )
                    )
                    if current_sub_sub_section:
                        current_sub_sub_section.chunks.append(chunk_obj)
                        if current_sub_sub_section.first_chunk is None:
                            current_sub_sub_section.first_chunk = chunk_obj
                    elif current_sub_section:
                        current_sub_section.chunks.append(chunk_obj)
                        if current_sub_section.first_chunk is None:
                            current_sub_section.first_chunk = chunk_obj
                    elif current_section:
                        current_section.chunks.append(chunk_obj)
                        if current_section.first_chunk is None:
                            current_section.first_chunk = chunk_obj
                    else:
                        page.chunks.append(chunk_obj)
                        if page.first_chunk is None:
                            page.first_chunk = chunk_obj
                    i += 1
            
            self.set_next_chunks(page)
            hierarchy[page_title] = self.page_to_dict(page)
        
        return {"documents": chunks, "hierarchy": hierarchy}

    def set_next_chunks(self, page: Page):
        for section in page.sections:
            self.set_next_chunks_in_section(section)
        self.set_next_in_list(page.chunks)

    def set_next_chunks_in_section(self, section: Section):
        for sub_section in section.sections:
            self.set_next_chunks_in_section(sub_section)
        self.set_next_in_list(section.chunks)

    def set_next_in_list(self, chunks: List[Chunk]):
        for i in range(len(chunks) - 1):
            chunks[i].next = chunks[i + 1].id
        if chunks:
            chunks[-1].next = None

    def page_to_dict(self, page: Page) -> Dict[str, Any]:
        return {
            "title": page.title,
            "sections": [self.section_to_dict(section) for section in page.sections],
            "chunks": [self.chunk_to_dict(chunk) for chunk in page.chunks],
            "first_chunk": self.chunk_to_dict(page.first_chunk) if page.first_chunk else None
        }

    def section_to_dict(self, section: Section) -> Dict[str, Any]:
        return {
            "name": section.name,
            "type": section.type,
            "chunks": [self.chunk_to_dict(chunk) for chunk in section.chunks],
            "sections": [self.section_to_dict(sub_section) for sub_section in section.sections],
            "first_chunk": self.chunk_to_dict(section.first_chunk) if section.first_chunk else None
        }

    def chunk_to_dict(self, chunk: Chunk) -> Dict[str, Any]:
        return {
            "id": chunk.id,
            "next": chunk.next
        }

In [13]:
splitter = WikiPageChunker()

chunk_result = splitter.run(docs["documents"])

chunk_result["documents"]

[Document(id=63a5c09a-fbf7-492d-8d34-24f47494ad16, content: 'Dinosaurs are a diverse group of reptiles of the clade Dinosauria . They first appeared during the T...', meta: {'file_path': 'Dinosaur.html', 'source_id': '93000a3fb02b99d2d115cd4042256d2f5db2a0ff3928927ca14465276534a75e', 'split_id': 0, 'title': 'Dinosaurs'}),
 Document(id=b37e6cba-5dce-4028-b185-78d81089cc09, content: 'Dinosaurs are varied from taxonomic, morphological and ecological standpoints. Birds, at over 11,000...', meta: {'file_path': 'Dinosaur.html', 'source_id': '93000a3fb02b99d2d115cd4042256d2f5db2a0ff3928927ca14465276534a75e', 'split_id': 1, 'title': 'Dinosaurs'}),
 Document(id=e9f55fc2-e0cc-4c98-8241-e11993d19eae, content: 'While dinosaurs were ancestrally bipedal, many extinct groups included quadrupedal species, and some...', meta: {'file_path': 'Dinosaur.html', 'source_id': '93000a3fb02b99d2d115cd4042256d2f5db2a0ff3928927ca14465276534a75e', 'split_id': 2, 'title': 'Dinosaurs'}),
 Document(id=2f8a95c1-558f-4

In [14]:
chunk_result["hierarchy"]

{'Dinosaur': {'title': 'Dinosaur',
  'sections': [{'name': 'Definition',
    'type': 'h2',
    'chunks': [{'id': '7f510ce9-b79f-426e-8119-8b5979bcba67',
      'next': '92e3e386-1b34-4152-8919-bc9adc4ec4e7'},
     {'id': '92e3e386-1b34-4152-8919-bc9adc4ec4e7',
      'next': 'c2882d5b-c557-4bcf-a14c-d766839f90c5'},
     {'id': 'c2882d5b-c557-4bcf-a14c-d766839f90c5', 'next': None}],
    'sections': [{'name': 'General description',
      'type': 'h3',
      'chunks': [{'id': '6d7ed4e2-ab1c-459f-a9f0-f49b0c6fd79d',
        'next': '867ca509-30ab-4bdf-b74a-a2bf892b1cac'},
       {'id': '867ca509-30ab-4bdf-b74a-a2bf892b1cac',
        'next': '375874f8-3906-41be-873c-00f93b29f4dd'},
       {'id': '375874f8-3906-41be-873c-00f93b29f4dd', 'next': None}],
      'sections': [],
      'first_chunk': {'id': '6d7ed4e2-ab1c-459f-a9f0-f49b0c6fd79d',
       'next': '867ca509-30ab-4bdf-b74a-a2bf892b1cac'}},
     {'name': 'Distinguishing anatomical features',
      'type': 'h3',
      'chunks': [{'id': 'db