In [4]:
from pathlib import Path
from haystack.components.converters import TextFileToDocument

# Converter that loads each source file into a Document.
converter = TextFileToDocument()

# Convert the saved page. The result is a dict with a 'documents' list; as the
# printed output shows, the Document content is the raw HTML markup of the file.
docs = converter.run(sources=[Path("dinosaur-page.html")])
print(docs)

{'documents': [Document(id=cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32, content: '<link
  rel="mw-deduplicated-inline-style"
  href="mw-data:TemplateStyles:r1236090951"
/>
<p class="...', meta: {'file_path': 'dinosaur-page.html'})]}


In [5]:
from bs4 import BeautifulSoup
import re

def WikiPageChunks(html_str: str) -> list:
    """
    Split an HTML fragment into a flat list of string chunks.

    Only the direct children of ``<body>`` (or of the parsed root when the
    fragment has no ``<body>``) are visited, in document order:

    - ``<p>``: the paragraph's cleaned plain text.
    - ``<link>``: skipped.
    - ``<ul>`` / ``<ol>`` / ``<dl>`` (and stray top-level ``<li>`` / ``<dt>`` /
      ``<dd>``): a single chunk in which every outermost item is rendered as
      ``"- <cleaned text>"`` and the items are joined with single spaces.
      Nested items are part of their parent item's text and are not repeated.
    - anything else (headings, tables, ...): the raw HTML, via ``str(tag)``.

    Args:
        html_str: HTML markup to split. ``None`` or ``""`` yields ``[]``.

    Returns:
        List of chunk strings. Entries may be empty strings (e.g. an empty
        paragraph); callers are expected to skip those.
    """
    if not html_str:
        # Nothing to parse; avoids a TypeError from BeautifulSoup on None.
        return []

    soup = BeautifulSoup(html_str, 'html.parser')

    item_tags = ['li', 'dt', 'dd']
    list_tags = ['ul', 'ol', 'dl'] + item_tags

    def clean_text(text):
        # \s already covers newlines and non-breaking spaces (\xa0) for str
        # patterns, so one substitution collapses every whitespace run.
        cleaned_text = re.sub(r'\s+', ' ', text)
        # Glue digits that are separated only by whitespace ("11 000" -> "11000").
        # NOTE(review): this also merges two unrelated adjacent numbers
        # ("1990 2000" -> "19902000") - confirm that is acceptable.
        cleaned_text = re.sub(r'(\d)\s+(\d)', r'\1\2', cleaned_text)
        return cleaned_text.strip()

    def outermost_items(tag):
        # A stray top-level item is its own (single) list entry.
        if tag.name in item_tags:
            return [tag]
        # Keep only items that are not nested inside another item: a nested
        # item's text is already included in its parent's get_text().
        return [
            item for item in tag.find_all(item_tags)
            if item.find_parent(item_tags) is None
        ]

    chunks = []
    html_soup = soup.body or soup
    for tag in html_soup.find_all(recursive=False):
        if tag.name == 'p':
            chunks.append(clean_text(tag.get_text(separator=' ')))
        elif tag.name == 'link':
            continue
        elif tag.name in list_tags:
            list_text = ' '.join(
                f"- {clean_text(item.get_text(separator=' '))}"
                for item in outermost_items(tag)
            )
            chunks.append(list_text)
        else:
            # Headings, tables, etc. are passed through as raw HTML so that
            # downstream code can recognise them by their tag.
            chunks.append(str(tag))

    return chunks

In [6]:
# Raw HTML string held by the first converted document.
html_content = docs["documents"][0].content

## Custom splitter which captures hierarchical metadata

In [7]:
from haystack import Document
from haystack import component
from typing import List
import uuid

@component
class WikiPageChunker:
    """
    A component that splits the content of Wikipedia pages into chunks.
    - The document content is expected to be in HTML format fetched via wikipediaapi and
    which has been run through TextFileToDocument converter.
    - Each chunk is a paragraph, list, or table in the Wikipedia page.
    - Each chunk is stored as a separate document with text in 'content' field
    - Each chunk also stores title, h2, h3 etc in meta field.
    - Custom component also creates a hierarchical structure of the chunks based on title, h2, h3 etc.

    Args:
        title: Page title written to every chunk's ``meta["title"]`` when the
            source document does not carry its own ``"title"`` meta entry.
    """

    # A chunk that consists of exactly one <h2>/<h3>/<h4> element. Attributes on
    # the opening tag are allowed; group 1 is the level, group 2 the inner HTML.
    _HEADING_RE = re.compile(
        r"^\s*<h([2-4])\b[^>]*>(.*)</h\1\s*>\s*$", re.IGNORECASE | re.DOTALL
    )
    # Any HTML tag; used to drop inline markup inside a heading.
    _TAG_RE = re.compile(r"<[^>]+>")
    # Heading levels tracked as hierarchy metadata. Add more levels if needed
    # (and widen _HEADING_RE accordingly).
    _LEVELS = (2, 3, 4)

    def __init__(self, title: str = "Dinosaurs"):
        self.title = title

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
        """
        Split every input document into one Document per content chunk.

        Heading chunks (h2-h4) are not emitted; they update the current section
        path, which is attached to each following chunk as ``meta["h2"]``,
        ``meta["h3"]`` and ``meta["h4"]`` (only the levels currently set).
        A new heading clears all deeper levels.

        Args:
            documents: Documents whose ``content`` is HTML. Documents with no
                content are skipped.

        Returns:
            ``{"documents": [...]}`` with the chunk Documents. Each has a random
            UUID4 id and meta keys ``file_path``, ``source_id``, ``split_id``
            (position within its source document) and ``title``.
        """
        chunks = []
        for doc in documents:
            if not doc.content:
                # Nothing to split (content is None or empty).
                continue

            doc_meta = doc.meta or {}
            page_title = doc_meta.get("title", self.title)
            headings = {level: "" for level in self._LEVELS}
            split_id = 0

            for chunk in WikiPageChunks(doc.content):
                if not chunk:
                    continue

                heading = self._HEADING_RE.match(chunk)
                if heading:
                    level = int(heading.group(1))
                    # Strip inline markup and normalise whitespace in the title.
                    text = " ".join(self._TAG_RE.sub("", heading.group(2)).split())
                    headings[level] = text
                    # A new section invalidates every deeper heading level.
                    for deeper in self._LEVELS:
                        if deeper > level:
                            headings[deeper] = ""
                    continue

                meta = {
                    "file_path": doc_meta.get("file_path"),
                    "source_id": doc.id,
                    "split_id": split_id,
                    "title": page_title,
                }
                for level in self._LEVELS:
                    if headings[level]:
                        meta[f"h{level}"] = headings[level]

                chunks.append(
                    Document(
                        id=str(uuid.uuid4()),
                        content=chunk,
                        meta=meta,
                    )
                )
                split_id += 1

        return {"documents": chunks}

In [8]:
# Instantiate the custom chunker component.
splitter = WikiPageChunker()

# Call run() directly on the converted documents; it returns a dict with a
# 'documents' list holding one Document per chunk.
chunks = splitter.run(docs["documents"])

# Bare last expression so the notebook displays the result.
chunks

{'documents': [Document(id=4a0a9cc6-2594-4c9d-a5f5-b1e078005bdf, content: 'Dinosaurs are a diverse group of reptiles of the clade Dinosauria . They first appeared during the T...', meta: {'file_path': 'dinosaur-page.html', 'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'split_id': 0, 'title': 'Dinosaurs'}),
  Document(id=356b86c0-1d0a-44a7-9cb2-11e6fe29dfa3, content: 'Dinosaurs are varied from taxonomic, morphological and ecological standpoints. Birds, at over 11,000...', meta: {'file_path': 'dinosaur-page.html', 'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'split_id': 1, 'title': 'Dinosaurs'}),
  Document(id=07b0a134-911c-4697-8b17-be8f26c71161, content: 'While dinosaurs were ancestrally bipedal, many extinct groups included quadrupedal species, and some...', meta: {'file_path': 'dinosaur-page.html', 'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'split_id': 2, 'title': 'Dinosaurs'}