In [1]:
from bs4 import BeautifulSoup
import re
from typing import List

def WikiPageChunks(html_str: str) -> List[str]:
    """
    Split the HTML of a wiki page into a flat list of chunks.

    Only the direct children of ``<body>`` (or of the parsed document itself
    when there is no ``<body>``) are visited. Each child yields at most one
    chunk:

    * ``<p>``: the paragraph's whitespace-normalised text.
    * ``<link>``: skipped.
    * list-like tags (``ul``, ``ol``, ``dl``, ``li``, ``dt``, ``dd``): a single
      string in which every item is rendered as ``- <text>`` and the items are
      joined by spaces.
    * any other tag: its raw HTML, i.e. ``str(tag)``.

    Args:
        html_str: HTML markup to split.

    Returns:
        The chunks in document order. A chunk can be an empty string (for
        example for an empty paragraph or an empty list).
    """
    soup = BeautifulSoup(html_str, 'html.parser')

    def clean_text(text: str) -> str:
        # For str patterns '\s' already matches newlines and non-breaking
        # spaces, so a single substitution normalises all whitespace.
        collapsed = re.sub(r'\s+', ' ', text)
        # Re-join digit groups separated by whitespace ("66 000 000" ->
        # "66000000"). Lookarounds keep the digits out of the match so every
        # gap is removed; a pattern that consumes the digits skips the gap
        # right after a single-digit group ("1 2 3" -> "12 3").
        joined = re.sub(r'(?<=\d)\s+(?=\d)', '', collapsed)
        return joined.strip()

    def list_chunk(tag) -> str:
        # Prefer <li> items; definition lists have none, so fall back to their
        # <dt>/<dd> entries instead of silently producing an empty chunk.
        # NOTE: find_all() is recursive, so items of a nested list show up both
        # inside their parent item's text and as items of their own.
        items = tag.find_all('li') or tag.find_all(['dt', 'dd'])
        if items:
            return ' '.join(
                f"- {clean_text(item.get_text(separator=' '))}" for item in items
            )
        # A bare <li>/<dt>/<dd> (or a list without item tags) is its own item.
        own_text = clean_text(tag.get_text(separator=' '))
        return f"- {own_text}" if own_text else ""

    root = soup.body or soup
    list_like = ('ul', 'ol', 'dl', 'li', 'dt', 'dd')

    chunks: List[str] = []
    for tag in root.find_all(recursive=False):
        if tag.name == 'link':
            continue
        if tag.name == 'p':
            chunks.append(clean_text(tag.get_text(separator=' ')))
        elif tag.name in list_like:
            chunks.append(list_chunk(tag))
        else:
            # Headings, tables, divs, ... are kept as raw markup.
            chunks.append(str(tag))

    return chunks

In [2]:
from haystack import Document
from haystack import component
import uuid

@component
class WikiPageChunker:
    """
    Haystack component that breaks Wikipedia page documents into chunk documents.

    The content of every incoming document is expected to be HTML, fetched via
    wikipediaapi and loaded through the TextFileToDocument converter.
    """

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
        """
        Chunk each document with WikiPageChunks and wrap the non-empty chunks.

        Every produced Document gets a fresh random UUID as its id and carries
        in its meta the source's ``file_path``, the source document id
        (``source_id``) and the chunk's position among the non-empty chunks of
        that source (``split_id``, starting at 0 for each source document).

        Returns:
            A dict with the key ``"documents"`` holding all chunk documents.
        """
        chunk_docs = []
        for source_doc in documents:
            # Empty chunks are dropped before numbering, so split ids stay contiguous.
            non_empty = (
                piece for piece in WikiPageChunks(source_doc.content) if piece != ""
            )
            for split_id, piece in enumerate(non_empty):
                chunk_docs.append(
                    Document(
                        id=str(uuid.uuid4()),
                        content=piece,
                        meta={
                            "file_path": source_doc.meta["file_path"],
                            "source_id": source_doc.id,
                            "split_id": split_id,
                        },
                    )
                )

        return {"documents": chunk_docs}

In [3]:
from pathlib import Path
from haystack.components.converters import TextFileToDocument
from haystack.components.embedders import OpenAIDocumentEmbedder
from haystack_integrations.document_stores.weaviate.document_store import WeaviateDocumentStore
from haystack.components.writers import DocumentWriter
from haystack import Pipeline

# Indexing pipeline: converter -> splitter -> embedder -> writer.
# Each component is registered under its name right after it is created.
indexing_pipeline = Pipeline()

# Converter: receives the source file paths passed to run() below.
converter = TextFileToDocument()
indexing_pipeline.add_component("converter", converter)

# Splitter: the custom WikiPageChunker component.
splitter = WikiPageChunker()
indexing_pipeline.add_component("splitter", splitter)

# Embedder: OpenAI document embeddings.
embedder = OpenAIDocumentEmbedder(model="text-embedding-3-small")
indexing_pipeline.add_component("embedder", embedder)

# Writer: stores the documents in the Weaviate instance at localhost:8088.
document_store = WeaviateDocumentStore(url="http://localhost:8088")
document_writer = DocumentWriter(document_store=document_store)
indexing_pipeline.add_component("writer", document_writer)

# Wire the stages together in processing order.
indexing_pipeline.connect("converter", "splitter")
indexing_pipeline.connect("splitter", "embedder")
indexing_pipeline.connect("embedder", "writer")

# Kept as the last expression so the run summary is shown as the cell output.
indexing_pipeline.run(
    data={"converter": {"sources": [Path("dinosaur-page.html")]}}
)

  from .autonotebook import tqdm as notebook_tqdm
  timestamp = datetime.utcnow().replace(tzinfo=tzutc())
  body["sentAt"] = datetime.utcnow().replace(tzinfo=tzutc()).isoformat()
Calculating embeddings: 100%|██████████| 5/5 [00:06<00:00,  1.27s/it]


{'embedder': {'meta': {'model': 'text-embedding-3-small',
   'usage': {'prompt_tokens': 20925, 'total_tokens': 20925}}},
 'writer': {'documents_written': 135}}

In [4]:
# Check how many documents the store holds after the indexing run.
stored_document_count = document_store.count_documents()
print(stored_document_count)

135
