In [1]:
from bs4 import BeautifulSoup
import re
from typing import List

def WikiPageChunks(html_str: str) -> List:
    """
    Split the HTML of a Wikipedia page into a flat list of chunks.

    Only the direct children of ``<body>`` (or of the document root when the
    markup has no ``<body>``) are visited, in document order:

    * ``<p>``: one chunk with the cleaned paragraph text.
    * ``<link>``: skipped.
    * ``<ul>`` / ``<ol>`` / ``<dl>``: one chunk in which every direct item
      (``<li>``, or ``<dt>``/``<dd>`` for definition lists) is rendered as
      ``"- <text>"`` and the items are joined with single spaces. Text of a
      nested list stays inside its parent item, so it is emitted once.
    * a stray top-level ``<li>`` / ``<dt>`` / ``<dd>``: a single-item list.
    * any other tag: its raw HTML (``str(tag)``), unchanged.

    Args:
        html_str: HTML markup of the page. ``""`` or ``None`` yields ``[]``.

    Returns:
        A list of strings. Individual chunks can be empty strings (for
        example an empty paragraph); callers are expected to filter those.
    """
    if not html_str:
        return []

    soup = BeautifulSoup(html_str, 'html.parser')

    def clean_text(text):
        # \s already matches '\n' and '\xa0', so one pass collapses them all.
        cleaned_text = re.sub(r'\s+', ' ', text)
        # Join digit groups separated by whitespace. Lookarounds leave the
        # digits unconsumed, so "1 000 000" becomes "1000000" rather than
        # stopping after the first pair ("1000 000").
        cleaned_text = re.sub(r'(?<=\d)\s+(?=\d)', '', cleaned_text)
        return cleaned_text.strip()

    def text_of(node):
        return clean_text(node.get_text(separator=' '))

    # Which child tags count as "items" for each kind of list container.
    item_tags = {'ul': ['li'], 'ol': ['li'], 'dl': ['dt', 'dd']}
    lone_items = ('li', 'dt', 'dd')

    chunks = []
    html_soup = soup.body or soup
    for tag in html_soup.find_all(recursive=False):
        if tag.name == 'link':
            continue
        if tag.name == 'p':
            chunks.append(text_of(tag))
        elif tag.name in item_tags:
            wanted = item_tags[tag.name]
            # Direct children only: a nested list's text is already part of
            # its parent item's text. Fall back to a deep search when items
            # are wrapped in another element (e.g. <dl><div><dt>...).
            items = tag.find_all(wanted, recursive=False) or tag.find_all(wanted)
            item_texts = [text_of(item) for item in items]
            chunks.append(' '.join(f"- {text}" for text in item_texts if text))
        elif tag.name in lone_items:
            text = text_of(tag)
            chunks.append(f"- {text}" if text else "")
        else:
            chunks.append(str(tag))

    return chunks

In [2]:
from haystack import Document
from haystack import component
import uuid

@component
class WikiPageChunker:
    """
    Haystack component that breaks Wikipedia pages into chunk documents.

    The content of every incoming document is expected to be HTML that was
    fetched via wikipediaapi and loaded with the TextFileToDocument converter.
    """
    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
        """
        Chunk each document with ``WikiPageChunks`` and wrap the pieces.

        Every non-empty chunk becomes a new ``Document`` with a random UUID
        as id. Its meta carries the source document's ``file_path`` and id
        (``source_id``) plus a per-source running ``split_id``.

        Returns:
            ``{"documents": [...]}`` with the chunk documents of all inputs,
            in input order.
        """
        split_docs = []
        for source in documents:
            # Empty chunks are discarded before numbering, so split_id values
            # are consecutive within one source document, starting at 0.
            kept = (piece for piece in WikiPageChunks(source.content) if piece != "")
            for split_id, piece in enumerate(kept):
                split_docs.append(
                    Document(
                        id=str(uuid.uuid4()),
                        content=piece,
                        meta={
                            "file_path": source.meta["file_path"],
                            "source_id": source.id,
                            "split_id": split_id,
                        },
                    )
                )

        return {"documents": split_docs}

In [12]:
from pathlib import Path
from haystack.components.converters import TextFileToDocument
from haystack import Pipeline
from haystack import Document
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
from elasticsearch import Elasticsearch

# Pipeline stages: read the HTML file into a Document, then split it into chunks.
converter = TextFileToDocument()
splitter = WikiPageChunker()

# Created here but not wired into the pipeline; a later cell writes to it directly.
document_store = ElasticsearchDocumentStore(hosts="http://localhost:9200")

indexing_pipeline = Pipeline()
indexing_pipeline.add_component("converter", converter)
indexing_pipeline.add_component("splitter", splitter)
indexing_pipeline.connect("converter", "splitter")

# The result is keyed by component name; the chunks are under docs["splitter"]["documents"].
docs = indexing_pipeline.run(
    data={
        "converter": {"sources": [Path("../dinosaur-page.html")]},
    }
)

In [13]:
docs

{'splitter': {'documents': [Document(id=d1c8d5ba-453c-43bb-ba24-4e3f651d848a, content: 'Dinosaurs are a diverse group of reptiles of the clade Dinosauria . They first appeared during the T...', meta: {'file_path': '../dinosaur-page.html', 'source_id': '397c39cd02b4ddec650906e433b995103ffd8164879c3fa3cc6a9956405e652f', 'split_id': 0}),
   Document(id=119324df-f9dd-4b8d-b1a6-69adc7be107d, content: 'Dinosaurs are varied from taxonomic, morphological and ecological standpoints. Birds, at over 11,000...', meta: {'file_path': '../dinosaur-page.html', 'source_id': '397c39cd02b4ddec650906e433b995103ffd8164879c3fa3cc6a9956405e652f', 'split_id': 1}),
   Document(id=e4c39230-4147-426b-8c07-6392e3af3140, content: 'While dinosaurs were ancestrally bipedal, many extinct groups included quadrupedal species, and some...', meta: {'file_path': '../dinosaur-page.html', 'source_id': '397c39cd02b4ddec650906e433b995103ffd8164879c3fa3cc6a9956405e652f', 'split_id': 2}),
   Document(id=9d539a78-d8ea-4f09-8e58-

In [14]:
# Pull the chunk documents out of the splitter's pipeline output.
documents = docs["splitter"]["documents"]

In [16]:
# Index the chunks in Elasticsearch; the displayed value is the number of documents written.
document_store.write_documents(documents=documents)

135

In [18]:
from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchBM25Retriever

# BM25 retriever over the same Elasticsearch store the chunks were written to.
retriever = ElasticsearchBM25Retriever(document_store=document_store)

In [23]:
# Spot check: top_k=1 asks the retriever for only the best-scoring chunk.
question = "What is Mamenchisaurus?"

fetched = retriever.run(query=question, top_k=1)

In [24]:
fetched

{'documents': [Document(id=3f78f1ee-610f-4e7d-8c19-78cb1c1e365f, content: 'Dinosaur evolution after the Triassic followed changes in vegetation and the location of continents....', meta: {'file_path': '../dinosaur-page.html', 'source_id': '397c39cd02b4ddec650906e433b995103ffd8164879c3fa3cc6a9956405e652f', 'split_id': 44}, score: 3.050006)]}

Found the correct chunk

In [25]:
# Spot check: top_k=1 asks the retriever for only the best-scoring chunk.
question = "What is Averostra?"

fetched = retriever.run(query=question, top_k=1)
fetched

{'documents': [Document(id=1e8ed23c-c7b0-4eeb-aec9-8a6654748d1d, content: 'The first potential evidence for herding or flocking as a widespread behavior common to many dinosau...', meta: {'file_path': '../dinosaur-page.html', 'source_id': '397c39cd02b4ddec650906e433b995103ffd8164879c3fa3cc6a9956405e652f', 'split_id': 76}, score: 2.750564)]}

Wrong chunk! The top hit is a passage about herding/flocking behavior, not one that explains what Averostra is.

In [26]:
# Spot check: top_k=1 asks the retriever for only the best-scoring chunk.
question = "What is Oryctodromeus?"

fetched = retriever.run(query=question, top_k=1)
fetched

{'documents': [Document(id=b0beaf46-b54e-4247-a61e-8ab834022788, content: 'Based on fossil evidence from dinosaurs such as Oryctodromeus , some ornithischian species seem to h...', meta: {'file_path': '../dinosaur-page.html', 'source_id': '397c39cd02b4ddec650906e433b995103ffd8164879c3fa3cc6a9956405e652f', 'split_id': 80}, score: 4.9544535)]}

Correct chunk

In [27]:
# Spot check: top_k=1 asks the retriever for only the best-scoring chunk.
question = "Who is Heilmann?"

fetched = retriever.run(query=question, top_k=1)
fetched

{'documents': [Document(id=11ffe376-8ca3-437b-a8fd-97877f74d834, content: 'The possibility that dinosaurs were the ancestors of birds was first suggested in 1868 by Thomas Hen...', meta: {'file_path': '../dinosaur-page.html', 'source_id': '397c39cd02b4ddec650906e433b995103ffd8164879c3fa3cc6a9956405e652f', 'split_id': 99}, score: 5.110334)]}

Correct chunk

In [28]:
# Spot check: top_k=1 asks the retriever for only the best-scoring chunk.
question = "What is Compsognathus?"

fetched = retriever.run(query=question, top_k=1)
fetched

{'documents': [Document(id=23c1fe3c-2bea-4277-a32f-f02893c3fe3e, content: 'Archaeopteryx was the first fossil found that revealed a potential connection between dinosaurs and ...', meta: {'file_path': '../dinosaur-page.html', 'source_id': '397c39cd02b4ddec650906e433b995103ffd8164879c3fa3cc6a9956405e652f', 'split_id': 103}, score: 5.787794)]}