In [1]:
from bs4 import BeautifulSoup
import re
from typing import List

def WikiPageChunks(html_str: str) -> List[str]:
    """
    Split the HTML of a wiki page into a flat list of chunks.

    Only the direct children of ``<body>`` (or of the document root when the
    markup has no ``<body>``) are visited, in document order:

    * ``<p>``: whitespace-normalised plain text of the paragraph.
    * ``<link>``: skipped.
    * ``<ul>`` / ``<ol>`` / ``<dl>`` (and stray top-level ``<li>`` / ``<dt>`` /
      ``<dd>``): a single chunk in which every outermost item is rendered as
      ``- <item text>`` and the items are joined with single spaces.
    * any other tag: the tag's raw markup, i.e. ``str(tag)``.

    Args:
        html_str: HTML markup of the page. ``None`` or an empty string yields
            an empty list.

    Returns:
        The chunks as strings. A chunk may be the empty string (for example an
        empty paragraph or an empty list); callers are expected to filter
        those out.
    """
    if not html_str:
        return []

    soup = BeautifulSoup(html_str, 'html.parser')

    chunks = []

    def clean_text(text):
        # `\s` already covers newlines and non-breaking spaces in str patterns,
        # so one substitution collapses every whitespace run to a single space.
        cleaned_text = re.sub(r'\s+', ' ', text)
        # Drop whitespace that separates two digits (e.g. "11 000" -> "11000").
        cleaned_text = re.sub(r'(\d)\s+(\d)', r'\1\2', cleaned_text)
        return cleaned_text.strip()

    html_soup = soup.body or soup
    item_tags = ['li', 'dt', 'dd']
    list_tags = ['ul', 'ol', 'dl'] + item_tags
    for tag in html_soup.find_all(recursive=False):
        if tag.name == 'p':
            chunks.append(clean_text(tag.get_text(separator=' ')))
        elif tag.name == 'link':
            continue
        elif tag.name in list_tags:
            if tag.name in item_tags:
                # A stray item directly under the root is its own single entry.
                items = [tag]
            else:
                # Keep only the outermost items: get_text() on an item already
                # includes the text of any list nested inside it, so emitting
                # nested items as well would duplicate their text.
                items = [
                    item for item in tag.find_all(item_tags)
                    if item.find_parent(item_tags) is None
                ]
            list_text = ' '.join(
                f"- {clean_text(item.get_text(separator=' '))}" for item in items
            )
            chunks.append(list_text)
        else:
            # Headings, tables, divs, ... are kept verbatim as markup.
            chunks.append(str(tag))

    return chunks

In [2]:
from haystack import Document
from haystack import component
import uuid

@component
class WikiPageChunker:
    """
    A Haystack component that splits the content of Wikipedia pages into chunks.

    The content of each incoming document is expected to be HTML fetched via
    wikipediaapi that has been loaded with the TextFileToDocument converter.
    """

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
        """
        Turn every page document into one new Document per non-empty chunk.

        Each chunk document gets a fresh random UUID as its id and the meta keys
        ``source_id`` (id of the page document) and ``split_id`` (0-based
        position among the page's non-empty chunks). ``file_path`` is copied
        from the page document's meta when it is present.

        Args:
            documents: Page documents whose ``content`` holds the page HTML.
                Documents without content are skipped.

        Returns:
            ``{"documents": [...]}`` with the chunk documents of all pages, in
            input order.
        """
        chunks = []
        for doc in documents:
            if not doc.content:
                # Nothing to split (content is None or empty).
                continue

            # Meta shared by every chunk of this page. `file_path` is only set
            # by file-based converters, so do not require it.
            shared_meta = {}
            if "file_path" in doc.meta:
                shared_meta["file_path"] = doc.meta["file_path"]
            shared_meta["source_id"] = doc.id

            non_empty_chunks = (
                chunk for chunk in WikiPageChunks(doc.content) if chunk != ""
            )
            for split_id, chunk in enumerate(non_empty_chunks):
                chunks.append(
                    Document(
                        id=str(uuid.uuid4()),
                        content=chunk,
                        meta={**shared_meta, "split_id": split_id},
                    )
                )

        return {"documents": chunks}

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from pathlib import Path
from haystack.components.converters import TextFileToDocument
from haystack.components.embedders import OpenAIDocumentEmbedder
from haystack_integrations.document_stores.weaviate.document_store import WeaviateDocumentStore
from haystack.components.writers import DocumentWriter
from haystack import Pipeline
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore


# Components: read the HTML file, chunk it, embed the chunks, and write them
# to two document stores (Weaviate on port 8088, Elasticsearch on port 9200).
converter = TextFileToDocument()
splitter = WikiPageChunker()
embedder = OpenAIDocumentEmbedder(model="text-embedding-3-small")
w_store = WeaviateDocumentStore(url="http://localhost:8088")
w_writer = DocumentWriter(document_store=w_store)
e_store = ElasticsearchDocumentStore(hosts= "http://localhost:9200")
e_writer = DocumentWriter(document_store=e_store)

indexing_pipeline = Pipeline()

# Register every component under the name used in connect() and run() below.
indexing_pipeline.add_component("converter", converter)
indexing_pipeline.add_component("splitter", splitter)
indexing_pipeline.add_component("embedder", embedder)
indexing_pipeline.add_component("w_writer", w_writer)
indexing_pipeline.add_component("e_writer", e_writer)

# Wiring: converter -> splitter, then the splitter output fans out to the
# embedder (whose output goes to the Weaviate writer) and directly to the
# Elasticsearch writer.
# NOTE(review): e_writer is fed by the splitter, not the embedder, yet the
# Elasticsearch results recorded further down show an embedding on each
# document. Presumably the embedder mutates the shared Document objects in
# place before e_writer runs; confirm this is intended before reordering
# anything in this cell.
indexing_pipeline.connect("converter", "splitter")
indexing_pipeline.connect("splitter", "embedder")
indexing_pipeline.connect("embedder", "w_writer")
indexing_pipeline.connect("splitter", "e_writer")

# Index a single local HTML file; the path is relative to the working directory.
indexing_pipeline.run(data={"converter": {"sources": [Path("dinosaur-page.html")]}})

  timestamp = datetime.utcnow().replace(tzinfo=tzutc())
  body["sentAt"] = datetime.utcnow().replace(tzinfo=tzutc()).isoformat()
Calculating embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.73s/it]


{'embedder': {'meta': {'model': 'text-embedding-3-small',
   'usage': {'prompt_tokens': 695, 'total_tokens': 695}}},
 'e_writer': {'documents_written': 4},
 'w_writer': {'documents_written': 4}}

In [4]:
# Number of documents now held by the Elasticsearch store.
print(e_store.count_documents())

4


In [5]:
# Number of documents now held by the Weaviate store.
print(w_store.count_documents())

4


Question: Can one indexing pipeline be used to write into two stores?

Answer: Yes.

Question: Can the documents in both stores be correlated? Will they have the same IDs?


In [7]:
from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchBM25Retriever

# BM25 retriever backed by the Elasticsearch store populated above.
eretriever = ElasticsearchBM25Retriever(document_store=e_store)

In [8]:
from haystack_integrations.components.retrievers.weaviate.bm25_retriever import WeaviateBM25Retriever

# BM25 retriever backed by the Weaviate store populated above.
w_retriever = WeaviateBM25Retriever(document_store=w_store)

In [9]:
# Run the same BM25 query with the same result limit against both stores so
# that the returned document ids can be compared side by side.
question = "Dinosaurs"
retrieval_args = {"query": question, "top_k": 4}

elastic_fetched = eretriever.run(**retrieval_args)
weaviate_fetched = w_retriever.run(**retrieval_args)

In [10]:
# Display the Elasticsearch hits (ids, content preview, meta and score).
elastic_fetched

{'documents': [Document(id=d592ed23-4543-4a02-85ea-000d1d408ac3, content: 'Dinosaurs are a diverse group of reptiles of the clade Dinosauria . They first appeared during the T...', meta: {'file_path': 'dinosaur-page.html', 'source_id': 'beae8a4fccec875985e8817cfdb53011adcf64cf1e57ba160f710fdf6641cccb', 'split_id': 0}, score: 0.37987244, embedding: vector of size 1536),
  Document(id=218342f4-2cc2-467f-9ced-4370add023a9, content: 'The first dinosaur fossils were recognized in the early 19th century, with the name "dinosaur" (mean...', meta: {'file_path': 'dinosaur-page.html', 'source_id': 'beae8a4fccec875985e8817cfdb53011adcf64cf1e57ba160f710fdf6641cccb', 'split_id': 3}, score: 0.31545743, embedding: vector of size 1536),
  Document(id=d52a5246-5c61-499d-905b-7be544efb4b5, content: 'While dinosaurs were ancestrally bipedal, many extinct groups included quadrupedal species, and some...', meta: {'file_path': 'dinosaur-page.html', 'source_id': 'beae8a4fccec875985e8817cfdb53011adcf64cf1e57b

In [11]:
# Display the Weaviate hits (ids, content preview, meta and score).
weaviate_fetched

{'documents': [Document(id=ae3a9483-fcfe-4ce5-a818-058e1b1dac5d, content: 'Dinosaurs are varied from taxonomic, morphological and ecological standpoints. Birds, at over 11,000...', meta: {'split_id': 1.0, 'source_id': 'beae8a4fccec875985e8817cfdb53011adcf64cf1e57ba160f710fdf6641cccb', 'file_path': 'dinosaur-page.html'}, score: 0.09117916226387024, embedding: vector of size 1536),
  Document(id=d592ed23-4543-4a02-85ea-000d1d408ac3, content: 'Dinosaurs are a diverse group of reptiles of the clade Dinosauria . They first appeared during the T...', meta: {'source_id': 'beae8a4fccec875985e8817cfdb53011adcf64cf1e57ba160f710fdf6641cccb', 'split_id': 0.0, 'file_path': 'dinosaur-page.html'}, score: 0.09064868092536926, embedding: vector of size 1536),
  Document(id=d52a5246-5c61-499d-905b-7be544efb4b5, content: 'While dinosaurs were ancestrally bipedal, many extinct groups included quadrupedal species, and some...', meta: {'split_id': 2.0, 'source_id': 'beae8a4fccec875985e8817cfdb53011adcf64cf1

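Answer: Yes, documents in the two stores will have the same IDs. For example, `d592ed23-4543-4a02-85ea-000d1d408ac3` and `d52a5246-5c61-499d-905b-7be544efb4b5` appear in both result sets above, because each ID is assigned by the chunker before the documents reach either writer.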