### Embed and store
- Convert the fetched HTML of the wiki page into a Document using TextFileToDocument
- Run the document through the custom WikiPageChunker component
- Embed the chunks using OpenAIDocumentEmbedder
- Store the embedded chunks using WeaviateDocumentStore

Helpers


In [18]:
from bs4 import BeautifulSoup
import re
from typing import List

def WikiPageChunks(html_str: str) -> List[str]:
    """
    Split the HTML of a wiki page into a flat list of text chunks.

    Only the direct children of ``<body>`` (or of the parsed root when the
    markup has no ``<body>``) are visited; each child yields at most one chunk:

    - ``<p>``: its text with whitespace normalised.
    - ``<link>``: skipped.
    - ``<ul>`` / ``<ol>``: every ``<li>`` descendant rendered as ``- text``,
      joined with single spaces.
    - ``<dl>``: every ``<dt>`` / ``<dd>`` descendant rendered the same way.
    - a bare ``<li>`` / ``<dt>`` / ``<dd>``: rendered as a single ``- text``.
    - any other tag: the tag's raw markup (``str(tag)``).

    Args:
        html_str: HTML markup of the page. ``None`` or an empty string yields
            an empty list.

    Returns:
        The chunks in document order. A chunk may be an empty string (for
        example an empty paragraph or a list without items); callers are
        expected to filter those out.
    """
    # Nothing to parse; also keeps BeautifulSoup from being handed None.
    if not html_str:
        return []

    soup = BeautifulSoup(html_str, 'html.parser')

    chunks = []

    def clean_text(text):
        cleaned_text = text.replace('\n', ' ').replace('\xa0', ' ')
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
        # Glue digits that are separated only by whitespace ("11 000" -> "11000").
        cleaned_text = re.sub(r'(\d)\s+(\d)', r'\1\2', cleaned_text)
        cleaned_text = cleaned_text.strip()
        return cleaned_text

    def as_bullets(items):
        # One "- text" entry per item, joined into a single chunk.
        return ' '.join(f"- {clean_text(item.get_text(separator=' '))}" for item in items)

    html_soup = soup.body or soup
    for tag in html_soup.find_all(recursive=False):
        if tag.name == 'p':
            chunks.append(clean_text(tag.get_text(separator=' ')))
        elif tag.name == 'link':
            continue
        elif tag.name in ('ul', 'ol'):
            # Includes <li> of nested lists, so nested text appears both in
            # its parent item and as an item of its own.
            chunks.append(as_bullets(tag.find_all('li')))
        elif tag.name == 'dl':
            # Definition lists carry their text in <dt>/<dd>, not <li>;
            # looking only for <li> here would drop the whole list.
            chunks.append(as_bullets(tag.find_all(['dt', 'dd'])))
        elif tag.name in ('li', 'dt', 'dd'):
            # A stray item outside its container: the tag itself is the item.
            chunks.append(as_bullets([tag]))
        else:
            chunks.append(str(tag))

    return chunks

In [19]:
from haystack import Document
from haystack import component
import uuid

@component
class WikiPageChunker:
    """
    Haystack component that breaks Wikipedia page documents into chunk documents.

    Each incoming document's content is expected to be the page HTML (fetched via
    wikipediaapi and loaded through the TextFileToDocument converter). The HTML is
    split with ``WikiPageChunks`` and every non-empty chunk becomes a new Document.
    """
    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
        """
        Chunk every document and return the chunks under the ``documents`` key.

        Each chunk document gets a random UUID4 id and meta holding the source
        ``file_path``, the source document id (``source_id``) and ``split_id``.
        """
        chunk_documents = []
        for source_doc in documents:
            for position, piece in enumerate(WikiPageChunks(source_doc.content)):
                if piece == "":
                    continue
                # split_id is the index among ALL chunks of the page, so it
                # skips a number wherever an empty chunk was dropped.
                chunk_meta = {
                    "file_path": source_doc.meta["file_path"],
                    "source_id": source_doc.id,
                    "split_id": position,
                }
                chunk_documents.append(
                    Document(id=str(uuid.uuid4()), content=piece, meta=chunk_meta)
                )

        return {"documents": chunk_documents}

### Pipeline: text to embeddings - OpenAIDocumentEmbedder

In [23]:
from pathlib import Path
from haystack.components.converters import TextFileToDocument
from haystack.components.embedders import OpenAIDocumentEmbedder
from haystack import Pipeline

# Instantiate the three stages: file -> Document -> chunk Documents -> embedded chunk Documents.
converter = TextFileToDocument()
splitter = WikiPageChunker()
# NOTE(review): presumably needs OpenAI credentials configured in the environment — confirm.
embedder = OpenAIDocumentEmbedder(model="text-embedding-3-small")

embedding_pipeline = Pipeline()

# Register each stage under the name used by connect() and by the run() input/output dicts.
embedding_pipeline.add_component("converter", converter)
embedding_pipeline.add_component("splitter", splitter)
embedding_pipeline.add_component("embedder", embedder)

# Wire the stages in order: converter -> splitter -> embedder.
embedding_pipeline.connect("converter", "splitter")
embedding_pipeline.connect("splitter", "embedder")

# Feed the saved wiki page (path relative to the working directory) to the converter.
# The result is keyed by component name; the embedded chunks are under ["embedder"]["documents"].
embedded_chunks = embedding_pipeline.run(data={"converter": {"sources": [Path("dinosaur-page.html")]}})

# Display the full pipeline output for inspection.
embedded_chunks

Calculating embeddings: 100%|██████████| 5/5 [00:05<00:00,  1.11s/it]


{'embedder': {'documents': [Document(id=d8461939-6728-4732-b84c-0b0bde1bd6ca, content: 'Dinosaurs are a diverse group of reptiles of the clade Dinosauria . They first appeared during the T...', meta: {'file_path': 'dinosaur-page.html', 'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'split_id': 1}, embedding: vector of size 1536),
   Document(id=578a7f2d-86ce-4143-90c2-ba2952db4693, content: 'Dinosaurs are varied from taxonomic, morphological and ecological standpoints. Birds, at over 11,000...', meta: {'file_path': 'dinosaur-page.html', 'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'split_id': 2}, embedding: vector of size 1536),
   Document(id=a9230f7b-113d-4fbf-beb6-261a88936aa4, content: 'While dinosaurs were ancestrally bipedal, many extinct groups included quadrupedal species, and some...', meta: {'file_path': 'dinosaur-page.html', 'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', '

### Save embeddings to document store - WeaviateDocumentStore

In [24]:
from haystack_integrations.document_stores.weaviate.document_store import WeaviateDocumentStore
from haystack import Document

# Document store pointing at the Weaviate instance at http://localhost:8088.
# NOTE(review): assumes a Weaviate server is already listening on that port — confirm before running.
document_store = WeaviateDocumentStore(url="http://localhost:8088")



In [25]:
# Persist the embedded chunks from the "embedder" stage; the recorded run shows 135 as this call's result.
document_store.write_documents(embedded_chunks["embedder"]["documents"])

135

In [26]:
# Sanity check: number of documents now in the store (135 in the recorded run, matching the write above).
print(document_store.count_documents())

135
