In [1]:
from haystack.components.converters import TextFileToDocument
from haystack.components.embedders import OpenAIDocumentEmbedder
from haystack_integrations.document_stores.weaviate.document_store import WeaviateDocumentStore
from haystack.components.writers import DocumentWriter
from haystack import Pipeline
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def index_wiki_data(category: str, filepath: str, category_pages_indexed: dict) -> int:
    """
    Index already-chunked wiki data for every page in a category and its subcategories.

    The chunked data is available in the .metadata/chunk directory. Intermediate embeddings are stored in the
    .metadata/index/embeddings directory.

    A list of Haystack Document objects is created from the stored chunks and written to three databases:
    - ElasticsearchDocumentStore: for full-text search (the Document objects are stored without embeddings)
    - WeaviateDocumentStore: for vector search (the Document objects are stored enriched with embeddings)
    - Neo4j: for graph search (the Document objects are stored as Chunk nodes; Section, Page and Category nodes
      are created to represent the structure of the data)

    Args:
        category: The category whose pages and subcategories are indexed.
        filepath: NOTE(review): not described in the original notes -- presumably the location of the
            category's data on disk; confirm against the caller.
        category_pages_indexed: NOTE(review): not described in the original notes -- presumably tracks which
            pages/categories have already been indexed. The plan below mentions a redis set for that purpose;
            confirm how the two relate.

    Returns:
        NOTE(review): annotated as int, but the original notes do not say what the number represents -- confirm.

    Pseudocode (the intended algorithm):
    - get pages and subcategories from the title_pathname map
    - create pages_filename_set and categories_dirname_set

    - for each page in pages:
        - if page is not already indexed (check in redis set):
        - if page not in pages_filename_set:
            - call get_chunks to get the list of Document objects from the chunks
            - store the documents in ElasticsearchDocumentStore
              (DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP))
            - call get_embedded_documents to get the list of Document objects with embeddings
            - store the documents in WeaviateDocumentStore
              (DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP))
            - call create_graph to create the graph representation of the data in Neo4j -- ensure duplicate
              nodes and edges are not created
            - add page to the redis set

    - for each subcategory in categories:
        - if subcategory is not already indexed (check in redis set):
        - if subcategory not in categories_dirname_set:
            - call index_wiki_data recursively on the subcategory
            - add subcategory to the redis set

    NOTE(review): in both loops the two "if" checks are listed at the same level; whether both must hold or the
    second replaces the first is not stated -- confirm before implementing.
    """