### Imports

In [1]:
import sys
import os

# Add the parent directory of 'wiki' to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import json
import redis
from typing import List, Dict
from pathlib import Path
from haystack import Document
from typing import Tuple
from haystack.document_stores.types import DuplicatePolicy
from wiki.lib.index.graph.page_graph_creator import Neo4jPageGraphCreator
from wiki.lib.index.graph.category_graph_creator import Neo4jCategoryGraphCreator
from haystack.components.embedders import OpenAIDocumentEmbedder
from haystack_integrations.document_stores.weaviate.document_store import WeaviateDocumentStore
from haystack.components.writers import DocumentWriter
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore


  from .autonotebook import tqdm as notebook_tqdm


### Shared clients (Redis, embedder, document stores)

In [2]:
# Redis holds the "indexed_pages" / "indexed_categories" sets used to skip work that is already done.
r = redis.Redis(host="localhost", port=6379, db=0)

# Embedder used to enrich documents with vectors before they are written to Weaviate.
embedder = OpenAIDocumentEmbedder(model="text-embedding-3-small")

# Weaviate: receives the embedded documents.
w_store = WeaviateDocumentStore(url="http://localhost:8088")
w_writer = DocumentWriter(document_store=w_store, policy=DuplicatePolicy.SKIP)

# Elasticsearch: receives the plain (non-embedded) documents.
e_store = ElasticsearchDocumentStore(hosts="http://localhost:9200")
e_writer = DocumentWriter(document_store=e_store, policy=DuplicatePolicy.SKIP)


In [14]:
def get_title_pathname_map(filepath: str) -> dict:
    """
    Read the title_pathname map of a category directory.

    The map is loaded from ``<filepath>/.metadata/download/title_pathname.json``.

    Args:
        filepath: Directory of the category whose map should be read.

    Returns:
        The parsed JSON content of the file. Callers in this notebook read its
        "pages" and "categories" keys.

    Raises:
        FileNotFoundError: If the title_pathname.json file does not exist.
    """
    title_pathname_filepath = os.path.join(filepath, ".metadata/download/title_pathname.json")
    if not os.path.exists(title_pathname_filepath):
        raise FileNotFoundError(f"The file '{title_pathname_filepath}' does not exist.")

    # Read as UTF-8 explicitly so the result does not depend on the platform's default encoding.
    with open(title_pathname_filepath, "r", encoding="utf-8") as file:
        return json.load(file)

def get_documents_and_page_hierarchy(filepath: str, page_title: str, page_filename: str) -> Tuple[List[Document], dict]:
    """
    Load the chunked documents and the hierarchy of a single page.

    The data is read from ``<filepath>/.metadata/chunk/<page_filename without extension>.json``.

    Args:
        filepath: Directory of the category the page belongs to.
        page_title: Title of the page; used as the key into the stored hierarchy.
        page_filename: File name of the page (its extension is replaced by ``.json``).

    Returns:
        A tuple ``(documents, hierarchy)`` where ``documents`` are the stored chunks converted
        to Haystack ``Document`` objects and ``hierarchy`` is the entry stored for ``page_title``.

    Raises:
        FileNotFoundError: If the chunk file does not exist.
        KeyError: If "splitter", "documents", "hierarchy" or the ``page_title`` entry is missing.
    """
    page_filename_wo_ext = os.path.splitext(page_filename)[0]
    chunk_filepath = os.path.join(filepath, ".metadata/chunk", f"{page_filename_wo_ext}.json")
    if not os.path.exists(chunk_filepath):
        raise FileNotFoundError(f"The file '{chunk_filepath}' does not exist.")

    # Read as UTF-8 explicitly so the result does not depend on the platform's default encoding.
    with open(chunk_filepath, "r", encoding="utf-8") as file:
        data = json.load(file)

    if "splitter" not in data:
        raise KeyError(f"The 'splitter' key is missing in the chunk file {chunk_filepath}.")
    splitter = data["splitter"]
    if "documents" not in splitter:
        raise KeyError(f"The 'documents' key is missing in the 'splitter' key in the chunk file {chunk_filepath}.")
    if "hierarchy" not in splitter:
        raise KeyError(f"The 'hierarchy' key is missing in the 'splitter' key in the chunk file {chunk_filepath}.")
    # Report a missing page entry as clearly as the other missing keys instead of a bare KeyError.
    if page_title not in splitter["hierarchy"]:
        raise KeyError(
            f"The page title '{page_title}' is missing in the 'hierarchy' key in the chunk file {chunk_filepath}."
        )

    documents = [Document.from_dict(doc) for doc in splitter["documents"]]  # convert dict to Haystack Document object
    hierarchy = splitter["hierarchy"][page_title]

    return documents, hierarchy


def store_documents_elasticsearch(documents: List[Document]) -> None:
    """
    Write ``documents`` to Elasticsearch through the shared ``e_writer``.

    Args:
        documents: Haystack documents to hand to the writer.
    """
    e_writer.run(documents=documents)
    
def get_embedded_documents(documents: List[Document], filepath: str, page_filename: str) -> List[Document]:
    """
    Return the embedded version of ``documents``, backed by an on-disk cache.

    The cache file is ``<filepath>/.metadata/index/embeddings/<page_filename without extension>.json``.
    If it exists, the documents stored in it are returned and the embedder is not called.
    Otherwise the documents are embedded with the shared ``embedder``, saved to the cache file
    and returned.

    Args:
        documents: Documents to embed when no cached result exists.
        filepath: Directory of the category the page belongs to.
        page_filename: File name of the page (its extension is replaced by ``.json``).

    Returns:
        The embedded documents, either loaded from the cache or freshly created.

    Raises:
        KeyError: If the embedder result has no "documents" key.
    """
    page_filename_wo_ext = os.path.splitext(page_filename)[0]
    embeddings_dir = os.path.join(filepath, ".metadata/index/embeddings")
    embeddings_filepath = os.path.join(embeddings_dir, f"{page_filename_wo_ext}.json")

    if os.path.exists(embeddings_filepath):
        with open(embeddings_filepath, "r", encoding="utf-8") as file:
            data = json.load(file)
        return [Document.from_dict(doc) for doc in data["documents"]]

    # exist_ok avoids failing when the directory appears between a check and its creation.
    os.makedirs(embeddings_dir, exist_ok=True)

    # Create embeddings and store embedded documents
    embedded_documents = embedder.run(documents=documents)
    if "documents" not in embedded_documents:
        raise KeyError("The 'documents' key is missing in the embedded_documents returned from embedder.")
    embedded_docs_file_to_save = {
        "documents": [doc.to_dict() for doc in embedded_documents["documents"]],  # convert Haystack Document object to dict
        "meta": embedded_documents["meta"] if "meta" in embedded_documents else {},
    }

    # Write to a temporary file and rename it into place. A failure while serialising must not
    # leave a truncated cache file behind, because its mere existence counts as a cache hit.
    tmp_filepath = f"{embeddings_filepath}.tmp"
    try:
        with open(tmp_filepath, "w", encoding="utf-8") as file:
            json.dump(embedded_docs_file_to_save, file)
        os.replace(tmp_filepath, embeddings_filepath)
    finally:
        if os.path.exists(tmp_filepath):
            os.remove(tmp_filepath)

    return embedded_documents["documents"]



def store_documents_weaviate(documents: List[Document]) -> None:
    """
    Write ``documents`` to Weaviate through the shared ``w_writer``.

    Args:
        documents: Haystack documents to hand to the writer.
    """
    w_writer.run(documents=documents)
    

def index_wiki_pages(category: str, filepath: str, category_pages_indexed: Dict[str, int], page_graph_creator: Neo4jPageGraphCreator, depth: int) -> int:
    """
    Indexes already chunked wiki data for all pages in a category and its subcategories. Chunked data is available in
    the .metadata/chunk directory. The intermediate embeddings are stored in the .metadata/index/embeddings directory.

    List of Haystack Document objects is created from stored chunks and stored into three databases:
    - ElasticsearchDocumentStore: for full-text search (list of Document objects without embeddings is stored)
    - WeaviateDocumentStore: for vector search (list of Document objects enriched with embeddings is stored)
    - Neo4j: for graph search (list of Document objects are stored as Chunk type nodes and Section, Page, Category type nodes
    are created to represent the structure of the data)

    Progress is tracked in the Redis sets "indexed_pages" and "indexed_categories": entries already
    present there are skipped, and new entries are added once they have been processed.

    Args:
        category: Title of the category stored in ``filepath``.
        filepath: Directory of the category.
        category_pages_indexed: Output mapping; receives ``category -> pages indexed directly in it``
            for every category in which at least one page was indexed by this call.
        page_graph_creator: Creator used to build the page graph from each page hierarchy.
        depth: Current recursion depth; nothing is indexed once it exceeds 100.

    Returns:
        Number of pages indexed by this call, including those of its subcategories.
    """
    max_depth = 100
    if depth > max_depth:
        return 0

    title_pathname = get_title_pathname_map(filepath)

    category_dir = Path(filepath)
    pages_filename_set = {entry.name for entry in category_dir.glob("*.html")}
    categories_dirname_set = {
        entry.name for entry in category_dir.iterdir() if entry.is_dir() and entry.name != ".metadata"
    }

    num_total_pages_indexed = 0

    pages = title_pathname["pages"]
    for page_title, page_filename in pages.items():
        if r.sismember("indexed_pages", page_title):
            continue
        if page_filename not in pages_filename_set:
            continue
        documents, hierarchy = get_documents_and_page_hierarchy(filepath, page_title, page_filename)
        store_documents_elasticsearch(documents)
        embedded_documents = get_embedded_documents(documents, filepath, page_filename)
        store_documents_weaviate(embedded_documents)
        page_graph_creator.create_graph(hierarchy)
        # Mark the page only after all three stores have been written.
        r.sadd("indexed_pages", page_title)
        num_total_pages_indexed += 1

    if num_total_pages_indexed > 0:
        category_pages_indexed[category] = num_total_pages_indexed

    subcategories = title_pathname["categories"]

    # Subcategories would run at depth + 1. If that is beyond the limit the recursive call would
    # return 0 without indexing anything, so they must not be recorded as indexed: doing so would
    # make every later run skip them for good.
    if depth + 1 > max_depth:
        return num_total_pages_indexed

    for subcategory_title, subcategory_path in subcategories.items():
        if r.sismember("indexed_categories", subcategory_title):
            continue
        if subcategory_path not in categories_dirname_set:
            continue
        subcategory_path = os.path.join(filepath, subcategory_path)
        num_total_pages_indexed += index_wiki_pages(subcategory_title, subcategory_path, category_pages_indexed, page_graph_creator, depth + 1)
        r.sadd("indexed_categories", subcategory_title)

    return num_total_pages_indexed

In [30]:
def create_category_graph(category: str, filepath: str, category_graph_creator: Neo4jCategoryGraphCreator) -> None:
    """
    Creates a graph representation of the category and connections to its subcategories and pages. The graph is created
    on top of the individual page hierarchy graphs already existing in Neo4j.

    Only pages found in the Redis set "indexed_pages" and subcategories found in the Redis set
    "indexed_categories" are linked; linked subcategories are processed recursively.

    Args:
        category: Title of the category stored in ``filepath``.
        filepath: Directory of the category.
        category_graph_creator: Creator used to add the category relationships.

    Returns:
        None. The function is used for its side effects on the graph only.
    """
    title_pathname = get_title_pathname_map(filepath)

    for page_title in title_pathname["pages"]:
        if r.sismember("indexed_pages", page_title):
            category_graph_creator.create_category_to_page_relationship(category, page_title)

    for subcategory_title, subcategory_path in title_pathname["categories"].items():
        if not r.sismember("indexed_categories", subcategory_title):
            continue
        category_graph_creator.create_category_to_subcategory_relationship(category, subcategory_title)
        create_category_graph(subcategory_title, os.path.join(filepath, subcategory_path), category_graph_creator)
    

In [31]:
def index_wiki_data(category: str, filepath: str, page_graph_creator: Neo4jPageGraphCreator, category_graph_creator: Neo4jCategoryGraphCreator, depth: int = 0) -> None:
    """
    Index a category tree and then build its category graph.

    First runs ``index_wiki_pages`` for ``category`` (and, through it, its subcategories), then
    runs ``create_category_graph`` for the same category, and finally prints a short summary:
    the total number of pages indexed and the per-category counts.

    Args:
        category: Title of the category stored in ``filepath``.
        filepath: Directory of the category.
        page_graph_creator: Passed through to ``index_wiki_pages``.
        category_graph_creator: Passed through to ``create_category_graph``.
        depth: Starting depth handed to ``index_wiki_pages``.
    """
    pages_per_category = {}
    total_indexed = index_wiki_pages(category, filepath, pages_per_category, page_graph_creator, depth)
    create_category_graph(category, filepath, category_graph_creator)

    summary_lines = (
        f"Indexed {total_indexed} total pages in the category {category}.",
        f"Split up of pages indexed per subcategory in {category}: {pages_per_category}",
    )
    for line in summary_lines:
        print(line)

In [32]:
# Connection settings can be overridden through the environment; the fallbacks are the
# local development values this notebook has always used.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "neo4jpass")

page_graph_creator = Neo4jPageGraphCreator(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)
try:
    category_graph_creator = Neo4jCategoryGraphCreator(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)
    try:
        # index_wiki_pages stops once depth exceeds 100, so starting at 99 processes the
        # top-level category and its direct subcategories only.
        index_wiki_data("Dinosaurs", "data/v2/Dinosaurs", page_graph_creator, category_graph_creator, 99)
    finally:
        category_graph_creator.close()
finally:
    # Close the connections even when indexing fails part-way through.
    page_graph_creator.close()

Indexed 0 total pages in the category Dinosaurs.
Split up of pages indexed per subcategory in Dinosaurs: {}


In [35]:
# Sanity check: document count reported by the Weaviate store.
w_store.count_documents()

2415

In [27]:
# Sanity check: document count reported by the Elasticsearch store. Every indexed page is
# written to both stores, so this is expected to match the Weaviate count.
e_store.count_documents()

2415

In [21]:
# Reset progress tracking: removes the Redis key "indexed_pages", so index_wiki_pages will no
# longer skip any page on its next run. Run only when a full re-index is intended.
r.delete("indexed_pages")

1

In [22]:
# Reset progress tracking: removes the Redis key "indexed_categories", so index_wiki_pages will
# descend into every subcategory again on its next run.
r.delete("indexed_categories")

1

In [28]:
# Number of page titles currently recorded as indexed.
r.scard("indexed_pages")

109

In [29]:
# Number of subcategory titles currently recorded as indexed.
r.scard("indexed_categories")

9

In [33]:
# Spot check: look up a single stored chunk in Elasticsearch by its document id.
e_store.filter_documents(filters = {"field": "id", "operator": "==", "value": "263aed0a-7c9a-4dfb-851c-03ca26ec3687"})

[Document(id=263aed0a-7c9a-4dfb-851c-03ca26ec3687, content: 'Dinosaurs are a diverse group of reptiles of the clade Dinosauria . They first appeared during the T...', meta: {'file_path': 'data/v2/Dinosaurs/Dinosaur.html', 'source_id': '3283d9d1d64425e10055eed8bc2bfb821c10a22b1c4c33964b651d49788b7918', 'split_id': 0, 'title': 'Dinosaur'}, score: 0.0)]

In [34]:
# Spot check: look up another stored chunk in Elasticsearch by its document id.
e_store.filter_documents(filters = {"field": "id", "operator": "==", "value": "8be9010c-85a1-40b8-bd5c-95b290b9f71b"})

[Document(id=8be9010c-85a1-40b8-bd5c-95b290b9f71b, content: 'Scientists will probably never be certain of the largest and smallest dinosaurs to have ever existed...', meta: {'file_path': 'data/v2/Dinosaurs/Dinosaur.html', 'source_id': '3283d9d1d64425e10055eed8bc2bfb821c10a22b1c4c33964b651d49788b7918', 'split_id': 201, 'title': 'Dinosaur', 'h2': 'Paleobiology', 'h3': 'Size', 'h4': 'Largest and smallest'}, score: 0.0)]