### Chunk all wiki data

In [2]:
import sys
import os

# Add the parent directory of 'wiki' to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))


from pathlib import Path
from haystack.components.converters import TextFileToDocument
from haystack import Pipeline
from wiki.lib.index.chunk.wiki_page_chunker import WikiPageChunker
import redis
import json

# Redis client shared by the cells below. chunk_wiki_data uses the sets
# "chunked_pages" and "chunked_categories" on it to skip work already done.
r = redis.Redis(host='localhost', port=6379, db=0)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def get_title_pathname_map(filepath: str) -> dict:
    """
    Load the title -> path-name map of a downloaded category.

    The map is read from ``<filepath>/.metadata/download/title_pathname.json``.

    Args:
        filepath: Directory of the category whose map should be loaded.

    Returns:
        The parsed JSON content. When it is a JSON object, the keys "pages" and
        "categories" are always present (a missing one defaults to an empty dict).
        When the file does not exist, ``{"pages": {}, "categories": {}}`` is returned.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain valid JSON.
    """
    title_pathname_filepath = os.path.join(
        filepath, ".metadata/download", "title_pathname.json"
    )
    if not os.path.exists(title_pathname_filepath):
        return {"pages": {}, "categories": {}}

    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(title_pathname_filepath, "r", encoding="utf-8") as file:
        title_pathname = json.load(file)

    # chunk_wiki_data indexes both keys directly, so fill in whichever is absent
    # instead of letting a partial map fail with a KeyError.
    if isinstance(title_pathname, dict):
        title_pathname.setdefault("pages", {})
        title_pathname.setdefault("categories", {})

    return title_pathname

In [4]:
def make_result_serializable(result: dict) -> None:
    """
    Make the pipeline result JSON-serializable, in place.

    The entries under result["splitter"]["documents"] are Haystack 'Document' dataclass
    objects; each one is replaced by the plain dictionary returned by its to_dict() method.
    """
    splitter_output = result["splitter"]

    flattened = []
    for document in splitter_output["documents"]:
        flattened.append(document.to_dict())

    splitter_output["documents"] = flattened


In [7]:
def chunk_page(filepath: str, page_title: str, page_filename: str) -> None:
    """
    Chunk one page and store the chunks in the .metadata/chunk folder.

    Runs a two-component pipeline (TextFileToDocument -> WikiPageChunker) on
    ``<filepath>/<page_filename>`` and writes the serialized pipeline result to
    ``<filepath>/.metadata/chunk/<page_filename without ".html">.json``.
    Returns without doing anything when the page file does not exist.

    Args:
        filepath: Directory that contains the page file.
        page_title: Title handed to the converter as the "page_title" meta entry.
        page_filename: Name of the page file inside ``filepath``.
    """
    page_filepath = os.path.join(filepath, page_filename)
    if not os.path.exists(page_filepath):
        return

    # Run chunk pipeline
    chunk_pipeline = Pipeline()
    chunk_pipeline.add_component("converter", TextFileToDocument())
    chunk_pipeline.add_component("splitter", WikiPageChunker())
    chunk_pipeline.connect("converter", "splitter")

    result = chunk_pipeline.run(
        data={
            "converter": {
                "sources": [Path(page_filepath)],
                "meta": {"page_title": page_title},
            }
        }
    )

    make_result_serializable(result)

    # Write chunk results to file
    metadata_chunk_path = os.path.join(filepath, ".metadata/chunk")
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(metadata_chunk_path, exist_ok=True)

    # Build the base name outside the f-string: reusing double quotes inside a
    # double-quoted f-string is a SyntaxError on Python versions before 3.12.
    # NOTE: replace() removes every ".html" occurrence, not only the suffix; this is
    # kept so that the names of already written chunk files do not change.
    page_basename = page_filename.replace(".html", "")
    page_chunk_filepath = os.path.join(metadata_chunk_path, f"{page_basename}.json")

    with open(page_chunk_filepath, "w", encoding="utf-8") as file:
        json.dump(result, file)
    

In [8]:
def chunk_wiki_data(category: str, filepath: str, category_pages_chunked: dict) -> int:
    """
    Recursively chunk the pages of a category and of all its subcategories.

    The title -> path-name map of the category is loaded with get_title_pathname_map,
    and every listed page that exists as an *.html file in ``filepath`` is handed to
    chunk_page. The Redis sets "chunked_pages" and "chunked_categories" record
    finished work; titles already contained in them are skipped.

    Args:
        category: Title of the category stored at ``filepath``.
        filepath: Directory holding the category's pages and subcategory folders.
        category_pages_chunked: Updated in place. Maps a category title to the number
            of pages chunked directly in that category; categories in which no page
            was chunked are not added.

    Returns:
        Number of pages chunked in this category plus all visited subcategories.
    """
    title_map = get_title_pathname_map(filepath)
    category_dir = Path(filepath)

    # What is actually present on disk for this category
    html_names = {entry.name for entry in category_dir.glob("*.html")}
    subdir_names = {
        entry.name
        for entry in category_dir.iterdir()
        if entry.is_dir() and entry.name != ".metadata"
    }

    # Pages that belong directly to this category
    own_pages_chunked = 0
    for page_title, page_filename in title_map["pages"].items():
        if r.sismember("chunked_pages", page_title) or page_filename not in html_names:
            continue
        chunk_page(filepath, page_title, page_filename)
        own_pages_chunked += 1
        r.sadd("chunked_pages", page_title)

    if own_pages_chunked > 0:
        category_pages_chunked[category] = own_pages_chunked

    # Subcategories: recurse, then mark the subcategory as done
    nested_pages_chunked = 0
    for subcategory_title, subcategory_dirname in title_map["categories"].items():
        if (
            r.sismember("chunked_categories", subcategory_title)
            or subcategory_dirname not in subdir_names
        ):
            continue
        nested_pages_chunked += chunk_wiki_data(
            subcategory_title,
            os.path.join(filepath, subcategory_dirname),
            category_pages_chunked,
        )
        r.sadd("chunked_categories", subcategory_title)

    return own_pages_chunked + nested_pages_chunked
    

In [9]:
# Filled in place by chunk_wiki_data: category title -> pages chunked directly in that category.
category_pages_chunked = {}
# The return value is the total over the root category and every subcategory visited.
num_total_pages_chunked = chunk_wiki_data("Dinosaurs", "data/v2/Dinosaurs", category_pages_chunked)

***Chunking is pretty fast!***

In [10]:
# Total number of pages chunked by the run above
num_total_pages_chunked

109

In [11]:
# Per-category breakdown (only categories in which at least one page was chunked)
category_pages_chunked

{'Dinosaurs': 11,
 'Dinosaur-related lists': 3,
 'Dinosaur paleontology': 3,
 'Dinosaurs in popular culture': 14,
 'Dinosaur taxonomy': 1,
 'Ornithischians': 3,
 'Saurischians': 11,
 'Dinosaur stubs': 63}

In [12]:
# Number of page titles recorded in the Redis set "chunked_pages"
r.scard("chunked_pages")

109

In [13]:
# Number of subcategory titles recorded in the Redis set "chunked_categories"
r.scard("chunked_categories")

9

In [14]:
# Fetch every subcategory title that chunk_wiki_data recorded after processing it
chunked_categories = r.smembers("chunked_categories")

# Print all members
for category in chunked_categories:
    print(category.decode('utf-8'))  # Decode bytes to string

Dinosaurs by geologic time unit
Dinosaur-related lists
Dinosaurs by location
Dinosaur paleontology
Dinosaurs in popular culture
Dinosaur taxonomy
Ornithischians
Saurischians
Dinosaur stubs
