### Imports

In [1]:
import sys
import os

# Make the directory one level above the current working directory importable
# so that the `wiki.*` imports below resolve.
# The membership check keeps the cell idempotent: re-running it no longer adds
# a duplicate entry to sys.path each time.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

import json
import redis
from typing import List, Dict
from pathlib import Path
from haystack import Document
from typing import Tuple
from haystack.document_stores.types import DuplicatePolicy
from wiki.lib.index.graph.page_graph_creator import Neo4jPageGraphCreator
from wiki.lib.index.graph.category_graph_creator import Neo4jCategoryGraphCreator
from haystack.components.embedders import OpenAIDocumentEmbedder
from haystack_integrations.document_stores.weaviate.document_store import WeaviateDocumentStore
from haystack.components.writers import DocumentWriter
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore


  from .autonotebook import tqdm as notebook_tqdm


### Shared clients and document stores

In [2]:
# --- Redis: client used by the cells below to inspect/delete the tracking sets.
redis_host, redis_port, redis_db = 'localhost', 6379, 0
r = redis.Redis(host=redis_host, port=redis_port, db=redis_db)

# --- Embedder: OpenAI document embedder using the "text-embedding-3-small" model.
embedding_model = "text-embedding-3-small"
embedder = OpenAIDocumentEmbedder(model=embedding_model)

# --- Weaviate: document store plus a writer configured with DuplicatePolicy.SKIP.
weaviate_url = "http://localhost:8088"
w_store = WeaviateDocumentStore(url=weaviate_url)
w_writer = DocumentWriter(
    document_store=w_store,
    policy=DuplicatePolicy.SKIP,
)

# --- Elasticsearch: document store plus a writer with the same duplicate policy.
elasticsearch_hosts = "http://localhost:9200"
e_store = ElasticsearchDocumentStore(hosts=elasticsearch_hosts)
e_writer = DocumentWriter(
    document_store=e_store,
    policy=DuplicatePolicy.SKIP,
)


### Download

In [9]:
# Number of members in the Redis set named "downloaded_pages".
downloaded_pages_key = "downloaded_pages"
r.scard(downloaded_pages_key)

3760

In [10]:
# Number of members in the Redis set named "downloaded_categories".
downloaded_categories_key = "downloaded_categories"
r.scard(downloaded_categories_key)

371

### Chunk

In [11]:
# Number of members in the Redis set named "chunked_pages".
# NOTE(review): the recorded output of this cell (3761) is one more than the
# recorded "downloaded_pages" count (3760) - confirm whether that is expected.
chunked_pages_key = "chunked_pages"
r.scard(chunked_pages_key)

3761

In [12]:
# Number of members in the Redis set named "chunked_categories".
chunked_categories_key = "chunked_categories"
r.scard(chunked_categories_key)

371

### Index

In [20]:
# Number of members in the Redis set named "indexed_pages".
indexed_pages_key = "indexed_pages"
r.scard(indexed_pages_key)

3761

In [21]:
# Number of members in the Redis set named "indexed_categories".
indexed_categories_key = "indexed_categories"
r.scard(indexed_categories_key)

371

### Delete the indexed Redis sets (destructive)

In [16]:
# WARNING: destructive - removes the Redis key "indexed_pages".
# NOTE(review): this cell's execution count (16) is lower than that of the
# index-count cells above (20, 21), so it was run before them; a top-to-bottom
# "Run All" would instead delete the key right after counting it.
indexed_pages_key = "indexed_pages"
r.delete(indexed_pages_key)

1

In [17]:
# WARNING: destructive - removes the Redis key "indexed_categories".
# NOTE(review): this cell's execution count (17) is lower than that of the
# index-count cells above (20, 21), so it was run before them; a top-to-bottom
# "Run All" would instead delete the key right after counting it.
indexed_categories_key = "indexed_categories"
r.delete(indexed_categories_key)

1

### Weaviate

In [22]:
# Document count reported by the Weaviate document store.
weaviate_doc_total = w_store.count_documents()
weaviate_doc_total

77759

### Elasticsearch

In [23]:
# Document count reported by the Elasticsearch document store.
elasticsearch_doc_total = e_store.count_documents()
elasticsearch_doc_total

77759

In [24]:
# Fetch the Elasticsearch document whose "id" field equals the given id.
lookup_id = "d78ae765-7bae-4279-a37f-b0e85d9d3265"
id_filter = {"field": "id", "operator": "==", "value": lookup_id}
e_store.filter_documents(filters=id_filter)

[Document(id=d78ae765-7bae-4279-a37f-b0e85d9d3265, content: 'Daemonosaurus was named by Hans-Dieter Sues, Sterling J. Nesbitt, David S. Berman and Amy C. Henrici...', meta: {'file_path': '/aux/data/wiki/v3000/Dinosaurs/Dinosaurs_by_geologic_time_unit/Mesozoic_dinosaurs/Mesozoic_dinosaurs_of_North_America/Triassic_dinosaurs_of_North_America/Late_Triassic_dinosaurs_of_North_America/Daemonosaurus.html', 'source_id': '6ebcda616c731658766d043c4fb846836b2abffda76eb8ffa52f2af056d2e754', 'split_id': 2, 'title': 'Daemonosaurus', 'h2': 'Discovery'}, score: 0.0)]

In [25]:
# Fetch the Elasticsearch document whose "id" field equals the given id.
lookup_id = "48275e94-9b70-4bc2-9e25-30195078ac8f"
id_filter = {"field": "id", "operator": "==", "value": lookup_id}
e_store.filter_documents(filters=id_filter)

[Document(id=48275e94-9b70-4bc2-9e25-30195078ac8f, content: 'The family Mamenchisauridae is found widespread throughout Asia. A majority of the genera are found ...', meta: {'file_path': '/aux/data/wiki/v3000/Dinosaurs/Saurischians/Sauropodomorphs/Massopoda/Sauropods/Eusauropoda.html', 'source_id': '5a9e41f5d59b37a4443d403380ddf00b11753be0255e3a569bbcd5ab92b628c0', 'split_id': 11, 'title': 'Eusauropoda', 'h2': 'Distribution'}, score: 0.0)]

In [26]:

# Fetch the Elasticsearch document whose "id" field equals the given id.
lookup_id = "ba6b0326-1c5f-4cfa-9d84-8a7924ac30dc"
id_filter = {"field": "id", "operator": "==", "value": lookup_id}
e_store.filter_documents(filters=id_filter)


[Document(id=ba6b0326-1c5f-4cfa-9d84-8a7924ac30dc, content: 'The proximodorsal process is a feature of the skeleton of archosaurs. It may be a pair of tabs or bl...', meta: {'file_path': '/aux/data/wiki/v3000/Dinosaurs/Dinosaurs_by_geologic_time_unit/Mesozoic_dinosaurs/Proximodorsal_process.html', 'source_id': 'bcc02638baead7daa42a3b45a9393d598c56e914a2a18a93c36e4cdd1bbfb220', 'split_id': 0, 'title': 'Proximodorsal process'}, score: 0.0)]

In [29]:

# Fetch the Elasticsearch document whose "id" field equals the given id.
lookup_id = "0654bcf7-0682-4196-b9fb-6493c6db0e63"
id_filter = {"field": "id", "operator": "==", "value": lookup_id}
e_store.filter_documents(filters=id_filter)


[Document(id=0654bcf7-0682-4196-b9fb-6493c6db0e63, content: 'Dinosaurs are a diverse group of reptiles of the clade Dinosauria . They first appeared during the T...', meta: {'file_path': '/aux/data/wiki/v3000/Dinosaurs/Dinosaur.html', 'source_id': '67149effeb007f8f932e140439ae0dda5ab02dcc2494696f920fb494905be33b', 'split_id': 0, 'title': 'Dinosaur'}, score: 0.0)]

In [30]:

# Fetch the Elasticsearch document whose "id" field equals the given id.
lookup_id = "a2ea22fa-4f16-4405-9706-5862c1371da3"
id_filter = {"field": "id", "operator": "==", "value": lookup_id}
e_store.filter_documents(filters=id_filter)


[Document(id=a2ea22fa-4f16-4405-9706-5862c1371da3, content: 'The first dinosaur fossils were recognized in the early 19th century, with the name "dinosaur" (mean...', meta: {'file_path': '/aux/data/wiki/v3000/Dinosaurs/Dinosaur.html', 'source_id': '67149effeb007f8f932e140439ae0dda5ab02dcc2494696f920fb494905be33b', 'split_id': 3, 'title': 'Dinosaur'}, score: 0.0)]