In [1]:
from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder
from haystack_integrations.document_stores.weaviate.document_store import WeaviateDocumentStore
from haystack_integrations.components.retrievers.weaviate.embedding_retriever import WeaviateEmbeddingRetriever
from haystack.components.builders import PromptBuilder
from haystack.components.generators import OpenAIGenerator
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchBM25Retriever


# Embedders: one for documents and one for query text, both using the same
# OpenAI embedding model.
# NOTE(review): `embedder` is not referenced in the cells visible here.
embedder = OpenAIDocumentEmbedder(model="text-embedding-3-small")
text_embedder = OpenAITextEmbedder(model="text-embedding-3-small")
# Document stores, both reached on localhost (Weaviate on 8088, Elasticsearch on 9200).
weaviate_store = WeaviateDocumentStore(url="http://localhost:8088")
elasticsearch_store = ElasticsearchDocumentStore(hosts= "http://localhost:9200")
# One retriever per store: embedding-based for Weaviate, BM25 for Elasticsearch,
# each configured with top_k=3.
weaviate_retriever = WeaviateEmbeddingRetriever(document_store=weaviate_store, top_k=3)
elasticsearch_retriever = ElasticsearchBM25Retriever(document_store=elasticsearch_store, top_k=3)

# Prompt template: expects a `documents` iterable (each item exposing `.content`)
# and a `question` value; instructs the model to answer from the context only.
template = """
Answer the question only using the following context. Do not use any external information. 
If the answer is not present in the context, please answer with "I don't know".

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{question}}
Answer:
"""
# NOTE(review): `prompt_builder` and `generator` are created here but are not
# added to the pipeline in the cells visible here.
prompt_builder = PromptBuilder(template=template)

generator = OpenAIGenerator(model="gpt-4o-mini")

  from .autonotebook import tqdm as notebook_tqdm


### Joiner

In [2]:
from haystack import Pipeline
from haystack.components.joiners.document_joiner import DocumentJoiner

# Merge the two retrievers' ranked lists with reciprocal rank fusion, keeping 3 documents.
joiner = DocumentJoiner(join_mode="reciprocal_rank_fusion", top_k=3)

p = Pipeline()

# Register the components under the names used by the connections below.
p.add_component("text_embedder", text_embedder)
p.add_component("weaviate_retriever", weaviate_retriever)
p.add_component("elasticsearch_retriever", elasticsearch_retriever)
p.add_component("joiner", joiner)

# Wiring: query embedding -> Weaviate retriever; both retrievers -> joiner.
# `connect` returns the pipeline, so the calls chain and the cell still ends
# on the Pipeline object that gets displayed.
(
    p.connect("text_embedder.embedding", "weaviate_retriever.query_embedding")
    .connect("weaviate_retriever", "joiner")
    .connect("elasticsearch_retriever", "joiner")
)

<haystack.core.pipeline.pipeline.Pipeline object at 0x7daeaf5e2660>
🚅 Components
  - text_embedder: OpenAITextEmbedder
  - weaviate_retriever: WeaviateEmbeddingRetriever
  - elasticsearch_retriever: ElasticsearchBM25Retriever
  - joiner: DocumentJoiner
🛤️ Connections
  - text_embedder.embedding -> weaviate_retriever.query_embedding (List[float])
  - weaviate_retriever.documents -> joiner.documents (List[Document])
  - elasticsearch_retriever.documents -> joiner.documents (List[Document])

In [3]:
# Question sent down both retrieval branches of the pipeline.
query = "How many dinosaur species existed before the extinction event?"

result = p.run(
    data={
        # The BM25 retriever receives the raw query text ...
        "elasticsearch_retriever": {"query": query},
        # ... and the text embedder receives the same text to embed.
        "text_embedder": {"text": query},
    },
    # Also request each retriever's own output in the result.
    include_outputs_from={"weaviate_retriever", "elasticsearch_retriever"},
)

result

  timestamp = datetime.utcnow().replace(tzinfo=tzutc())
  body["sentAt"] = datetime.utcnow().replace(tzinfo=tzutc()).isoformat()


{'text_embedder': {'meta': {'model': 'text-embedding-3-small',
   'usage': {'prompt_tokens': 10, 'total_tokens': 10}}},
 'joiner': {'documents': [Document(id=433ad671-ea22-4ae1-9bc4-1cce06a0e6ee, content: 'Just before the K-Pg extinction event, the number of non-avian dinosaur species that existed globall...', meta: {'h3': 'Pre-extinction diversity', 'source_id': '93000a3fb02b99d2d115cd4042256d2f5db2a0ff3928927ca14465276534a75e', 'split_id': 235.0, 'file_path': 'Dinosaur.html', 'title': 'Dinosaurs', 'h4': None, 'h2': 'Extinction of major groups'}, score: 1.0, embedding: vector of size 1536),
   Document(id=3f45e988-b093-4d9c-bde1-67e2e4c3b599, content: 'All non-avian dinosaurs and most lineages of birds became extinct in a mass extinction event, called...', meta: {'file_path': 'Dinosaur.html', 'source_id': '93000a3fb02b99d2d115cd4042256d2f5db2a0ff3928927ca14465276534a75e', 'split_id': 234, 'title': 'Dinosaurs', 'h2': 'Extinction of major groups'}, score: 0.4919354838709677, embedding: 

Skipping the LLM calls here, since that part is straightforward.

### Explore graph calling

In [4]:
# A broader question, run through the same pipeline as before.
query = "Tell me in short about the physiology of dinosaurs."

result = p.run(
    data={
        # The BM25 retriever receives the raw query text ...
        "elasticsearch_retriever": {"query": query},
        # ... and the text embedder receives the same text to embed.
        "text_embedder": {"text": query},
    },
    # Also request each retriever's own output in the result.
    include_outputs_from={"weaviate_retriever", "elasticsearch_retriever"},
)

result

  timestamp = datetime.utcnow().replace(tzinfo=tzutc())
  body["sentAt"] = datetime.utcnow().replace(tzinfo=tzutc()).isoformat()


{'text_embedder': {'meta': {'model': 'text-embedding-3-small',
   'usage': {'prompt_tokens': 10, 'total_tokens': 10}}},
 'joiner': {'documents': [Document(id=8a7f94b9-3ae4-45a8-90ba-c9f5377c4b5b, content: 'World War II caused a pause in palaeontological research; after the war, research attention was also...', meta: {'h3': '"Dinosaur renaissance" and beyond', 'split_id': 33.0, 'source_id': '93000a3fb02b99d2d115cd4042256d2f5db2a0ff3928927ca14465276534a75e', 'file_path': 'Dinosaur.html', 'title': 'Dinosaurs', 'h4': None, 'h2': 'History of study'}, score: 0.9760624679979518, embedding: vector of size 1536),
   Document(id=3aacf659-c395-452b-92a1-e2e1d2b81ec3, content: 'The popular preoccupation with dinosaurs has ensured their appearance in literature, film, and other...', meta: {'file_path': 'Dinosaur.html', 'source_id': '93000a3fb02b99d2d115cd4042256d2f5db2a0ff3928927ca14465276534a75e', 'split_id': 244, 'title': 'Dinosaurs', 'h2': 'Cultural depictions'}, score: 0.5, embedding: vector of

One approach
- Loop over all documents returned by Joiner and anchor on corresponding chunks in Neo4j graph
- Find parent of chunk
- Spread out 2 levels to find all non-chunk nodes
- Compare each node's cosine similarity with the question (TODO: not sure this is the right criterion)
- Find the best-matching node and return all of its chunks

Another approach
- If LLM finds that context is not enough to answer the question, it should ask for more context
- For each retrieved chunk, find title node, create page hierarchy from graph using title node and provide the page hierarchy to the LLM
- Let LLM decide the deepest level in the hierarchy which it feels can sufficiently answer the question. Provide all chunks for that deepest level as context for the LLM to answer.

### Construct hierarchy of page given a chunk id

In [43]:
from neo4j import GraphDatabase

class WikiHierarchy:
    """Rebuild the section outline of a wiki page from a Neo4j graph.

    The queries rely on this graph shape: a ``Page`` node reaches its sections
    through ``HAS_SECTION`` relationships (sections may nest), a section
    reaches its chunks through ``HAS_CHUNK``, and the nodes involved carry a
    ``uuid`` property.

    The instance owns a Neo4j driver. Call :meth:`close` when finished, or use
    the instance as a context manager so the driver is closed even on errors.
    """

    # Node labels that encode a section's heading level.
    _HEADING_LABELS = ("h2", "h3", "h4")

    def __init__(self, uri, user, password):
        """Open a driver for ``uri`` authenticated with ``user``/``password``."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def __enter__(self):
        """Return the instance itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver when the ``with`` block ends; never suppresses errors."""
        self.close()

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def get_hierarchy(self, chunk_id):
        """Return the outline of the page that contains the given chunk.

        Args:
            chunk_id: ``uuid`` of a ``Chunk`` node.

        Returns:
            A nested dict ``{"title": ..., "sections": [...]}`` as produced by
            :meth:`build_hierarchy`, or ``None`` when no page is found for the
            chunk.
        """
        with self.driver.session() as session:
            # Step 1: Find the page node that (transitively) owns the chunk
            record = session.run("""
            MATCH (chunk:Chunk {uuid: $chunk_id})
            OPTIONAL MATCH (page:Page)-[:HAS_SECTION*0..]->(section)-[:HAS_CHUNK]->(chunk)
            WITH page
            WHERE page IS NOT NULL
            RETURN DISTINCT page
            """, chunk_id=chunk_id).single()

            if record is None:
                return None

            # Step 2: Recursively build the hierarchy below that page
            return self.build_hierarchy(session, record["page"]["uuid"])

    def build_hierarchy(self, session, node_uuid):
        """Build the outline rooted at the node with ``node_uuid``.

        Args:
            session: An open session used to run the queries.
            node_uuid: ``uuid`` of the root node (normally a page).

        Returns:
            ``{"title": <root title>}`` plus a ``"sections"`` list when the
            root has sections. Each section is ``{"name": ..., "type": ...}``
            with its own ``"sections"`` list when it has subsections. ``type``
            is the section's ``h2``/``h3``/``h4`` label, or ``None`` when the
            node carries none of them.

        Raises:
            LookupError: If no node with ``node_uuid`` exists.
        """
        record = session.run("""
        MATCH (n {uuid: $uuid})
        RETURN n
        """, uuid=node_uuid).single()
        if record is None:
            # Previously this surfaced as an opaque "NoneType is not subscriptable".
            raise LookupError(f"No node found with uuid {node_uuid!r}")

        hierarchy = {
            "title": record["n"]["title"]
        }

        section_list = self._build_sections(session, node_uuid)
        if section_list:
            hierarchy["sections"] = section_list

        return hierarchy

    def _build_sections(self, session, node_uuid):
        """Return the (recursively nested) section dicts directly below a node."""
        # Materialize the rows first: the recursion below runs further queries
        # on the same session, so do not iterate a live result meanwhile.
        records = list(session.run("""
        MATCH (n {uuid: $uuid})-[:HAS_SECTION]->(s)
        RETURN s, labels(s) AS labels
        """, uuid=node_uuid))

        section_list = []
        for record in records:
            section_node = record["s"]
            # First heading-level label wins; None instead of a stray
            # StopIteration when the node has no such label.
            section_type = next(
                (label for label in record["labels"] if label in self._HEADING_LABELS),
                None,
            )
            section_hierarchy = {
                "name": section_node["name"],
                "type": section_type
            }
            # The section node is already in hand, so recurse straight into
            # its children instead of re-fetching it by uuid.
            subsections = self._build_sections(session, section_node["uuid"])
            if subsections:
                section_hierarchy["sections"] = subsections
            section_list.append(section_hierarchy)

        return section_list

# Example usage
import os

# Connection settings can be overridden through the environment; the fallbacks
# are the local development values this notebook was written against.
# Set NEO4J_PASSWORD rather than editing a real password into the notebook.
wiki_hierarchy = WikiHierarchy(
    os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    os.environ.get("NEO4J_USER", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "neo4jpass"),
)
chunk_id = "3aacf659-c395-452b-92a1-e2e1d2b81ec3"
try:
    hierarchy = wiki_hierarchy.get_hierarchy(chunk_id)
finally:
    # Release the driver even if the lookup fails.
    wiki_hierarchy.close()

hierarchy

{'title': 'Dinosaur',
 'sections': [{'name': 'Definition',
   'type': 'h2',
   'sections': [{'name': 'General description', 'type': 'h3'},
    {'name': 'Distinguishing anatomical features', 'type': 'h3'}]},
  {'name': 'History of study',
   'type': 'h2',
   'sections': [{'name': 'Pre-scientific history', 'type': 'h3'},
    {'name': 'Early dinosaur research', 'type': 'h3'},
    {'name': 'Discoveries in North America', 'type': 'h3'},
    {'name': '"Dinosaur renaissance" and beyond', 'type': 'h3'},
    {'name': 'Soft tissue and molecular preservation', 'type': 'h3'}]},
  {'name': 'Evolutionary history',
   'type': 'h2',
   'sections': [{'name': 'Origins and early evolution', 'type': 'h3'},
    {'name': 'Evolution and paleobiogeography', 'type': 'h3'}]},
  {'name': 'Classification',
   'type': 'h2',
   'sections': [{'name': 'Taxonomy', 'type': 'h3'},
    {'name': 'Timeline of major groups', 'type': 'h3'}]},
  {'name': 'Paleobiology',
   'type': 'h2',
   'sections': [{'name': 'Size',
     '