In [48]:
from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder
from haystack_integrations.document_stores.weaviate.document_store import WeaviateDocumentStore
from haystack_integrations.components.retrievers.weaviate.embedding_retriever import WeaviateEmbeddingRetriever
from haystack.components.builders import PromptBuilder
from haystack.components.generators import OpenAIGenerator
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchBM25Retriever


embedder = OpenAIDocumentEmbedder(model="text-embedding-3-small")
text_embedder = OpenAITextEmbedder(model="text-embedding-3-small")
weaviate_store = WeaviateDocumentStore(url="http://localhost:8088")
elasticsearch_store = ElasticsearchDocumentStore(hosts= "http://localhost:9200")
weaviate_retriever = WeaviateEmbeddingRetriever(document_store=weaviate_store, top_k=3)
elasticsearch_retriever = ElasticsearchBM25Retriever(document_store=elasticsearch_store, top_k=3)

template = """
Answer the question only using the following context. Do not use any external information. 
If the answer is not present in the context, please answer with "I don't know".

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{question}}
Answer:
"""
prompt_builder = PromptBuilder(template=template)

generator = OpenAIGenerator(model="gpt-4o-mini")



### Joiner

In [49]:
from haystack import Pipeline
from haystack.components.joiners.document_joiner import DocumentJoiner

joiner = DocumentJoiner(join_mode="reciprocal_rank_fusion", top_k=3)

p = Pipeline()

p.add_component("text_embedder", text_embedder)
p.add_component("weaviate_retriever", weaviate_retriever)
p.add_component("elasticsearch_retriever", elasticsearch_retriever)
p.add_component("joiner", joiner)

p.connect("text_embedder.embedding", "weaviate_retriever.query_embedding")
p.connect("weaviate_retriever", "joiner")
p.connect("elasticsearch_retriever", "joiner")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7dae9972a180>
🚅 Components
  - text_embedder: OpenAITextEmbedder
  - weaviate_retriever: WeaviateEmbeddingRetriever
  - elasticsearch_retriever: ElasticsearchBM25Retriever
  - joiner: DocumentJoiner
🛤️ Connections
  - text_embedder.embedding -> weaviate_retriever.query_embedding (List[float])
  - weaviate_retriever.documents -> joiner.documents (List[Document])
  - elasticsearch_retriever.documents -> joiner.documents (List[Document])

In [50]:
query = "How many dinosaur species existed before the extinction event?"

result = p.run(data={"elasticsearch_retriever": {"query": query}, 
            "text_embedder": {"text": query}}, include_outputs_from={"weaviate_retriever", "elasticsearch_retriever"})

result

  timestamp = datetime.utcnow().replace(tzinfo=tzutc())
  body["sentAt"] = datetime.utcnow().replace(tzinfo=tzutc()).isoformat()


{'text_embedder': {'meta': {'model': 'text-embedding-3-small',
   'usage': {'prompt_tokens': 10, 'total_tokens': 10}}},
 'joiner': {'documents': [Document(id=433ad671-ea22-4ae1-9bc4-1cce06a0e6ee, content: 'Just before the K-Pg extinction event, the number of non-avian dinosaur species that existed globall...', meta: {'h3': 'Pre-extinction diversity', 'source_id': '93000a3fb02b99d2d115cd4042256d2f5db2a0ff3928927ca14465276534a75e', 'split_id': 235.0, 'file_path': 'Dinosaur.html', 'title': 'Dinosaurs', 'h4': None, 'h2': 'Extinction of major groups'}, score: 1.0, embedding: vector of size 1536),
   Document(id=3f45e988-b093-4d9c-bde1-67e2e4c3b599, content: 'All non-avian dinosaurs and most lineages of birds became extinct in a mass extinction event, called...', meta: {'file_path': 'Dinosaur.html', 'source_id': '93000a3fb02b99d2d115cd4042256d2f5db2a0ff3928927ca14465276534a75e', 'split_id': 234, 'title': 'Dinosaurs', 'h2': 'Extinction of major groups'}, score: 0.4919354838709677, embedding: 

Skipping LLM calls...straightforward

### Explore graph calling

In [51]:
query = "Tell me in short about the physiology of dinosaurs."

result = p.run(data={"elasticsearch_retriever": {"query": query}, 
            "text_embedder": {"text": query}}, include_outputs_from={"weaviate_retriever", "elasticsearch_retriever"})

result

{'text_embedder': {'meta': {'model': 'text-embedding-3-small',
   'usage': {'prompt_tokens': 10, 'total_tokens': 10}}},
 'joiner': {'documents': [Document(id=8a7f94b9-3ae4-45a8-90ba-c9f5377c4b5b, content: 'World War II caused a pause in palaeontological research; after the war, research attention was also...', meta: {'h3': '"Dinosaur renaissance" and beyond', 'split_id': 33.0, 'source_id': '93000a3fb02b99d2d115cd4042256d2f5db2a0ff3928927ca14465276534a75e', 'file_path': 'Dinosaur.html', 'title': 'Dinosaurs', 'h4': None, 'h2': 'History of study'}, score: 0.9760624679979518, embedding: vector of size 1536),
   Document(id=3aacf659-c395-452b-92a1-e2e1d2b81ec3, content: 'The popular preoccupation with dinosaurs has ensured their appearance in literature, film, and other...', meta: {'file_path': 'Dinosaur.html', 'source_id': '93000a3fb02b99d2d115cd4042256d2f5db2a0ff3928927ca14465276534a75e', 'split_id': 244, 'title': 'Dinosaurs', 'h2': 'Cultural depictions'}, score: 0.5, embedding: vector of

### When to employ graphs?

One approach
- Loop over all documents returned by Joiner and anchor on corresponding chunks in Neo4j graph
- Find parent of chunk
- Spread out 2 levels to find all non-chunk nodes
- Compare cosine similarity with question???
- Find the most matching node and return all chunks...

Another approach
- If LLM finds that context is not enough to answer the question, it should ask for more context
- For each retrieved chunk, find title node, create page hierarchy from graph using title node and provide the page hierarchy to the LLM
- Let LLM decide the deepest level in the hierarchy which it feels can sufficiently answer the question. Provide all chunks for that deepest level as context for the LLM to answer.

### Construct hierarchy of page given a chunk id

In [52]:
from neo4j import GraphDatabase

class WikiHierarchy:
    def __init__(self, uri, user, password):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        self.driver.close()

    def get_hierarchy(self, chunk_id):
        with self.driver.session() as session:
            # Step 1: Find the title node from the chunk ID
            page_node = session.run("""
            MATCH (chunk:Chunk {uuid: $chunk_id})
            OPTIONAL MATCH (page:Page)-[:HAS_SECTION*0..]->(section)-[:HAS_CHUNK]->(chunk)
            WITH page
            WHERE page IS NOT NULL
            RETURN DISTINCT page
            """, chunk_id=chunk_id).single()

            if not page_node:
                return None

            page_node = page_node["page"]
            
            # Step 2: Recursively build the hierarchy
            hierarchy = self.build_hierarchy(session, page_node["uuid"])
            return hierarchy

    def build_hierarchy(self, session, node_uuid):
        # Get the node details
        node = session.run("""
        MATCH (n {uuid: $uuid})
        RETURN n
        """, uuid=node_uuid).single()["n"]

        # Initialize the hierarchy dictionary
        hierarchy = {
            "title": node["title"]
        }

        # Get the sections connected to this node
        sections = session.run("""
        MATCH (n {uuid: $uuid})-[:HAS_SECTION]->(s)
        RETURN s, labels(s) AS labels
        """, uuid=node_uuid)

        section_list = []
        for section in sections:
            section_node = section["s"]
            labels = section["labels"]
            # Determine the type from the labels
            section_type = next(label for label in labels if label in {'h2', 'h3', 'h4'})
            section_hierarchy = {
                "name": section_node["name"],
                "type": section_type
            }
            # Recursively build the hierarchy for subsections
            subsection_hierarchy = self.build_hierarchy(session, section_node["uuid"])
            if "sections" in subsection_hierarchy:
                section_hierarchy["sections"] = subsection_hierarchy["sections"]
            section_list.append(section_hierarchy)

        if section_list:
            hierarchy["sections"] = section_list

        return hierarchy

# Example usage
wiki_hierarchy = WikiHierarchy("bolt://localhost:7687", "neo4j", "neo4jpass")
chunk_id = "3aacf659-c395-452b-92a1-e2e1d2b81ec3"
hierarchy = wiki_hierarchy.get_hierarchy(chunk_id)
wiki_hierarchy.close()

hierarchy

{'title': 'Dinosaur',
 'sections': [{'name': 'Definition',
   'type': 'h2',
   'sections': [{'name': 'General description', 'type': 'h3'},
    {'name': 'Distinguishing anatomical features', 'type': 'h3'}]},
  {'name': 'History of study',
   'type': 'h2',
   'sections': [{'name': 'Pre-scientific history', 'type': 'h3'},
    {'name': 'Early dinosaur research', 'type': 'h3'},
    {'name': 'Discoveries in North America', 'type': 'h3'},
    {'name': '"Dinosaur renaissance" and beyond', 'type': 'h3'},
    {'name': 'Soft tissue and molecular preservation', 'type': 'h3'}]},
  {'name': 'Evolutionary history',
   'type': 'h2',
   'sections': [{'name': 'Origins and early evolution', 'type': 'h3'},
    {'name': 'Evolution and paleobiogeography', 'type': 'h3'}]},
  {'name': 'Classification',
   'type': 'h2',
   'sections': [{'name': 'Taxonomy', 'type': 'h3'},
    {'name': 'Timeline of major groups', 'type': 'h3'}]},
  {'name': 'Paleobiology',
   'type': 'h2',
   'sections': [{'name': 'Size',
     '

## LLM experiments

### Template to encourage LLM to ask for more context

In [53]:
template = """
Answer the question only using the following context. Do not use any external information. 

Answer with "I need more context" in the following situations:
- answer is not present in the context
- answer is present in the context but you need more context to answer the question

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{query}}
Answer:
"""


In [54]:
documents = result["joiner"]["documents"]

llm_pipeline = Pipeline()
llm_pipeline.add_component(instance=PromptBuilder(template=template), name="prompt_builder")
llm_pipeline.add_component("generator", generator)
llm_pipeline.connect("prompt_builder", "generator")

result = llm_pipeline.run({"prompt_builder": {"documents": documents, "query": query}})
print(result)

{'generator': {'replies': ['I need more context.'], 'meta': [{'model': 'gpt-4o-mini-2024-07-18', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 5, 'prompt_tokens': 734, 'total_tokens': 739, 'completion_tokens_details': {'reasoning_tokens': 0}}}]}}


Sanity check to test whether LLM does not always respond with 'I need more context.'

In [55]:
query = "What are Carnosauria?"

result = p.run(data={"elasticsearch_retriever": {"query": query}, 
            "text_embedder": {"text": query}}, include_outputs_from={"weaviate_retriever", "elasticsearch_retriever"})

result

{'text_embedder': {'meta': {'model': 'text-embedding-3-small',
   'usage': {'prompt_tokens': 6, 'total_tokens': 6}}},
 'joiner': {'documents': [Document(id=f3496c54-52db-4649-b34e-3920c0796e3d, content: '†Carnosauria (large meat-eating dinosaurs; megalosauroids sometimes included)', meta: {'h3': 'Taxonomy', 'split_id': 160.0, 'source_id': '93000a3fb02b99d2d115cd4042256d2f5db2a0ff3928927ca14465276534a75e', 'file_path': 'Dinosaur.html', 'title': 'Dinosaurs', 'h4': None, 'h2': 'Classification'}, score: 1.0, embedding: vector of size 1536),
   Document(id=bceced15-011d-4c78-9be4-168e32244697, content: 'Scientists will probably never be certain of the largest and smallest dinosaurs to have ever existed...', meta: {'file_path': 'Dinosaur.html', 'source_id': '93000a3fb02b99d2d115cd4042256d2f5db2a0ff3928927ca14465276534a75e', 'split_id': 201, 'title': 'Dinosaurs', 'h2': 'Paleobiology', 'h3': 'Size', 'h4': 'Largest and smallest'}, score: 0.4919354838709677, embedding: vector of size 1536),
   D

In [58]:
documents = result["joiner"]["documents"]

result = llm_pipeline.run({"prompt_builder": {"documents": documents, "query": query}})
print(result)

  timestamp = datetime.utcnow().replace(tzinfo=tzutc())
  _unclosed_resource_warn(self)
  _deprecation_warn(
  _unclosed_resource_warn(self)
  _unclosed_resource_warn(self)
  _unclosed_resource_warn(self)
  _unclosed_resource_warn(self)
  _unclosed_resource_warn(self)
  body["sentAt"] = datetime.utcnow().replace(tzinfo=tzutc()).isoformat()


{'generator': {'replies': ['Carnosauria are large meat-eating dinosaurs; megalosauroids are sometimes included in this group.'], 'meta': [{'model': 'gpt-4o-mini-2024-07-18', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 23, 'prompt_tokens': 250, 'total_tokens': 273, 'completion_tokens_details': {'reasoning_tokens': 0}}}]}}


LLM works fine - answers 'I need more context.' only when it needs to.

### Template to provide page hierarchy information to LLM

In [67]:
hierarchy_template = """
The below context provides a Wikipedia page structure in Python dict form - title, h2, h3, h4 sections.
Given the question below and given the relevant page hierarchy, think about the section that would contain the answer to the question.

Example:
If the Dinosaur page has the following structure,
{
    "title": "Dinosaur",
    "sections": [
        {
            "name": "Overview",
            "type": "h2",
            "sections": [
                {
                    "name": "Etymology",
                    "type": "h3"
                }
            ]
        }
    ]
} 
and the section "Etymology" seems to contain the answer to the question "What does the word dinosaur mean?", 
you should repond:
Dinosaur -> Overview -> Etymology

Note: It is not necessary to always go to the lowest level of the hierarchy. For example if the question is broad and 'Overview' seems to contain the answer,
you can respond: Dinosaur -> Overview


Context:
{{hierarchy}}

Question: {{query}}
Response:
"""

In [68]:


hierarchy_pipeline = Pipeline()
hierarchy_pipeline.add_component(instance=PromptBuilder(template=hierarchy_template), name="hierarchy_prompt_builder")
hierarchy_pipeline.add_component(instance=OpenAIGenerator(model="gpt-4o-mini"), name="hierarchy_generator")
hierarchy_pipeline.connect("hierarchy_prompt_builder", "hierarchy_generator")





<haystack.core.pipeline.pipeline.Pipeline object at 0x7dae9a1964e0>
🚅 Components
  - hierarchy_prompt_builder: PromptBuilder
  - hierarchy_generator: OpenAIGenerator
🛤️ Connections
  - hierarchy_prompt_builder.prompt -> hierarchy_generator.prompt (str)

In [69]:
query = "Tell me in short about the physiology of dinosaurs."

result = hierarchy_pipeline.run({"hierarchy_prompt_builder": {"hierarchy": hierarchy, "query": query}})
print(result)

  timestamp = datetime.utcnow().replace(tzinfo=tzutc())
  body["sentAt"] = datetime.utcnow().replace(tzinfo=tzutc()).isoformat()


{'hierarchy_generator': {'replies': ['Dinosaur -> Paleobiology -> Physiology'], 'meta': [{'model': 'gpt-4o-mini-2024-07-18', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 9, 'prompt_tokens': 842, 'total_tokens': 851, 'completion_tokens_details': {'reasoning_tokens': 0}}}]}}


In [70]:
query = "Give me a detailed description about how dinosaurs became extinct."

result = hierarchy_pipeline.run({"hierarchy_prompt_builder": {"hierarchy": hierarchy, "query": query}})
print(result)

{'hierarchy_generator': {'replies': ['Dinosaur -> Extinction of major groups'], 'meta': [{'model': 'gpt-4o-mini-2024-07-18', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 8, 'prompt_tokens': 843, 'total_tokens': 851, 'completion_tokens_details': {'reasoning_tokens': 0}}}]}}


Good results! LLM goes deep into the hierarchy when required and stops at a higher level if the question is broad enough.

### Fetch relevant chunk ids based on LLM section response

In [71]:
from neo4j import GraphDatabase

class Neo4jClient:
    def __init__(self, uri, user, password):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        self.driver.close()

    def get_chunks_by_hierarchy_path(self, path):
        with self.driver.session() as session:
            result = session.run("""
            WITH $path AS path
            MATCH (start:Page {title: path[0]})
            WITH start, path, 1 AS idx
            CALL {
                WITH start, path, idx
                MATCH (current)-[:HAS_SECTION]->(next)
                WHERE current = start AND next.name = path[idx]
                WITH next, path, idx + 1 AS next_idx
                CALL {
                    WITH next, path, next_idx
                    MATCH (next)-[:HAS_SECTION*0..]->(subsection)
                    WHERE subsection.name = path[next_idx]
                    RETURN subsection
                    LIMIT 1
                }
                RETURN subsection
            }
            WITH subsection
            MATCH (subsection)-[:HAS_SECTION*0..]->(subsection)
            WITH subsection
            MATCH (subsection)-[:HAS_CHUNK]->(chunk:Chunk)
            RETURN chunk
            """, path=path)
            return [record["chunk"] for record in result]

# Example usage
neo4j_client = Neo4jClient("bolt://localhost:7687", "neo4j", "neo4jpass")
path = ['Dinosaur', 'Paleobiology', 'Physiology']
chunks = neo4j_client.get_chunks_by_hierarchy_path(path)
neo4j_client.close()

for chunk in chunks:
    print(chunk)



<Node element_id='4:61326856-46d2-4d6e-9ccb-3d3370b8bf1f:243' labels=frozenset({'Chunk'}) properties={'uuid': '6d8ce2b2-dbcf-43bb-a598-4a61a52029ba'}>
<Node element_id='4:61326856-46d2-4d6e-9ccb-3d3370b8bf1f:242' labels=frozenset({'Chunk'}) properties={'uuid': '2d9d677d-8d1e-43bd-b800-2d536bf38ecb'}>
<Node element_id='4:61326856-46d2-4d6e-9ccb-3d3370b8bf1f:241' labels=frozenset({'Chunk'}) properties={'uuid': '20e258a0-3ef3-413b-89cc-4660319b1847'}>
<Node element_id='4:61326856-46d2-4d6e-9ccb-3d3370b8bf1f:240' labels=frozenset({'Chunk'}) properties={'uuid': '54620459-cf76-480a-accf-e8ca14bb6e91'}>
<Node element_id='4:61326856-46d2-4d6e-9ccb-3d3370b8bf1f:239' labels=frozenset({'Chunk'}) properties={'uuid': '1e97994a-ccc2-4380-8778-b104f0de2ca3'}>
<Node element_id='4:61326856-46d2-4d6e-9ccb-3d3370b8bf1f:238' labels=frozenset({'Chunk'}) properties={'uuid': 'bd119798-6c5e-4805-ae88-147b98d4ba87'}>


This seems fine!

In [72]:
neo4j_client = Neo4jClient("bolt://localhost:7687", "neo4j", "neo4jpass")
path = ['Dinosaur', 'Extinction of major groups']
chunks = neo4j_client.get_chunks_by_hierarchy_path(path)
neo4j_client.close()

for chunk in chunks:
    print(chunk)



This does not seem to work correctly!

Mixed results
- path = ['Dinosaur', 'Paleobiology', 'Physiology'] seems to work fine, got 6 chunks as expected (did not verify if they are the correct chunks)
- path = ['Dinosaur', 'Extinction of major groups'] does not work; expected all chunks under it and chunks of sections/subsections also, but got none! Investigate.