In [71]:
from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder
from haystack_integrations.document_stores.weaviate.document_store import WeaviateDocumentStore
from haystack_integrations.components.retrievers.weaviate.embedding_retriever import WeaviateEmbeddingRetriever
from haystack.components.builders import PromptBuilder
from haystack.components.generators import OpenAIGenerator
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchBM25Retriever


# Embedders: one for documents, one for query text; both use the same model.
embedder = OpenAIDocumentEmbedder(model="text-embedding-3-small")
text_embedder = OpenAITextEmbedder(model="text-embedding-3-small")
# Document stores are expected to be running locally on these ports.
weaviate_store = WeaviateDocumentStore(url="http://localhost:8088")
elasticsearch_store = ElasticsearchDocumentStore(hosts= "http://localhost:9200")
# Each retriever returns its own top 3 documents (vector search vs. BM25 keyword search).
weaviate_retriever = WeaviateEmbeddingRetriever(document_store=weaviate_store, top_k=3)
elasticsearch_retriever = ElasticsearchBM25Retriever(document_store=elasticsearch_store, top_k=3)

# Baseline RAG prompt; rendered with `documents` and `question`.
# NOTE(review): the later prompt in this notebook uses `query` instead of `question`.
template = """
Answer the question only using the following context. Do not use any external information. 
If the answer is not present in the context, please answer with "I don't know".

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{question}}
Answer:
"""
prompt_builder = PromptBuilder(template=template)

generator = OpenAIGenerator(model="gpt-4o-mini")



### Joiner

In [72]:
from haystack import Pipeline
from haystack.components.joiners.document_joiner import DocumentJoiner

# Merge both ranked result lists with reciprocal rank fusion and keep the 3 best documents.
joiner = DocumentJoiner(join_mode="reciprocal_rank_fusion", top_k=3)

p = Pipeline()

p.add_component("text_embedder", text_embedder)
p.add_component("weaviate_retriever", weaviate_retriever)
p.add_component("elasticsearch_retriever", elasticsearch_retriever)
p.add_component("joiner", joiner)

# The query embedding feeds the vector retriever; the BM25 retriever gets the raw query at run time.
p.connect("text_embedder.embedding", "weaviate_retriever.query_embedding")
# Both retrievers feed the joiner.
p.connect("weaviate_retriever", "joiner")
p.connect("elasticsearch_retriever", "joiner")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7928168806b0>
🚅 Components
  - text_embedder: OpenAITextEmbedder
  - weaviate_retriever: WeaviateEmbeddingRetriever
  - elasticsearch_retriever: ElasticsearchBM25Retriever
  - joiner: DocumentJoiner
🛤️ Connections
  - text_embedder.embedding -> weaviate_retriever.query_embedding (List[float])
  - weaviate_retriever.documents -> joiner.documents (List[Document])
  - elasticsearch_retriever.documents -> joiner.documents (List[Document])

In [50]:
query = "How many dinosaur species existed before the extinction event?"

# The same question goes to both branches: BM25 takes the raw text,
# the text embedder turns it into the vector used by the Weaviate retriever.
result = p.run(
    data={
        "elasticsearch_retriever": {"query": query},
        "text_embedder": {"text": query},
    },
    include_outputs_from={"weaviate_retriever", "elasticsearch_retriever"},
)

result

  timestamp = datetime.utcnow().replace(tzinfo=tzutc())
  body["sentAt"] = datetime.utcnow().replace(tzinfo=tzutc()).isoformat()


{'text_embedder': {'meta': {'model': 'text-embedding-3-small',
   'usage': {'prompt_tokens': 10, 'total_tokens': 10}}},
 'joiner': {'documents': [Document(id=433ad671-ea22-4ae1-9bc4-1cce06a0e6ee, content: 'Just before the K-Pg extinction event, the number of non-avian dinosaur species that existed globall...', meta: {'h3': 'Pre-extinction diversity', 'source_id': '93000a3fb02b99d2d115cd4042256d2f5db2a0ff3928927ca14465276534a75e', 'split_id': 235.0, 'file_path': 'Dinosaur.html', 'title': 'Dinosaurs', 'h4': None, 'h2': 'Extinction of major groups'}, score: 1.0, embedding: vector of size 1536),
   Document(id=3f45e988-b093-4d9c-bde1-67e2e4c3b599, content: 'All non-avian dinosaurs and most lineages of birds became extinct in a mass extinction event, called...', meta: {'file_path': 'Dinosaur.html', 'source_id': '93000a3fb02b99d2d115cd4042256d2f5db2a0ff3928927ca14465276534a75e', 'split_id': 234, 'title': 'Dinosaurs', 'h2': 'Extinction of major groups'}, score: 0.4919354838709677, embedding: 

Skipping the LLM calls here, since that part is straightforward.

### Explore graph calling

In [73]:
query = "Tell me in short about the physiology of dinosaurs."

# Broad question: feed it to both the keyword and the vector branch of the pipeline.
result = p.run(
    data={
        "elasticsearch_retriever": {"query": query},
        "text_embedder": {"text": query},
    },
    include_outputs_from={"weaviate_retriever", "elasticsearch_retriever"},
)

result

  timestamp = datetime.utcnow().replace(tzinfo=tzutc())
  body["sentAt"] = datetime.utcnow().replace(tzinfo=tzutc()).isoformat()


{'text_embedder': {'meta': {'model': 'text-embedding-3-small',
   'usage': {'prompt_tokens': 10, 'total_tokens': 10}}},
 'joiner': {'documents': [Document(id=8a7f94b9-3ae4-45a8-90ba-c9f5377c4b5b, content: 'World War II caused a pause in palaeontological research; after the war, research attention was also...', meta: {'h3': '"Dinosaur renaissance" and beyond', 'split_id': 33.0, 'source_id': '93000a3fb02b99d2d115cd4042256d2f5db2a0ff3928927ca14465276534a75e', 'file_path': 'Dinosaur.html', 'title': 'Dinosaurs', 'h4': None, 'h2': 'History of study'}, score: 0.9760624679979518, embedding: vector of size 1536),
   Document(id=3aacf659-c395-452b-92a1-e2e1d2b81ec3, content: 'The popular preoccupation with dinosaurs has ensured their appearance in literature, film, and other...', meta: {'file_path': 'Dinosaur.html', 'source_id': '93000a3fb02b99d2d115cd4042256d2f5db2a0ff3928927ca14465276534a75e', 'split_id': 244, 'title': 'Dinosaurs', 'h2': 'Cultural depictions'}, score: 0.5, embedding: vector of

### When to employ graphs?

One approach
- Loop over all documents returned by Joiner and anchor on corresponding chunks in Neo4j graph
- Find parent of chunk
- Spread out 2 levels to find all non-chunk nodes
- Compare cosine similarity with question???
- Find the most matching node and return all chunks...

Another approach (***SELECTED***)
- If LLM finds that context is not enough to answer the question, it should ask for more context
- For each retrieved chunk, find title node, create page hierarchy from graph using title node and provide the page hierarchy to the LLM
- Let LLM decide the deepest level in the hierarchy which it feels can sufficiently answer the question. Provide all chunks for that deepest level as context for the LLM to answer.

### Construct hierarchy of page given a chunk id

In [74]:
from neo4j import GraphDatabase

class WikiHierarchy:
    """Reads the section outline of a wiki page from Neo4j, starting from one of its chunks.

    The queries expect ``Page`` nodes linked to section nodes via ``HAS_SECTION`` and
    sections linked to ``Chunk`` nodes via ``HAS_CHUNK``; nodes are addressed by their
    ``uuid`` property.

    The instance can be used as a context manager so the driver is closed even when a
    query raises::

        with WikiHierarchy(uri, user, password) as wiki:
            outline = wiki.get_hierarchy(chunk_id)
    """

    # Node labels that encode the heading level of a section.
    _HEADING_LABELS = frozenset({"h2", "h3", "h4"})

    def __init__(self, uri, user, password):
        """Open a Neo4j driver for ``uri`` using basic ``(user, password)`` auth."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def get_hierarchy(self, chunk_id):
        """Return the nested section outline of the page that owns ``chunk_id``.

        Args:
            chunk_id: ``uuid`` of a ``Chunk`` node.

        Returns:
            A dict ``{"title": ..., "sections": [...]}`` as produced by
            :meth:`build_hierarchy`, or ``None`` when no page is found for the chunk.
        """
        with self.driver.session() as session:
            # Step 1: walk up from the chunk to the page it belongs to
            record = session.run("""
            MATCH (chunk:Chunk {uuid: $chunk_id})
            OPTIONAL MATCH (page:Page)-[:HAS_SECTION*0..]->(section)-[:HAS_CHUNK]->(chunk)
            WITH page
            WHERE page IS NOT NULL
            RETURN DISTINCT page
            """, chunk_id=chunk_id).single()

            if not record:
                return None

            # Step 2: recursively build the outline below the page node
            return self.build_hierarchy(session, record["page"]["uuid"])

    def build_hierarchy(self, session, node_uuid):
        """Recursively build the outline rooted at the node with ``node_uuid``.

        Args:
            session: Open Neo4j session used for all queries.
            node_uuid: ``uuid`` of the page or section node to start from.

        Returns:
            ``{"title": <node title>}`` plus a ``"sections"`` list when the node has
            ``HAS_SECTION`` children. Each child entry has ``"name"``, ``"type"``
            (``"h2"``/``"h3"``/``"h4"``, or ``None`` if the node carries none of
            these labels) and, when present, its own ``"sections"``.
        """
        node = session.run("""
        MATCH (n {uuid: $uuid})
        RETURN n
        """, uuid=node_uuid).single()["n"]

        hierarchy = {"title": node["title"]}

        child_records = session.run("""
        MATCH (n {uuid: $uuid})-[:HAS_SECTION]->(s)
        RETURN s, labels(s) AS labels
        """, uuid=node_uuid)

        section_list = []
        for child in child_records:
            section_node = child["s"]
            # A default of None avoids a StopIteration escaping when a section
            # node has no heading-level label.
            section_type = next(
                (label for label in child["labels"] if label in self._HEADING_LABELS),
                None,
            )
            entry = {"name": section_node["name"], "type": section_type}

            # Only the nested "sections" of the recursive result are kept;
            # the child's own "title" is not part of the outline entry.
            nested = self.build_hierarchy(session, section_node["uuid"])
            if "sections" in nested:
                entry["sections"] = nested["sections"]
            section_list.append(entry)

        if section_list:
            hierarchy["sections"] = section_list

        return hierarchy

# Example usage
# NOTE(review): connection details are hard-coded for the local dev database.
wiki_hierarchy = WikiHierarchy("bolt://localhost:7687", "neo4j", "neo4jpass")
chunk_id = "3aacf659-c395-452b-92a1-e2e1d2b81ec3"
try:
    hierarchy = wiki_hierarchy.get_hierarchy(chunk_id)
finally:
    # Close the driver even if the lookup raises, so no connection is leaked.
    wiki_hierarchy.close()

hierarchy

{'title': 'Dinosaur',
 'sections': [{'name': 'Definition',
   'type': 'h2',
   'sections': [{'name': 'General description', 'type': 'h3'},
    {'name': 'Distinguishing anatomical features', 'type': 'h3'}]},
  {'name': 'History of study',
   'type': 'h2',
   'sections': [{'name': 'Pre-scientific history', 'type': 'h3'},
    {'name': 'Early dinosaur research', 'type': 'h3'},
    {'name': 'Discoveries in North America', 'type': 'h3'},
    {'name': '"Dinosaur renaissance" and beyond', 'type': 'h3'},
    {'name': 'Soft tissue and molecular preservation', 'type': 'h3'}]},
  {'name': 'Evolutionary history',
   'type': 'h2',
   'sections': [{'name': 'Origins and early evolution', 'type': 'h3'},
    {'name': 'Evolution and paleobiogeography', 'type': 'h3'}]},
  {'name': 'Classification',
   'type': 'h2',
   'sections': [{'name': 'Taxonomy', 'type': 'h3'},
    {'name': 'Timeline of major groups', 'type': 'h3'}]},
  {'name': 'Paleobiology',
   'type': 'h2',
   'sections': [{'name': 'Size',
     '

## LLM experiments

### Template to encourage LLM to ask for more context

In [75]:
# Prompt that tells the model to reply "I need more context" rather than guess.
# Rendered with `documents` and `query`; rebinds the `template` name defined earlier in the notebook.
template = """
Answer the question only using the following context. Do not use any external information. 

Answer with "I need more context" in the following situations:
- answer is not present in the context
- answer is present in the context but you need more context to answer the question

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{query}}
Answer:
"""


In [76]:
# Documents come from the most recent hybrid-retrieval run stored in `result`.
documents = result["joiner"]["documents"]

llm_pipeline = Pipeline()
llm_pipeline.add_component(instance=PromptBuilder(template=template), name="prompt_builder")
# Use a fresh generator instance: a component that already belongs to a Pipeline
# cannot be added to another one, so reusing a shared instance breaks on re-run.
llm_pipeline.add_component(instance=OpenAIGenerator(model="gpt-4o-mini"), name="generator")
llm_pipeline.connect("prompt_builder", "generator")

# Keep the LLM output under its own name so `result` still holds the retrieval
# output and this cell can be re-run without a KeyError on "joiner".
llm_result = llm_pipeline.run({"prompt_builder": {"documents": documents, "query": query}})
print(llm_result)

  timestamp = datetime.utcnow().replace(tzinfo=tzutc())


{'generator': {'replies': ['I need more context.'], 'meta': [{'model': 'gpt-4o-mini-2024-07-18', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 5, 'prompt_tokens': 734, 'total_tokens': 739, 'completion_tokens_details': {'reasoning_tokens': 0}}}]}}


Sanity check to confirm that the LLM does not always respond with 'I need more context.'

In [55]:
query = "What are Carnosauria?"

# Narrow, factual question: the retrieved context should be enough for a direct answer.
result = p.run(
    data={
        "elasticsearch_retriever": {"query": query},
        "text_embedder": {"text": query},
    },
    include_outputs_from={"weaviate_retriever", "elasticsearch_retriever"},
)

result

{'text_embedder': {'meta': {'model': 'text-embedding-3-small',
   'usage': {'prompt_tokens': 6, 'total_tokens': 6}}},
 'joiner': {'documents': [Document(id=f3496c54-52db-4649-b34e-3920c0796e3d, content: '†Carnosauria (large meat-eating dinosaurs; megalosauroids sometimes included)', meta: {'h3': 'Taxonomy', 'split_id': 160.0, 'source_id': '93000a3fb02b99d2d115cd4042256d2f5db2a0ff3928927ca14465276534a75e', 'file_path': 'Dinosaur.html', 'title': 'Dinosaurs', 'h4': None, 'h2': 'Classification'}, score: 1.0, embedding: vector of size 1536),
   Document(id=bceced15-011d-4c78-9be4-168e32244697, content: 'Scientists will probably never be certain of the largest and smallest dinosaurs to have ever existed...', meta: {'file_path': 'Dinosaur.html', 'source_id': '93000a3fb02b99d2d115cd4042256d2f5db2a0ff3928927ca14465276534a75e', 'split_id': 201, 'title': 'Dinosaurs', 'h2': 'Paleobiology', 'h3': 'Size', 'h4': 'Largest and smallest'}, score: 0.4919354838709677, embedding: vector of size 1536),
   D

In [58]:
# Documents come from the most recent hybrid-retrieval run stored in `result`.
documents = result["joiner"]["documents"]

# Store the answer under its own name: overwriting `result` would make the line
# above fail with a KeyError on "joiner" when this cell is run again.
llm_result = llm_pipeline.run({"prompt_builder": {"documents": documents, "query": query}})
print(llm_result)

  timestamp = datetime.utcnow().replace(tzinfo=tzutc())
  _unclosed_resource_warn(self)
  _deprecation_warn(
  _unclosed_resource_warn(self)
  _unclosed_resource_warn(self)
  _unclosed_resource_warn(self)
  _unclosed_resource_warn(self)
  _unclosed_resource_warn(self)
  body["sentAt"] = datetime.utcnow().replace(tzinfo=tzutc()).isoformat()


{'generator': {'replies': ['Carnosauria are large meat-eating dinosaurs; megalosauroids are sometimes included in this group.'], 'meta': [{'model': 'gpt-4o-mini-2024-07-18', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 23, 'prompt_tokens': 250, 'total_tokens': 273, 'completion_tokens_details': {'reasoning_tokens': 0}}}]}}


LLM works fine - answers 'I need more context.' only when it needs to.

### Template to provide page hierarchy information to LLM

In [77]:
# Prompt asking the model to pick the section path ("Title -> h2 -> h3 ...") most likely
# to contain the answer. Rendered with `hierarchy` (the page outline) and `query`.
# NOTE(review): the prompt text contains the typo "repond"; left untouched because it is runtime text.
hierarchy_template = """
The below context provides a Wikipedia page structure in Python dict form - title, h2, h3, h4 sections.
Given the question below and given the relevant page hierarchy, think about the section that would contain the answer to the question.

Example:
If the Dinosaur page has the following structure,
{
    "title": "Dinosaur",
    "sections": [
        {
            "name": "Overview",
            "type": "h2",
            "sections": [
                {
                    "name": "Etymology",
                    "type": "h3"
                }
            ]
        }
    ]
} 
and the section "Etymology" seems to contain the answer to the question "What does the word dinosaur mean?", 
you should repond:
Dinosaur -> Overview -> Etymology

Note: It is not necessary to always go to the lowest level of the hierarchy. For example if the question is broad and 'Overview' seems to contain the answer,
you can respond: Dinosaur -> Overview


Context:
{{hierarchy}}

Question: {{query}}
Response:
"""

In [78]:


# Separate two-step pipeline for the section-picking prompt; it gets its own generator instance.
hierarchy_pipeline = Pipeline()
hierarchy_pipeline.add_component(instance=PromptBuilder(template=hierarchy_template), name="hierarchy_prompt_builder")
hierarchy_pipeline.add_component(instance=OpenAIGenerator(model="gpt-4o-mini"), name="hierarchy_generator")
hierarchy_pipeline.connect("hierarchy_prompt_builder", "hierarchy_generator")





<haystack.core.pipeline.pipeline.Pipeline object at 0x7928165f69c0>
🚅 Components
  - hierarchy_prompt_builder: PromptBuilder
  - hierarchy_generator: OpenAIGenerator
🛤️ Connections
  - hierarchy_prompt_builder.prompt -> hierarchy_generator.prompt (str)

In [79]:
# Question where the answer sits in a nested subsection of the page outline.
query = "Tell me in short about the physiology of dinosaurs."

# `hierarchy` is the page outline dict; it is rendered into the prompt as-is.
result = hierarchy_pipeline.run({"hierarchy_prompt_builder": {"hierarchy": hierarchy, "query": query}})
print(result)

{'hierarchy_generator': {'replies': ['Dinosaur -> Paleobiology -> Physiology'], 'meta': [{'model': 'gpt-4o-mini-2024-07-18', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 9, 'prompt_tokens': 842, 'total_tokens': 851, 'completion_tokens_details': {'reasoning_tokens': 0}}}]}}


In [70]:
# Broad question: checks whether the model stops at a higher level of the outline.
query = "Give me a detailed description about how dinosaurs became extinct."

result = hierarchy_pipeline.run({"hierarchy_prompt_builder": {"hierarchy": hierarchy, "query": query}})
print(result)

{'hierarchy_generator': {'replies': ['Dinosaur -> Extinction of major groups'], 'meta': [{'model': 'gpt-4o-mini-2024-07-18', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 8, 'prompt_tokens': 843, 'total_tokens': 851, 'completion_tokens_details': {'reasoning_tokens': 0}}}]}}


Good results! LLM goes deep into the hierarchy when required and stops at a higher level if the question is broad enough.

### Fetch relevant chunk ids based on LLM section response (FAILED)

In [1]:
from neo4j import GraphDatabase

class Neo4jClient:
    """Thin Neo4j wrapper that looks up chunk nodes for a section path.

    Known limitation (recorded in this notebook's notes): a path that stops at a
    non-leaf section, e.g. ``['Dinosaur', 'Extinction of major groups']``,
    returns no chunks.
    """

    def __init__(self, uri, user, password):
        """Create the driver for ``uri`` with ``(user, password)`` auth."""
        credentials = (user, password)
        self.driver = GraphDatabase.driver(uri, auth=credentials)

    def close(self):
        """Shut down the driver."""
        self.driver.close()

    def get_chunks_by_hierarchy_path(self, path):
        """Return the ``chunk`` value of every record the path query yields.

        Args:
            path: List of names, page title first, then section names.

        Returns:
            A list with one entry per result record.
        """
        cypher = """
            WITH $path AS path
            MATCH (start:Page {title: path[0]})
            WITH start, path, 1 AS idx
            CALL {
                WITH start, path, idx
                MATCH (current)-[:HAS_SECTION]->(next)
                WHERE current = start AND next.name = path[idx]
                WITH next, path, idx + 1 AS next_idx
                CALL {
                    WITH next, path, next_idx
                    MATCH (next)-[:HAS_SECTION*0..]->(subsection)
                    WHERE subsection.name = path[next_idx]
                    RETURN subsection
                    LIMIT 1
                }
                RETURN subsection
            }
            WITH subsection
            MATCH (subsection)-[:HAS_SECTION*0..]->(subsection)
            WITH subsection
            MATCH (subsection)-[:HAS_CHUNK]->(chunk:Chunk)
            RETURN chunk
            """
        found = []
        with self.driver.session() as session:
            # Consume the result while the session is still open.
            for record in session.run(cypher, path=path):
                found.append(record["chunk"])
        return found

# Example usage
neo4j_client = Neo4jClient("bolt://localhost:7687", "neo4j", "neo4jpass")
path = ['Dinosaur', 'Paleobiology', 'Physiology']
try:
    chunks = neo4j_client.get_chunks_by_hierarchy_path(path)
finally:
    # Close the driver even if the query raises, so no connection is leaked.
    neo4j_client.close()

for chunk in chunks:
    print(chunk)



<Node element_id='4:61326856-46d2-4d6e-9ccb-3d3370b8bf1f:243' labels=frozenset({'Chunk'}) properties={'uuid': '6d8ce2b2-dbcf-43bb-a598-4a61a52029ba'}>
<Node element_id='4:61326856-46d2-4d6e-9ccb-3d3370b8bf1f:242' labels=frozenset({'Chunk'}) properties={'uuid': '2d9d677d-8d1e-43bd-b800-2d536bf38ecb'}>
<Node element_id='4:61326856-46d2-4d6e-9ccb-3d3370b8bf1f:241' labels=frozenset({'Chunk'}) properties={'uuid': '20e258a0-3ef3-413b-89cc-4660319b1847'}>
<Node element_id='4:61326856-46d2-4d6e-9ccb-3d3370b8bf1f:240' labels=frozenset({'Chunk'}) properties={'uuid': '54620459-cf76-480a-accf-e8ca14bb6e91'}>
<Node element_id='4:61326856-46d2-4d6e-9ccb-3d3370b8bf1f:239' labels=frozenset({'Chunk'}) properties={'uuid': '1e97994a-ccc2-4380-8778-b104f0de2ca3'}>
<Node element_id='4:61326856-46d2-4d6e-9ccb-3d3370b8bf1f:238' labels=frozenset({'Chunk'}) properties={'uuid': 'bd119798-6c5e-4805-ae88-147b98d4ba87'}>


This seems fine!

In [3]:
neo4j_client = Neo4jClient("bolt://localhost:7687", "neo4j", "neo4jpass")
path = ['Dinosaur', 'Extinction of major groups', 'Impact event']
try:
    chunks = neo4j_client.get_chunks_by_hierarchy_path(path)
finally:
    # Close the driver even if the query raises, so no connection is leaked.
    neo4j_client.close()

for chunk in chunks:
    print(chunk)



<Node element_id='4:61326856-46d2-4d6e-9ccb-3d3370b8bf1f:263' labels=frozenset({'Chunk'}) properties={'uuid': 'bf4dc5c0-a5b0-44a3-b401-7816d62462a2'}>
<Node element_id='4:61326856-46d2-4d6e-9ccb-3d3370b8bf1f:262' labels=frozenset({'Chunk'}) properties={'uuid': '78553c50-0783-43df-8f8a-9e81b90562a9'}>


This does not seem to work correctly!

Mixed results
- path = ['Dinosaur', 'Paleobiology', 'Physiology'] seems to work fine, got 6 chunks as expected (did not verify if they are the correct chunks)
- path = ['Dinosaur', 'Extinction of major groups'] does not work; expected all chunks under it and chunks of sections/subsections also, but got none! Investigate.
    - but path = ['Dinosaur', 'Extinction of major groups', 'Impact event'] works! - buggy code, if path ends midway (not at a leaf section node), code logic does not fetch all chunks

In [13]:
from neo4j import GraphDatabase

class Neo4jClient:
    """Second attempt at fetching chunk uuids for a section path from Neo4j.

    Per this notebook's conclusion, the chunks are fetched but the result is a
    flat list per subsection rather than a nested hierarchy.
    """

    def __init__(self, uri, user, password):
        """Create the driver for ``uri`` with ``(user, password)`` auth."""
        credentials = (user, password)
        self.driver = GraphDatabase.driver(uri, auth=credentials)

    def close(self):
        """Shut down the driver."""
        self.driver.close()

    def get_chunks_by_hierarchy_path(self, path):
        """Run the path query and return all result records as a list.

        Args:
            path: List of names, page title first, then section names.

        Returns:
            The records yielded by the query; each exposes ``section_name``,
            ``chunks`` and ``filtered_hierarchy``.
        """
        cypher = """
            WITH $path AS path
            // Start with the first element in the path as the Page node
            MATCH (page:Page {title: path[0]})
            WITH page, path, 1 AS idx

            // Traverse the sections along the path, if path contains more than just the page
            CALL {
                WITH page, path, idx
                OPTIONAL MATCH (page)-[:HAS_SECTION*0..]->(section)
                WHERE (size(path) = 1 AND section IS NOT NULL) OR (size(path) > 1 AND section.name = path[idx])
                WITH section, path, idx + 1 AS next_idx
                RETURN section, next_idx
                LIMIT 1
            } 

            // If no sections are found, default to the page node itself for hierarchy collection
            WITH COALESCE(section, page) AS current, page, path

            // Collect chunks directly under this section or page
            OPTIONAL MATCH (current)-[:HAS_CHUNK]->(chunk:Chunk)
            WITH current, collect(chunk.uuid) AS chunks

            // Recursively collect chunks from subsections (if any)
            CALL {
                WITH current
                OPTIONAL MATCH (current)-[:HAS_SECTION*0..]->(subsection)
                OPTIONAL MATCH (subsection)-[:HAS_CHUNK]->(subchunk:Chunk)
                RETURN subsection.name AS subsection_name, collect(subchunk.uuid) AS subsection_chunks
            }

            // Group subsections and their chunks
            WITH current, chunks, collect({subsection: subsection_name, chunks: subsection_chunks}) AS subsection_hierarchy

            // Filter out subsections that are the same as the top-level section (avoid redundancy)
            WITH current, chunks, [subsection IN subsection_hierarchy WHERE subsection.subsection <> current.name] AS filtered_hierarchy

            // Return chunks for this section/page and its subsections in a hierarchical format
            RETURN current.name AS section_name, chunks, filtered_hierarchy
            """
        with self.driver.session() as session:
            # Materialise the records before the session closes.
            records = list(session.run(cypher, path=path))
        return records


# Example usage
neo4j_client = Neo4jClient("bolt://localhost:7687", "neo4j", "neo4jpass")
path = ['Dinosaur', 'Paleobiology']
try:
    chunks_hierarchy = neo4j_client.get_chunks_by_hierarchy_path(path)
finally:
    # Close the driver even if the query raises, so no connection is leaked.
    neo4j_client.close()

# Print out the hierarchical chunks.
# The loop variable is named `record` so it does not overwrite the notebook-level
# `hierarchy` dict that other cells pass to the LLM prompt.
for record in chunks_hierarchy:
    print(record)




<Record section_name='Paleobiology' chunks=['ed090260-bd01-484a-8029-03523e3ac3fd'] filtered_hierarchy=[{'chunks': ['44a23537-cdcb-4cb0-a256-011cbafe37ae', '51cfc07d-fa24-431c-a3aa-84bd5e6d6743'], 'subsection': 'Size'}, {'chunks': ['f39087f3-654f-42c8-b7b6-b9a671d028ad', 'f15756e5-41b7-4bc7-b2cc-0a3aae739e62', '5c19df63-6a0e-4d11-bfa5-825bbcbbc2fc', '7a844fcb-32a5-4dbf-bfd7-097d78f5a299', 'bceced15-011d-4c78-9be4-168e32244697'], 'subsection': 'Largest and smallest'}, {'chunks': ['72964dd2-a5a8-4f60-a1ce-6ab52a861eb6', 'ea809a68-4f9d-4e83-8b06-df558603f98e', 'e640edd7-12c7-45ed-9344-3ba7d826aef6', '5cc76edb-2468-48f3-b4a9-8e3f5874afeb', 'd42331a1-c62b-421b-ade7-7f9d41f371fe', 'b78f9d4b-eedd-43cf-bd90-8b446803e94d'], 'subsection': 'Behavior'}, {'chunks': ['27a7eded-5134-44c3-9507-88492413124e', '6fef6db7-2835-4fc5-8c42-e8359c72f96d', '4d2007ce-8213-498c-86ae-5dc7cf7aa3a2'], 'subsection': 'Communication'}, {'chunks': ['1094ab6d-3a32-4be5-bbec-90388fea8bcc', 'ee8d6417-efc7-40eb-bf56-233900

In [14]:
neo4j_client = Neo4jClient("bolt://localhost:7687", "neo4j", "neo4jpass")
# Path holding only the page title: exercises the "no section given" branch of the query.
path = ['Dinosaur']
try:
    chunks_hierarchy = neo4j_client.get_chunks_by_hierarchy_path(path)
finally:
    # Close the driver even if the query raises, so no connection is leaked.
    neo4j_client.close()

chunks_hierarchy



[<Record section_name=None chunks=['298c4052-ebd4-4de9-a15e-49400409b4c6', 'fb334a4d-0c1c-4a4d-98a7-ec1db6a5ecca', 'b5bb2eb7-c529-4f14-a4da-99ea0ce636bf', 'a143ea79-f5d1-450e-ae15-6f7ce5034d8f'] filtered_hierarchy=[]>]

***FAILED EXPERIMENT - Given a section path from page title, unable to satisfactorily fetch section, sub-sections and chunks data. The chunks are fetched properly, but hierarchy is lost (flat list of chunks, not nested). Spending some more time with the cypher query will surely yield results. But it is perhaps more prudent to create the hierarchy of chunks data during an earlier step when we are constructing the section hierarchy anyway.***

### Construct hierarchy of page given a chunk id - v2: construct hierarchy with chunks info

In [80]:
from neo4j import GraphDatabase

class WikiHierarchy:
    """Builds two parallel trees for a wiki page stored in Neo4j, starting from a chunk id.

    * the *section hierarchy* holds only titles/names and heading types;
    * the *chunks hierarchy* has the same shape plus, per node, the uuids of the
      chunks reached through ``FIRST_CHUNK`` followed by the ``NEXT`` chain.

    The instance can be used as a context manager so the driver is closed even when a
    query raises::

        with WikiHierarchy(uri, user, password) as wiki:
            sections, chunks = wiki.get_hierarchy(chunk_id)
    """

    # Node labels that encode the heading level of a section.
    _HEADING_LABELS = frozenset({"h2", "h3", "h4"})

    # Chunks of one node in reading order: the FIRST_CHUNK, then everything
    # reachable over NEXT. The explicit ORDER BY on the hop count makes the order
    # of collect() deterministic instead of relying on traversal order.
    _ORDERED_CHUNKS_QUERY = """
    MATCH (n {uuid: $uuid})-[:FIRST_CHUNK]->(first_chunk)
    OPTIONAL MATCH path = (first_chunk)-[:NEXT*]->(c)
    WITH first_chunk, c, length(path) AS hops
    ORDER BY hops
    WITH first_chunk, collect(c) AS subsequent_chunks
    WITH [first_chunk] + subsequent_chunks AS chunks
    UNWIND chunks AS chunk
    RETURN chunk
    """

    def __init__(self, uri, user, password):
        """Open a Neo4j driver for ``uri`` using basic ``(user, password)`` auth."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def get_hierarchy(self, chunk_id):
        """Return ``(section_hierarchy, chunks_hierarchy)`` for the page owning ``chunk_id``.

        Args:
            chunk_id: ``uuid`` of a ``Chunk`` node.

        Returns:
            The two trees built by :meth:`build_hierarchy`, or ``(None, None)``
            when no page is found for the chunk.
        """
        with self.driver.session() as session:
            # Step 1: walk up from the chunk to the page it belongs to
            record = session.run("""
            MATCH (chunk:Chunk {uuid: $chunk_id})
            OPTIONAL MATCH (page:Page)-[:HAS_SECTION*0..]->(section)-[:HAS_CHUNK]->(chunk)
            WITH page
            WHERE page IS NOT NULL
            RETURN DISTINCT page
            """, chunk_id=chunk_id).single()

            if not record:
                return None, None

            page_node = record["page"]

            # Kept from the original cell: shows which page was resolved.
            print(page_node)

            # Step 2: recursively build both trees below the page node
            return self.build_hierarchy(session, page_node["uuid"])

    def _ordered_chunk_ids(self, session, node_uuid):
        """Return the chunk uuids attached to ``node_uuid`` in chain order."""
        rows = session.run(self._ORDERED_CHUNKS_QUERY, uuid=node_uuid)
        return [row["chunk"]["uuid"] for row in rows]

    def build_hierarchy(self, session, node_uuid):
        """Recursively build both trees rooted at the node with ``node_uuid``.

        Args:
            session: Open Neo4j session used for all queries.
            node_uuid: ``uuid`` of the page or section node to start from.

        Returns:
            ``(section_hierarchy, chunks_hierarchy)``. Both start with ``"title"``;
            the chunks tree also has ``"chunks"`` (list of uuids). A ``"sections"``
            list is added to each when the node has ``HAS_SECTION`` children; child
            entries carry ``"name"``, ``"type"`` (``None`` if the node has no
            h2/h3/h4 label) and, in the chunks tree, ``"chunks"``.
        """
        node = session.run("""
        MATCH (n {uuid: $uuid})
        RETURN n
        """, uuid=node_uuid).single()["n"]

        section_hierarchy = {"title": node["title"]}
        chunks_hierarchy = {
            "title": node["title"],
            "chunks": self._ordered_chunk_ids(session, node_uuid),
        }

        child_records = session.run("""
        MATCH (n {uuid: $uuid})-[:HAS_SECTION]->(s)
        RETURN s, labels(s) AS labels
        """, uuid=node_uuid)

        section_list = []
        chunks_list = []
        for child in child_records:
            section_node = child["s"]
            # A default of None avoids a StopIteration escaping when a section
            # node has no heading-level label.
            section_type = next(
                (label for label in child["labels"] if label in self._HEADING_LABELS),
                None,
            )

            sub_sections, sub_chunks = self.build_hierarchy(session, section_node["uuid"])

            section_entry = {"name": section_node["name"], "type": section_type}
            chunks_entry = {
                "name": section_node["name"],
                "type": section_type,
                # The recursive call already fetched this section's chunks, so
                # reuse them instead of running the same query a second time.
                "chunks": sub_chunks["chunks"],
            }
            if "sections" in sub_sections:
                section_entry["sections"] = sub_sections["sections"]
            if "sections" in sub_chunks:
                chunks_entry["sections"] = sub_chunks["sections"]

            section_list.append(section_entry)
            chunks_list.append(chunks_entry)

        if section_list:
            section_hierarchy["sections"] = section_list
        if chunks_list:
            chunks_hierarchy["sections"] = chunks_list

        return section_hierarchy, chunks_hierarchy

# Example usage
wiki_hierarchy = WikiHierarchy("bolt://localhost:7687", "neo4j", "neo4jpass")
chunk_id = "3aacf659-c395-452b-92a1-e2e1d2b81ec3"
try:
    section_hierarchy, chunks_hierarchy = wiki_hierarchy.get_hierarchy(chunk_id)
finally:
    # Close the driver even if the lookup raises, so no connection is leaked.
    wiki_hierarchy.close()


<Node element_id='4:61326856-46d2-4d6e-9ccb-3d3370b8bf1f:0' labels=frozenset({'Page'}) properties={'title': 'Dinosaur', 'uuid': '3d987749-5549-484d-a373-ee8241785e48'}>


In [81]:
# Names-only outline: this is the tree given to the LLM as context.
print("Section Hierarchy:")
section_hierarchy

Section Hierarchy:


{'title': 'Dinosaur',
 'sections': [{'name': 'Definition',
   'type': 'h2',
   'sections': [{'name': 'General description', 'type': 'h3'},
    {'name': 'Distinguishing anatomical features', 'type': 'h3'}]},
  {'name': 'History of study',
   'type': 'h2',
   'sections': [{'name': 'Pre-scientific history', 'type': 'h3'},
    {'name': 'Early dinosaur research', 'type': 'h3'},
    {'name': 'Discoveries in North America', 'type': 'h3'},
    {'name': '"Dinosaur renaissance" and beyond', 'type': 'h3'},
    {'name': 'Soft tissue and molecular preservation', 'type': 'h3'}]},
  {'name': 'Evolutionary history',
   'type': 'h2',
   'sections': [{'name': 'Origins and early evolution', 'type': 'h3'},
    {'name': 'Evolution and paleobiogeography', 'type': 'h3'}]},
  {'name': 'Classification',
   'type': 'h2',
   'sections': [{'name': 'Taxonomy', 'type': 'h3'},
    {'name': 'Timeline of major groups', 'type': 'h3'}]},
  {'name': 'Paleobiology',
   'type': 'h2',
   'sections': [{'name': 'Size',
     '

In [82]:
# Same outline with the chunk uuids attached to every node.
print("\nChunks Hierarchy:")
chunks_hierarchy


Chunks Hierarchy:


{'title': 'Dinosaur',
 'chunks': ['a143ea79-f5d1-450e-ae15-6f7ce5034d8f',
  'b5bb2eb7-c529-4f14-a4da-99ea0ce636bf',
  'fb334a4d-0c1c-4a4d-98a7-ec1db6a5ecca',
  '298c4052-ebd4-4de9-a15e-49400409b4c6'],
 'sections': [{'name': 'Definition',
   'type': 'h2',
   'chunks': ['0c9e9c40-8387-41cd-a484-1ba20183ddb8',
    'fddc9347-5b5a-441a-828b-46e14f1c5a05',
    'cc1356da-f729-45bb-909e-ad1890e46993'],
   'sections': [{'name': 'General description',
     'type': 'h3',
     'chunks': ['a3e88b32-83b0-439e-bbb3-3d007c047c69',
      'f05fdf12-63e9-466f-9bdf-a3a7a46c1352',
      'dabe7759-3d78-4e0c-a9fe-8488df94c89b']},
    {'name': 'Distinguishing anatomical features',
     'type': 'h3',
     'chunks': ['e3301398-887c-4cc7-a10f-3299dd420a88',
      '567d7f5d-adc4-42c1-9774-1bb172553d8b',
      '7a16aafb-10ec-4552-8eb4-1d7fe2ad2b79',
      'c0bb0ab0-06b4-403d-968d-1bd8e34594bd',
      '1d0a398b-8661-4b31-b4a5-cb7d87e29c78',
      '3bacb7b3-51d1-4963-ba83-3d5f9c3c30f5',
      '5df12407-a53f-4422-bc5

This works! Both hierarchies can be constructed in a single stage:
- section hierarchy: needed as context for LLM to suggest a section path which might hold the answer to the user question
- chunk hierarchy: needed to pull out chunk ids once the LLM responds with the desired section path

Next steps:
- fetch text content of all relevant chunks from elasticsearch haystack store
- give hierarchical content text to LLM: may be useful when a broader question is asked 

### Populate chunk hierarchy dict with chunk content

1. Get relevant part of chunks_hierarchy

In [83]:
def get_hierarchy_by_path_recursive(chunks_hierarchy, sections):
    """Descend through a nested hierarchy dict along a list of section names.

    Args:
        chunks_hierarchy: Node to start from. Children are read from its
            "sections" entry and matched on their "name" entry.
        sections: Section names to follow, outermost first.

    Returns:
        The node reached once every name in ``sections`` has been consumed
        (the starting node itself when ``sections`` is empty), or None as
        soon as a name has no match at the current level.
    """
    # The function keeps its historical name, but the descent is a plain loop:
    # every step swaps `node` for the first child carrying the wanted name.
    node = chunks_hierarchy
    for wanted in sections or ():
        # A node without a "sections" entry has no children to match against.
        children = node["sections"] if "sections" in node else ()
        node = next((child for child in children if child["name"] == wanted), None)
        if node is None:
            return None
    return node

def get_hierarchy_by_path(chunks_hierarchy, path):
    """Resolve a ' -> '-separated section path to its node in the hierarchy.

    Args:
        chunks_hierarchy: Root hierarchy dict (the title-level node).
        path: Path string such as "Dinosaur -> Paleobiology -> Size". The
            first node is treated as the title and is skipped without being
            checked against the hierarchy.

    Returns:
        The node for the last path element, the root itself when the path
        holds only the title, or None when a section name is not found.
    """
    # Split the path and trim stray whitespace around every node name, so a
    # path with a trailing newline or doubled spaces (e.g. copied from an LLM
    # reply) still matches the section names stored in the hierarchy.
    nodes = [node.strip() for node in path.split(' -> ')]
    # Remove the first element (title node)
    sections = nodes[1:]
    # Delegate the actual descent to the helper
    return get_hierarchy_by_path_recursive(chunks_hierarchy, sections)



In [63]:
# Resolve one concrete section path; the leading "Dinosaur" title node is dropped by get_hierarchy_by_path.
path = "Dinosaur -> Paleobiology -> Size"
filtered_chunk_hierarchy = get_hierarchy_by_path(chunks_hierarchy, path)

# Bare expression so the resolved sub-tree is rendered as the cell output.
filtered_chunk_hierarchy

{'name': 'Size',
 'type': 'h3',
 'chunks': ['51cfc07d-fa24-431c-a3aa-84bd5e6d6743',
  '44a23537-cdcb-4cb0-a256-011cbafe37ae'],
 'sections': [{'name': 'Largest and smallest',
   'type': 'h4',
   'chunks': ['bceced15-011d-4c78-9be4-168e32244697',
    '7a844fcb-32a5-4dbf-bfd7-097d78f5a299',
    '5c19df63-6a0e-4d11-bfa5-825bbcbbc2fc',
    'f15756e5-41b7-4bc7-b2cc-0a3aae739e62',
    'f39087f3-654f-42c8-b7b6-b9a671d028ad']}]}

Works fine.

2. Fetch Documents from elasticsearch store corresponding to the chunks

In [64]:
def extract_chunks(chunks_hierarchy):
    """Flatten a hierarchy node into a list of all chunk ids beneath it.

    Ids are emitted in pre-order: a node's own "chunks" first, then the ids
    of each entry in its "sections", in the order the sections are listed.

    Args:
        chunks_hierarchy: Hierarchy node (dict) that may carry "chunks"
            and/or "sections" entries.

    Returns:
        A new list holding every chunk id found at or below the node.
    """
    def iter_chunk_ids(node):
        # Own chunks come before anything found in nested sections.
        if "chunks" in node:
            yield from node["chunks"]
        if "sections" in node:
            for child in node["sections"]:
                yield from iter_chunk_ids(child)

    return list(iter_chunk_ids(chunks_hierarchy))

# Flatten the sub-tree selected above into a plain list of chunk ids.
filtered_chunks_flat_list = extract_chunks(filtered_chunk_hierarchy)

# Bare expression so the id list is rendered as the cell output.
filtered_chunks_flat_list

['51cfc07d-fa24-431c-a3aa-84bd5e6d6743',
 '44a23537-cdcb-4cb0-a256-011cbafe37ae',
 'bceced15-011d-4c78-9be4-168e32244697',
 '7a844fcb-32a5-4dbf-bfd7-097d78f5a299',
 '5c19df63-6a0e-4d11-bfa5-825bbcbbc2fc',
 'f15756e5-41b7-4bc7-b2cc-0a3aae739e62',
 'f39087f3-654f-42c8-b7b6-b9a671d028ad']

In [65]:
# NOTE(review): this import repeats the one in the first cell of the notebook.
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore

# Store handle read by fetch_documents_from_elasticsearch; it uses the same host URL
# as `elasticsearch_store` created in the first cell.
e_store = ElasticsearchDocumentStore(hosts= "http://localhost:9200")

In [66]:
def fetch_documents_from_elasticsearch(flat_chunks_list, document_store=None):
    """Fetch the documents whose ids appear in ``flat_chunks_list``.

    Args:
        flat_chunks_list: Iterable of chunk (document) ids to look up.
        document_store: Optional object exposing
            ``filter_documents(filters=...)``. When omitted, the
            notebook-level ``e_store`` is used, as before.

    Returns:
        Whatever the store's ``filter_documents`` returns for an OR filter
        over the given ids, or an empty list when no ids are given.
    """
    chunk_ids = list(flat_chunks_list)

    # An OR filter with zero conditions does not express "match nothing";
    # how a backend interprets it is store-specific.
    # NOTE(review): Elasticsearch may treat it as match-all — confirm.
    # With nothing to look up, skip the query and return no documents.
    if not chunk_ids:
        return []

    if document_store is None:
        document_store = e_store

    # One equality condition per id, combined with OR
    filters = {
        "operator": "OR",
        "conditions": [{"field": "id", "operator": "==", "value": chunk_id} for chunk_id in chunk_ids]
    }

    return document_store.filter_documents(filters=filters)

# Look up the documents behind the flattened chunk ids.
fetched_docs = fetch_documents_from_elasticsearch(filtered_chunks_flat_list)

# Bare expression so the fetched documents are rendered as the cell output.
fetched_docs

[Document(id=51cfc07d-fa24-431c-a3aa-84bd5e6d6743, content: 'Current evidence suggests that dinosaur average size varied through the Triassic, Early Jurassic, La...', meta: {'file_path': 'Dinosaur.html', 'source_id': '93000a3fb02b99d2d115cd4042256d2f5db2a0ff3928927ca14465276534a75e', 'split_id': 199, 'title': 'Dinosaurs', 'h2': 'Paleobiology', 'h3': 'Size'}, score: 0.0, embedding: vector of size 1536),
 Document(id=44a23537-cdcb-4cb0-a256-011cbafe37ae, content: 'The sauropods were the largest and heaviest dinosaurs. For much of the dinosaur era, the smallest sa...', meta: {'file_path': 'Dinosaur.html', 'source_id': '93000a3fb02b99d2d115cd4042256d2f5db2a0ff3928927ca14465276534a75e', 'split_id': 200, 'title': 'Dinosaurs', 'h2': 'Paleobiology', 'h3': 'Size'}, score: 0.0, embedding: vector of size 1536),
 Document(id=bceced15-011d-4c78-9be4-168e32244697, content: 'Scientists will probably never be certain of the largest and smallest dinosaurs to have ever existed...', meta: {'file_path': '

3. Replace chunk ids with chunk content in filtered dict

In [67]:
def replace_chunk_ids_with_content(chunk_hierarchy, chunk_docs):
    """Render a chunk hierarchy as indented plain text, with ids swapped for content.

    Args:
        chunk_hierarchy: Hierarchy node (dict) with optional "name", "type",
            "chunks" (list of chunk ids) and "sections" (child nodes) entries.
        chunk_docs: Iterable of objects exposing ``id`` and ``content``;
            they supply the text for each chunk id.

    Returns:
        A single string: for every node a "<name> (<type>):" heading (only
        when both keys exist), followed by its chunk texts separated by blank
        lines, followed by its nested sections indented two more spaces per
        level. Ids with no matching document are rendered as a
        "Missing content for chunk <id>" placeholder.
    """
    # Lookup table: chunk id -> text of the corresponding document
    content_by_id = {}
    for doc in chunk_docs:
        content_by_id[doc.id] = doc.content

    def render(node, depth=0):
        pad = '  ' * depth  # two spaces per nesting level
        parts = []

        # Heading line, emitted only when the node has both a name and a type
        if "name" in node and "type" in node:
            parts.append(f"{pad}{node['name']} ({node['type']}):")

        # Chunk texts of this node, blank-line separated; only the first line
        # of each text receives the indent prefix
        if "chunks" in node:
            paragraphs = []
            for chunk_id in node["chunks"]:
                text = content_by_id.get(chunk_id, f"Missing content for chunk {chunk_id}")
                paragraphs.append(f"{pad}{text}")
            parts.append('\n\n'.join(paragraphs))

        # Child sections, each pushed down by extra newlines and one more indent level
        if "sections" in node:
            parts.extend('\n\n' + render(child, depth + 1) for child in node["sections"])

        return '\n'.join(parts)

    return render(chunk_hierarchy)

In [68]:
# Render the selected sub-tree as plain text with chunk ids swapped for document content.
chunk_hierarchy_docs = replace_chunk_ids_with_content(filtered_chunk_hierarchy, fetched_docs)

# The result is one multi-line string, so print it to get real line breaks.
print(chunk_hierarchy_docs)

Size (h3):
Current evidence suggests that dinosaur average size varied through the Triassic, Early Jurassic, Late Jurassic and Cretaceous. Predatory theropod dinosaurs, which occupied most terrestrial carnivore niches during the Mesozoic, most often fall into the 100-to-1,000 kg (220-to-2,200 lb) category when sorted by estimated weight into categories based on order of magnitude, whereas recent predatory carnivoran mammals peak in the 10-to-100 kg (22-to-220 lb) category. The mode of Mesozoic dinosaur body masses is between 1 and 10 metric tons (1.1 and 11.0 short tons). This contrasts sharply with the average size of Cenozoic mammals, estimated by the National Museum of Natural History as about 2 to 5 kg (4.4 to 11.0 lb).

The sauropods were the largest and heaviest dinosaurs. For much of the dinosaur era, the smallest sauropods were larger than anything else in their habitat, and the largest was an order of magnitude more massive than anything else that has since walked the Earth. G

### Give expanded text context to LLM

In [84]:
# Same four steps as for the "Size" section, now for "Physiology":
# resolve the path -> flatten chunk ids -> fetch documents -> render as text.
# Note that this reassigns the variables produced by the earlier "Size" cells.
path = "Dinosaur -> Paleobiology -> Physiology"
filtered_chunk_hierarchy = get_hierarchy_by_path(chunks_hierarchy, path)
filtered_chunks_flat_list = extract_chunks(filtered_chunk_hierarchy)
fetched_docs = fetch_documents_from_elasticsearch(filtered_chunks_flat_list)
chunk_hierarchy_docs = replace_chunk_ids_with_content(filtered_chunk_hierarchy, fetched_docs)

print(chunk_hierarchy_docs)

Physiology (h3):
Because both modern crocodilians and birds have four-chambered hearts (albeit modified in crocodilians), it is likely that this is a trait shared by all archosaurs, including all dinosaurs. While all modern birds have high metabolisms and are endothermic ("warm-blooded"), a vigorous debate has been ongoing since the 1960s regarding how far back in the dinosaur lineage this trait extended. Various researchers have supported dinosaurs as being endothermic, ectothermic ("cold-blooded"), or somewhere in between. An emerging consensus among researchers is that, while different lineages of dinosaurs would have had different metabolisms, most of them had higher metabolic rates than other reptiles but lower than living birds and mammals, which is termed mesothermy by some. Evidence from crocodiles and their extinct relatives suggests that such elevated metabolisms could have developed in the earliest archosaurs, which were the common ancestors of dinosaurs and crocodiles.

Aft

In [86]:
from haystack import Document

# Wrap the whole rendered section text in a single Document so it can be passed as the prompt context.
physiology_doc = Document(content=chunk_hierarchy_docs, meta={"name": "Physiology"})

# NOTE(review): `llm_pipeline` and `query` are not defined in this part of the notebook;
# they presumably come from earlier cells — confirm they exist after Restart & Run All.
result = llm_pipeline.run({"prompt_builder": {"documents": [physiology_doc], "query": query}})
print(result)

{'generator': {'replies': ['I need more context.'], 'meta': [{'model': 'gpt-4o-mini-2024-07-18', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 5, 'prompt_tokens': 1211, 'total_tokens': 1216, 'completion_tokens_details': {'reasoning_tokens': 0}}}]}}


***Dubious results!!!***

The LLM responds with 'I need more context.' even though enough context is provided, possibly because of the following instruction in the prompt template:

> Answer with "I need more context" in the following situations:
> - answer is not present in the context
> - answer is present in the context but you need more context to answer the question

The LLM may have gotten greedy: given the option to ask for more context, it decided to request as much as possible before answering.

Let's try to change the template and not give the LLM a chance to ask for more context (because we cannot provide any more context).

In [88]:
# Prompt template whose only fallback answer is "I don't know"; it contains no
# "I need more context" option. The question is injected through the `query` variable.
template_full_context = """
Answer the question only using the following context. Do not use any external information. If answer is not present in the context,
please answer with "I don't know".

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{query}}
Answer:
"""

In [91]:
# Two-component pipeline: a prompt builder using template_full_context, connected to an OpenAI generator.
full_context_llm_pipeline = Pipeline()
full_context_llm_pipeline.add_component(instance=PromptBuilder(template=template_full_context), name="full_context_prompt_builder")
full_context_llm_pipeline.add_component("full_context_generator", OpenAIGenerator(model="gpt-4o-mini"))
full_context_llm_pipeline.connect("full_context_prompt_builder", "full_context_generator")

# include_outputs_from presumably adds the prompt builder's own output to the result — confirm against Pipeline.run docs.
# NOTE(review): `query` is not defined in this part of the notebook — confirm it is set by an earlier cell.
result = full_context_llm_pipeline.run({"full_context_prompt_builder": {"documents": [physiology_doc], "query": query}}, include_outputs_from={"full_context_prompt_builder"})
print(result)

{'full_context_generator': {'replies': ['Dinosaurs likely had four-chambered hearts, similar to modern crocodilians and birds, suggesting they may have been endothermic or had varying metabolic rates. Evidence suggests most dinosaurs had higher metabolic rates than living reptiles but lower than birds and mammals, termed mesothermy. They may have exhibited fast growth and endothermy as indicated by fibrolamellar bone, feathers, stable internal temperatures, and polar habitation. Saurischians had an avian respiratory system, allowing for higher oxygen intake and activity levels. Dinosaurs excreted nitrogenous wastes as uric acid, helping conserve water, and their brain sizes were comparable to expected ratios based on body size, indicating they were not necessarily sluggish or unintelligent.'], 'meta': [{'model': 'gpt-4o-mini-2024-07-18', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 145, 'prompt_tokens': 1188, 'total_tokens': 1333, 'completion_tokens_details': {'r

In [94]:
# Show only the first generated reply instead of the full result dict.
print(result["full_context_generator"]["replies"][0])

Dinosaurs likely had four-chambered hearts, similar to modern crocodilians and birds, suggesting they may have been endothermic or had varying metabolic rates. Evidence suggests most dinosaurs had higher metabolic rates than living reptiles but lower than birds and mammals, termed mesothermy. They may have exhibited fast growth and endothermy as indicated by fibrolamellar bone, feathers, stable internal temperatures, and polar habitation. Saurischians had an avian respiratory system, allowing for higher oxygen intake and activity levels. Dinosaurs excreted nitrogenous wastes as uric acid, helping conserve water, and their brain sizes were comparable to expected ratios based on body size, indicating they were not necessarily sluggish or unintelligent.


Good result! LLM seems to have picked out information from all chunks provided, which seems like the logical thing to do when a broad question like this is asked.