### Components

In [1]:
from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder
from haystack_integrations.document_stores.weaviate.document_store import (
    WeaviateDocumentStore,
)
from haystack_integrations.components.retrievers.weaviate.embedding_retriever import (
    WeaviateEmbeddingRetriever,
)
from haystack.components.builders import PromptBuilder
from haystack.components.generators import OpenAIGenerator
from haystack_integrations.document_stores.elasticsearch import (
    ElasticsearchDocumentStore,
)
from haystack_integrations.components.retrievers.elasticsearch import (
    ElasticsearchBM25Retriever,
)
from haystack.components.joiners.document_joiner import DocumentJoiner
from lib.templates import p1_qa_template


# --- Embedders ---------------------------------------------------------------
# Document and query embedders are configured with the same OpenAI model name.
document_embedder = OpenAIDocumentEmbedder(model="text-embedding-3-small")
text_embedder = OpenAITextEmbedder(model="text-embedding-3-small")

# --- Embedding-based retrieval (Weaviate) -------------------------------------
weaviate_store = WeaviateDocumentStore(url="http://localhost:8088")
weaviate_retriever = WeaviateEmbeddingRetriever(
    document_store=weaviate_store,
    top_k=3,
)

# --- BM25 retrieval (Elasticsearch) -------------------------------------------
elasticsearch_store = ElasticsearchDocumentStore(hosts="http://localhost:9200")
elasticsearch_retriever = ElasticsearchBM25Retriever(
    document_store=elasticsearch_store,
    top_k=3,
)

# --- Fusion of both retrievers' results ---------------------------------------
reciprocal_rank_fusion_joiner = DocumentJoiner(join_mode="reciprocal_rank_fusion")

# --- Prompt construction and answer generation --------------------------------
p1_qa_prompt_builder = PromptBuilder(template=p1_qa_template)
p1_qa_generator = OpenAIGenerator(model="gpt-4o-mini")

  from .autonotebook import tqdm as notebook_tqdm


### Pipeline

In [2]:
from haystack import Pipeline

hybrid_pipeline = Pipeline()

# Register every component under the name used for wiring and for run() inputs.
for component_name, component in (
    ("text_embedder", text_embedder),
    ("weaviate_retriever", weaviate_retriever),
    ("elasticsearch_retriever", elasticsearch_retriever),
    ("reciprocal_rank_fusion_joiner", reciprocal_rank_fusion_joiner),
    ("p1_qa_prompt_builder", p1_qa_prompt_builder),
    ("p1_qa_generator", p1_qa_generator),
):
    hybrid_pipeline.add_component(component_name, component)

# Wire the graph: the query embedding feeds the Weaviate retriever, both
# retrievers feed the joiner, and the joined documents feed the prompt builder
# whose prompt goes to the generator.
for sender, receiver in (
    ("text_embedder.embedding", "weaviate_retriever.query_embedding"),
    ("weaviate_retriever", "reciprocal_rank_fusion_joiner"),
    ("elasticsearch_retriever", "reciprocal_rank_fusion_joiner"),
    ("reciprocal_rank_fusion_joiner.documents", "p1_qa_prompt_builder.documents"),
    ("p1_qa_prompt_builder", "p1_qa_generator"),
):
    hybrid_pipeline.connect(sender, receiver)

# Bare expression so the cell still displays the pipeline summary as its output.
hybrid_pipeline

<haystack.core.pipeline.pipeline.Pipeline object at 0x747070a01190>
🚅 Components
  - text_embedder: OpenAITextEmbedder
  - weaviate_retriever: WeaviateEmbeddingRetriever
  - elasticsearch_retriever: ElasticsearchBM25Retriever
  - reciprocal_rank_fusion_joiner: DocumentJoiner
  - p1_qa_prompt_builder: PromptBuilder
  - p1_qa_generator: OpenAIGenerator
🛤️ Connections
  - text_embedder.embedding -> weaviate_retriever.query_embedding (List[float])
  - weaviate_retriever.documents -> reciprocal_rank_fusion_joiner.documents (List[Document])
  - elasticsearch_retriever.documents -> reciprocal_rank_fusion_joiner.documents (List[Document])
  - reciprocal_rank_fusion_joiner.documents -> p1_qa_prompt_builder.documents (List[Document])
  - p1_qa_prompt_builder.prompt -> p1_qa_generator.prompt (str)

### Pipeline input data

In [3]:
from lib.models.p1_qa import P1QA

# JSON schema of the P1QA answer model; it is passed to the prompt builder as
# the `p1_qa_schema` input in the query cells below.
# NOTE(review): presumably P1QA is a Pydantic model — confirm in lib.models.p1_qa.
p1_qa_schema = P1QA.model_json_schema()

In [10]:
# Specific factual question.
query = "When did dinosaurs go extinct?"

# Components whose intermediate outputs are returned in addition to the
# pipeline's final outputs, so retrieval results can be inspected.
traced_components = {
    "elasticsearch_retriever",
    "weaviate_retriever",
    "reciprocal_rank_fusion_joiner",
}

# The same query string goes to the BM25 retriever, the text embedder and the
# prompt builder; the prompt builder additionally receives the answer schema.
input_data = {
    "elasticsearch_retriever": {"query": query},
    "text_embedder": {"text": query},
    "p1_qa_prompt_builder": {"p1_qa_schema": p1_qa_schema, "query": query},
}

result = hybrid_pipeline.run(
    data=input_data,
    include_outputs_from=traced_components,
)

result

  timestamp = datetime.utcnow().replace(tzinfo=tzutc())
  body["sentAt"] = datetime.utcnow().replace(tzinfo=tzutc()).isoformat()


{'text_embedder': {'meta': {'model': 'text-embedding-3-small',
   'usage': {'prompt_tokens': 6, 'total_tokens': 6}}},
 'p1_qa_generator': {'replies': ['{\n  "answer": "Dinosaurs went extinct at the end of the Cretaceous period during the Cretaceous–Paleogene (K-Pg) extinction event, which occurred around 66.038 million years ago.",\n  "need_more_context": false,\n  "reasoning": "The context clearly states the timing of the extinction of dinosaurs, linking it to the K-Pg extinction event and providing an exact date."\n}'],
  'meta': [{'model': 'gpt-4o-mini-2024-07-18',
    'index': 0,
    'finish_reason': 'stop',
    'usage': {'completion_tokens': 89,
     'prompt_tokens': 1484,
     'total_tokens': 1573,
     'completion_tokens_details': {'reasoning_tokens': 0}}}]},
 'elasticsearch_retriever': {'documents': [Document(id=a0d28395-4798-4c2d-b8a3-9a29dadc09dc, content: 'Some dinosaurs are known to have used gizzard stones like modern birds. These stones are swallowed b...', meta: {'file_p

The LLM answers this straightforward question successfully and sets `need_more_context` to `false`.

In [6]:
# Broad question asking for a brief summary.
query = "Give a brief about the anatomical features of dinosaurs and their size."

# Components whose intermediate outputs are returned in addition to the
# pipeline's final outputs, so retrieval results can be inspected.
traced_components = {
    "elasticsearch_retriever",
    "weaviate_retriever",
    "reciprocal_rank_fusion_joiner",
}

# The same query string goes to the BM25 retriever, the text embedder and the
# prompt builder; the prompt builder additionally receives the answer schema.
input_data = {
    "elasticsearch_retriever": {"query": query},
    "text_embedder": {"text": query},
    "p1_qa_prompt_builder": {"p1_qa_schema": p1_qa_schema, "query": query},
}

result = hybrid_pipeline.run(
    data=input_data,
    include_outputs_from=traced_components,
)

result

{'text_embedder': {'meta': {'model': 'text-embedding-3-small',
   'usage': {'prompt_tokens': 14, 'total_tokens': 14}}},
 'p1_qa_generator': {'replies': ['{\n  "answer": "Dinosaurs exhibited a variety of anatomical features, such as modifications to the ancestral archosaurian skeleton and elaborate display structures like horns or crests. Some groups developed additional skeletal modifications, including bony armor and spines. In terms of size, while many dinosaurs were large-bodied, with the largest sauropods reaching lengths of 39.7 meters and heights of 18 meters, there were also many small dinosaurs, some measuring about 50 centimeters in length.",\n  "need_more_context": false,\n  "reasoning": "The context provides sufficient information about the anatomical features of dinosaurs as well as details concerning their size, answering the question comprehensively."\n}'],
  'meta': [{'model': 'gpt-4o-mini-2024-07-18',
    'index': 0,
    'finish_reason': 'stop',
    'usage': {'completio

In [9]:
# Open-ended "why" question.
query = "Why do children like dinosaurs?"

# Components whose intermediate outputs are returned in addition to the
# pipeline's final outputs, so retrieval results can be inspected.
traced_components = {
    "elasticsearch_retriever",
    "weaviate_retriever",
    "reciprocal_rank_fusion_joiner",
}

# The same query string goes to the BM25 retriever, the text embedder and the
# prompt builder; the prompt builder additionally receives the answer schema.
input_data = {
    "elasticsearch_retriever": {"query": query},
    "text_embedder": {"text": query},
    "p1_qa_prompt_builder": {"p1_qa_schema": p1_qa_schema, "query": query},
}

result = hybrid_pipeline.run(
    data=input_data,
    include_outputs_from=traced_components,
)

result

{'text_embedder': {'meta': {'model': 'text-embedding-3-small',
   'usage': {'prompt_tokens': 6, 'total_tokens': 6}}},
 'p1_qa_generator': {'replies': ['{\n  "answer": "The text indicates that dinosaurs have become an enduring part of popular culture and that their large sizes and seemingly monstrous nature have made them regular attractions in best-selling books and films, such as the Jurassic Park franchise. This public enthusiasm, alongside their depiction in literature and media, likely contributes to children\'s fondness for dinosaurs.",\n  "need_more_context": true,\n  "reasoning": "The context provides some reasons related to popular culture and media portrayals of dinosaurs but does not directly address why children specifically like dinosaurs. More information about children\'s preferences or psychological factors would be needed for a complete answer."\n}'],
  'meta': [{'model': 'gpt-4o-mini-2024-07-18',
    'index': 0,
    'finish_reason': 'stop',
    'usage': {'completion_to

In [8]:
# Broad question asking for a long-form (essay) answer.
query = "Write a detailed essay about the anatomical features of dinosaurs and their size."

# Components whose intermediate outputs are returned in addition to the
# pipeline's final outputs, so retrieval results can be inspected.
traced_components = {
    "elasticsearch_retriever",
    "weaviate_retriever",
    "reciprocal_rank_fusion_joiner",
}

# The same query string goes to the BM25 retriever, the text embedder and the
# prompt builder; the prompt builder additionally receives the answer schema.
input_data = {
    "elasticsearch_retriever": {"query": query},
    "text_embedder": {"text": query},
    "p1_qa_prompt_builder": {"p1_qa_schema": p1_qa_schema, "query": query},
}

result = hybrid_pipeline.run(
    data=input_data,
    include_outputs_from=traced_components,
)

result

  timestamp = datetime.utcnow().replace(tzinfo=tzutc())


{'text_embedder': {'meta': {'model': 'text-embedding-3-small',
   'usage': {'prompt_tokens': 15, 'total_tokens': 15}}},
 'p1_qa_generator': {'replies': ['{\n  "answer": "Dinosaurs exhibited various anatomical features, notably modifications to the ancestral archosaurian skeleton that are typical for the group, reflecting synapomorphies. These anatomical modifications include elaborate display structures such as horns or crests, along with skeletal adaptations like bony armor and spines found in some groups. In terms of size, dinosaurs varied immensely. While many dinosaurs were large-bodied, such as the sauropod dinosaurs, which reached lengths up to 39.7 meters and heights of 18 meters, there were also many smaller species. The smallest known dinosaur, the bee hummingbird, measured only 5 centimeters in length. Other small non-avian dinosaurs were about the size of pigeons, like Anchiornis huxleyi, which had an estimated weight of 110 grams and a total skeletal length of 34 centimeter

In [11]:
# Question about one specific species.
query = "Why did Spinosaurus have a sail on its back?"

# Components whose intermediate outputs are returned in addition to the
# pipeline's final outputs, so retrieval results can be inspected.
traced_components = {
    "elasticsearch_retriever",
    "weaviate_retriever",
    "reciprocal_rank_fusion_joiner",
}

# The same query string goes to the BM25 retriever, the text embedder and the
# prompt builder; the prompt builder additionally receives the answer schema.
input_data = {
    "elasticsearch_retriever": {"query": query},
    "text_embedder": {"text": query},
    "p1_qa_prompt_builder": {"p1_qa_schema": p1_qa_schema, "query": query},
}

result = hybrid_pipeline.run(
    data=input_data,
    include_outputs_from=traced_components,
)

result

  timestamp = datetime.utcnow().replace(tzinfo=tzutc())
  body["sentAt"] = datetime.utcnow().replace(tzinfo=tzutc()).isoformat()


{'text_embedder': {'meta': {'model': 'text-embedding-3-small',
   'usage': {'prompt_tokens': 12, 'total_tokens': 12}}},
 'p1_qa_generator': {'replies': ['{\n  "answer": "",\n  "need_more_context": true,\n  "reasoning": "The context does not provide information regarding the reasons behind the presence of a sail on Spinosaurus\'s back."\n}'],
  'meta': [{'model': 'gpt-4o-mini-2024-07-18',
    'index': 0,
    'finish_reason': 'stop',
    'usage': {'completion_tokens': 43,
     'prompt_tokens': 1609,
     'total_tokens': 1652,
     'completion_tokens_details': {'reasoning_tokens': 0}}}]},
 'elasticsearch_retriever': {'documents': [Document(id=e640edd7-12c7-45ed-9344-3ba7d826aef6, content: 'From a behavioral standpoint, one of the most valuable dinosaur fossils was discovered in the Gobi D...', meta: {'file_path': 'Dinosaur.html', 'source_id': '93000a3fb02b99d2d115cd4042256d2f5db2a0ff3928927ca14465276534a75e', 'split_id': 209, 'title': 'Dinosaurs', 'h2': 'Paleobiology', 'h3': 'Behavior'}, 

### Observations

- After multiple tweaks to `p1_qa_template`, the results are satisfactory.
- More prompt engineering can result in better responses from the model.
- For now, the model ticks these boxes:
    - if the context has no answer, the model refrains from giving a made-up answer and provides satisfactory reasoning
    - if the context has a partial answer, the model gives a partial answer AND asks for more context
    - if the question is specific and the context has the answer, the model gives an answer and DOES NOT ask for more context