In [7]:
from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder
from haystack_integrations.document_stores.weaviate.document_store import WeaviateDocumentStore
from haystack_integrations.components.retrievers.weaviate.embedding_retriever import WeaviateEmbeddingRetriever
from haystack.components.builders import PromptBuilder
from haystack.components.generators import OpenAIGenerator

embedder = OpenAIDocumentEmbedder(model="text-embedding-3-small")
text_embedder = OpenAITextEmbedder(model="text-embedding-3-small")
document_store = WeaviateDocumentStore(url="http://localhost:8088")
retriever = WeaviateEmbeddingRetriever(document_store=document_store, top_k=3)

template = """
Answer the question only using the following context. Do not use any external information. 
If the answer is not present in the context, please answer with "I don't know".

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{question}}
Answer:
"""
prompt_builder = PromptBuilder(template=template)

generator = OpenAIGenerator(model="gpt-4o-mini")


## Manually embed query and fetch context

Embed

In [14]:
from haystack import Document

question = "What caused dinosaurs to become extinct?"
document = Document(content=question)

result = embedder.run(documents=[document])

embedded_document = result["documents"][0]
embedding = embedded_document.embedding

print("Embedding:", embedding)


Calculating embeddings: 100%|██████████| 1/1 [00:10<00:00, 10.50s/it]

Embedding: [0.014867931604385376, 0.028368912637233734, 0.01067345216870308, -0.0061653233133256435, -0.034735534340143204, 0.00874474085867405, 0.010448748245835304, 0.00855280552059412, -0.00333311315625906, 0.02207719348371029, 0.025691187009215355, -0.011385016143321991, -0.04239420220255852, -0.028050580993294716, -0.012246382422745228, -0.0303350742906332, 0.02460511587560177, 0.049247682094573975, -0.0741524025797844, -0.0014254676643759012, -0.012545987963676453, 3.32996787619777e-05, 0.004789009690284729, 0.01717115007340908, 0.010102328844368458, 0.023725025355815887, 0.019642896950244904, -0.009554612450301647, -0.024230608716607094, -0.03585905581712723, 0.012433636002242565, -0.02207719348371029, 0.008056583814322948, 0.030147820711135864, -0.0185661893337965, -0.01339799165725708, 0.009287776425480843, 0.04430418834090233, 0.0042529962956905365, -0.03379926458001137, 0.04902298003435135, -0.0494723878800869, -0.0033377944491803646, 0.036065034568309784, -0.020560439676046




Retrieve

In [15]:
retrieval_result = retriever.run(query_embedding=embedding)

print(retrieval_result)

retrieved_documents = retrieval_result["documents"]

for doc in retrieved_documents:
    print(doc.content)

{'documents': [Document(id=f547552c-85a1-44e7-a0ab-1195cf30b70f, content: 'All non-avian dinosaurs and most lineages of birds became extinct in a mass extinction event, called...', meta: {'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'split_id': 112.0, 'file_path': 'dinosaur-page.html'}, score: 0.8354368209838867, embedding: vector of size 1536), Document(id=bda7c905-e2f1-48b5-a2c2-8804c3082bf5, content: 'When dinosaurs appeared, they were not the dominant terrestrial animals. The terrestrial habitats we...', meta: {'split_id': 42.0, 'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'file_path': 'dinosaur-page.html'}, score: 0.8048534393310547, embedding: vector of size 1536), Document(id=47f9c1b0-233a-4f8f-9c3f-12fa3e45c0c1, content: 'The Cretaceous–Paleogene extinction event, which occurred approximately 66 million years ago at the ...', meta: {'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3

In [16]:
retrieved_documents

[Document(id=f547552c-85a1-44e7-a0ab-1195cf30b70f, content: 'All non-avian dinosaurs and most lineages of birds became extinct in a mass extinction event, called...', meta: {'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'split_id': 112.0, 'file_path': 'dinosaur-page.html'}, score: 0.8354368209838867, embedding: vector of size 1536),
 Document(id=bda7c905-e2f1-48b5-a2c2-8804c3082bf5, content: 'When dinosaurs appeared, they were not the dominant terrestrial animals. The terrestrial habitats we...', meta: {'split_id': 42.0, 'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'file_path': 'dinosaur-page.html'}, score: 0.8048534393310547, embedding: vector of size 1536),
 Document(id=47f9c1b0-233a-4f8f-9c3f-12fa3e45c0c1, content: 'The Cretaceous–Paleogene extinction event, which occurred approximately 66 million years ago at the ...', meta: {'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'split

> **Warning:** There is an issue with embeddings. Two chunks with the same content exist. Investigate! (fixed later)

In [13]:
print(document_store.count_documents())

270


> Embeddings were added twice, once in notebook 5_embed_and_store.ipynb and later in 6_indexing_pipeline.ipynb

Generate

In [17]:
context = "\n".join([doc.content for doc in retrieved_documents])

prompt = f"Question: {question}\nContext: {context}\nAnswer:"

generation_result = generator.run(prompt=prompt)

generated_replies = generation_result["replies"]
for reply in generated_replies:
    print("Generated Reply:", reply)

Generated Reply: Dinosaurs became extinct primarily due to the Cretaceous–Paleogene (K-Pg) extinction event, which occurred around 66 million years ago. This event was likely caused by a combination of two significant factors: an extraterrestrial impact, such as the asteroid that struck the Yucatán Peninsula, and extensive volcanic activity, particularly from the Deccan Traps in India. These catastrophic events led to dramatic environmental changes, including altered climate conditions, habitat destruction, and disrupted food chains, which significantly impacted the survival of non-avian dinosaurs and many other species.

In addition to these events, factors such as the large size of most dinosaur species, their low diversity at the end of the Cretaceous, and specific ecological dependencies (like those of some bird lineages on forest habitats) may have further contributed to their extinction. In total, about 76% of species on Earth became extinct during this mass extinction event, alo

In [8]:
from haystack import Pipeline

query_pipeline = Pipeline()

query_pipeline.add_component("text_embedder", text_embedder)
query_pipeline.add_component("retriever", retriever)
query_pipeline.add_component("prompt_builder", prompt_builder)
query_pipeline.add_component("llm", generator)

query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
query_pipeline.connect("retriever", "prompt_builder.documents")
query_pipeline.connect("prompt_builder", "llm")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7a0b588e2000>
🚅 Components
  - text_embedder: OpenAITextEmbedder
  - retriever: WeaviateEmbeddingRetriever
  - prompt_builder: PromptBuilder
  - llm: OpenAIGenerator
🛤️ Connections
  - text_embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> prompt_builder.documents (List[Document])
  - prompt_builder.prompt -> llm.prompt (str)

In [12]:
question = "Why are dinosaurs so fascinating?"

response = query_pipeline.run({"text_embedder": {"text": question}, "prompt_builder": {"question": question}})

print(response["llm"]["replies"][0])


  timestamp = datetime.utcnow().replace(tzinfo=tzutc())


Dinosaurs are fascinating due to their fantastic appearance and often enormous size, which capture the popular imagination. Their regular appearances in best-selling books and films, along with the persistent public enthusiasm that generates significant funding for dinosaur science, contribute to their enduring cultural importance. The term "dinosaur" itself has entered common vernacular to describe anything impractically large, obsolete, or bound for extinction, further reflecting their impact on human culture.


### Experiments

**Exp-1**

The word 'Gymnosperm' occurs only once in the 'Dinosaur' wiki page - it occurs in hte 3rd or 4th sentence of a passage chunk of about 10 sentences. 

Test if the embeddings are able to capture the word in the semantic meaning.

In [4]:
question = "How are gymnosperm plants related to dinosaurs?"

response = query_pipeline.run({"text_embedder": {"text": question}, "prompt_builder": {"question": question}})

print(response["llm"]["replies"][0])

  timestamp = datetime.utcnow().replace(tzinfo=tzutc())
  body["sentAt"] = datetime.utcnow().replace(tzinfo=tzutc()).isoformat()


Gymnosperm plants, particularly conifers, were a potential food source for dinosaurs in the Late Triassic, as they radiated during this time. This availability of food sources likely played a role in the evolution and dietary practices of early dinosaurs, especially herbivorous species like the early sauropodomorphs.


Not bad. Satisfactory answer!

**Exp-2**



In [5]:
question = "What is Proceratosauridae?"

response = query_pipeline.run({"text_embedder": {"text": question}, "prompt_builder": {"question": question}})

print(response["llm"]["replies"][0])

  timestamp = datetime.utcnow().replace(tzinfo=tzutc())


Proceratosauridae refers to a group of tyrannosauroids with head crests.


Amazing! The word 'Proceratosauridae' occurs deep inside a very long chunk - was not expecting a pure vector embedding based retrieval to do the job. Suspected that a hybrid strategy (vector + BM25) may be required. 

**Exp-3**

In [6]:
question = "What is Averostra?"

response = query_pipeline.run({"text_embedder": {"text": question}, "prompt_builder": {"question": question}})

print(response["llm"]["replies"][0])

  timestamp = datetime.utcnow().replace(tzinfo=tzutc())
  body["sentAt"] = datetime.utcnow().replace(tzinfo=tzutc()).isoformat()


Averostra refers to a clade of theropod dinosaurs characterized by "bird snouts."


Ok the answer makes sense. But how did the LLM know that Averostra refers to a clade of theropod dinosaurs? The only reference to Averostra is the following - "Averostra ("bird snouts")". 

Modified the prompt to emphasize LLM not to use external information and to reply with "I don't know" if answer not in context.

Run manually to check which chunks are fetched.

In [9]:
from haystack import Document

question = "What is Averostra?"

document = Document(content=question)
result = embedder.run(documents=[document])
embedded_document = result["documents"][0]
embedding = embedded_document.embedding

Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.21it/s]


In [10]:
retrieval_result = retriever.run(query_embedding=embedding)
print(retrieval_result)
retrieved_documents = retrieval_result["documents"]
for doc in retrieved_documents:
    print(doc.content)

{'documents': [Document(id=d060a7dc-17f6-43e6-9b3a-4365480a5543, content: '- †Ornithischia ("bird-hipped"; diverse bipedal and quadrupedal herbivores) - †Saphornithischia ("tr...', meta: {'split_id': 57.0, 'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'file_path': 'dinosaur-page.html'}, score: 0.6415892839431763, embedding: vector of size 1536), Document(id=294a6ad7-9781-4bfb-9de4-5d1dc44d9c85, content: '- †Sauropodomorpha (herbivores with small heads, long necks, and long tails) - †Unaysauridae (primit...', meta: {'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'split_id': 59.0, 'file_path': 'dinosaur-page.html'}, score: 0.6411151885986328, embedding: vector of size 1536), Document(id=0617d027-ae05-4321-921c-44b5dd2a18f3, content: '- †Alvarezsauroidea (small hunters with reduced forelimbs) - †Alvarezsauridae (insectivores with onl...', meta: {'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c

The chunk which has the answer is ranked 2 out of 3. Also the score of the top ranked chunk ~ 0.64, so not a very confident retrieval(?). What if we could give only one chunk as context?

In [11]:
retrieval_result = retriever.run(query_embedding=embedding, top_k=1)
print(retrieval_result)
retrieved_documents = retrieval_result["documents"]
for doc in retrieved_documents:
    print(doc.content)

{'documents': [Document(id=d060a7dc-17f6-43e6-9b3a-4365480a5543, content: '- †Ornithischia ("bird-hipped"; diverse bipedal and quadrupedal herbivores) - †Saphornithischia ("tr...', meta: {'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'split_id': 57.0, 'file_path': 'dinosaur-page.html'}, score: 0.6415892839431763, embedding: vector of size 1536)]}
- †Ornithischia ("bird-hipped"; diverse bipedal and quadrupedal herbivores) - †Saphornithischia ("true" ornithischians) - †Heterodontosauridae (small herbivores/omnivores with prominent canine-like teeth) - †Genasauria ("cheeked lizards") - †Thyreophora (armored dinosaurs; bipeds and quadrupeds) - †Eurypoda (heavy, quadrupedal thyreophorans) - †Stegosauria (spikes and plates as primary armor) - †Huayangosauridae (small stegosaurs with flank osteoderms and tail clubs) - †Stegosauridae (large stegosaurs) - †Ankylosauria (scutes as primary armor) - †Parankylosauria (small, southern ankylosaurs with macuahuitl-li

The answer does not exist in the top ranked chunk. What does the LLM model say now?

In [14]:

context = "\n".join([doc.content for doc in retrieved_documents])

prompt = (
    f"Answer the question only using the following context. "
    f"Do not use any external information. If the answer is not present in the context, "
    f"please answer with \"I don't know\".\n\n"
    f"Question: {question}\n"
    f"Context:\n{context}\n"
    f"Answer:"
)
generation_result = generator.run(prompt=prompt)

generated_replies = generation_result["replies"]
for reply in generated_replies:
    print("Generated Reply:", reply)

Generated Reply: I don't know.


As expected, the LLM responds with "I don't know" as the answer does not exist in the context of the only chunk that was retrieved.

This is a good opportunity to test BM25 based retrieval.

In [15]:
from haystack_integrations.components.retrievers.weaviate.bm25_retriever import WeaviateBM25Retriever


bm25retriever = WeaviateBM25Retriever(document_store=document_store)

In [18]:
bm25_retrieval_result = bm25retriever.run(query=question, top_k=1)
print(bm25_retrieval_result)
bm25_retrieved_documents = bm25_retrieval_result["documents"]
for doc in bm25_retrieved_documents:
    print(doc.content)

{'documents': [Document(id=a0ec45c3-c0cd-4604-a176-594456f11b36, content: 'Scholarly descriptions of what would now be recognized as dinosaur bones first appeared in the late ...', meta: {'split_id': 23.0, 'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'file_path': 'dinosaur-page.html'}, score: 1.9453471899032593, embedding: vector of size 1536)]}
Scholarly descriptions of what would now be recognized as dinosaur bones first appeared in the late 17th century in England. Part of a bone, now known to have been the femur of a Megalosaurus , was recovered from a limestone quarry at Cornwell near Chipping Norton, Oxfordshire, in 1676. The fragment was sent to Robert Plot, Professor of Chemistry at the University of Oxford and first curator of the Ashmolean Museum, who published a description in his The Natural History of Oxford-shire (1677). He correctly identified the bone as the lower extremity of the femur of a large animal, and recognized that it was to

In [17]:
question

'What is Averostra?'

Unsatisfactory! Expected BM25 based embedding to return the proper chunk which had the term 'Averostra'.

In [19]:
bm25_retrieval_result = bm25retriever.run(query="What is MRCA", top_k=1)
print(bm25_retrieval_result)
bm25_retrieved_documents = bm25_retrieval_result["documents"]
for doc in bm25_retrieved_documents:
    print(doc.content)

{'documents': [Document(id=7ea8178b-0ad5-4b37-80bf-aa925decf8bb, content: 'Under phylogenetic nomenclature, dinosaurs are usually defined as the group consisting of the most r...', meta: {'split_id': 5.0, 'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'file_path': 'dinosaur-page.html'}, score: 2.5518620014190674, embedding: vector of size 1536)]}
Under phylogenetic nomenclature, dinosaurs are usually defined as the group consisting of the most recent common ancestor (MRCA) of Triceratops and modern birds (Neornithes), and all its descendants. It has also been suggested that Dinosauria be defined with respect to the MRCA of Megalosaurus and Iguanodon , because these were two of the three genera cited by Richard Owen when he recognized the Dinosauria. Both definitions cover the same known genera: Dinosauria = Ornithischia + Saurischia. This includes major groups such as ankylosaurians (armored herbivorous quadrupeds), stegosaurians (plated herbivorous qu

But BM25 is able to find the term 'MRCA' as expected. So mixed results!

Next, let's check if 'MRCA' is found in single chunk vectore embedding retrieval.