In [6]:
from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder
from haystack_integrations.document_stores.weaviate.document_store import WeaviateDocumentStore
from haystack_integrations.components.retrievers.weaviate.embedding_retriever import WeaviateEmbeddingRetriever
from haystack.components.builders import PromptBuilder
from haystack.components.generators import OpenAIGenerator

embedder = OpenAIDocumentEmbedder(model="text-embedding-3-small")
text_embedder = OpenAITextEmbedder(model="text-embedding-3-small")
document_store = WeaviateDocumentStore(url="http://localhost:8088")
retriever = WeaviateEmbeddingRetriever(document_store=document_store, top_k=3)

template = """
Answer the question only using the following context. Do not use any external information. 
If the answer is not present in the context, please answer with "I don't know".

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{question}}
Answer:
"""
prompt_builder = PromptBuilder(template=template)

generator = OpenAIGenerator(model="gpt-4o-mini")


## Manually embed query and fetch context

Embed

In [7]:
from haystack import Document

question = "What caused dinosaurs to become extinct?"
document = Document(content=question)

result = embedder.run(documents=[document])

embedded_document = result["documents"][0]
embedding = embedded_document.embedding

print("Embedding:", embedding)


Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.93it/s]

Embedding: [0.014891155064105988, 0.028340928256511688, 0.01061380933970213, -0.006153950002044439, -0.03472419083118439, 0.008737205527722836, 0.01048277411609888, 0.008540653623640537, -0.003322665113955736, 0.022088702768087387, 0.02568279765546322, -0.011362578719854355, -0.042417798191308975, -0.02806014008820057, -0.012242383323609829, -0.030381325632333755, 0.024559643119573593, 0.04919416084885597, -0.07412818819284439, -0.0014250021195039153, -0.012560609728097916, 2.420339296804741e-05, 0.004785105586051941, 0.017193621024489403, 0.010117748752236366, 0.023698557168245316, 0.019692640751600266, -0.009602970443665981, -0.02422269620001316, -0.035866063088178635, 0.012476373463869095, -0.022069983184337616, 0.008067992515861988, 0.030100537464022636, -0.018541406840085983, -0.013356178067624569, 0.009298782795667648, 0.044327158480882645, 0.0042773461900651455, -0.033788226544857025, 0.048969533294439316, -0.04941879212856293, -0.0033530837390571833, 0.036034535616636276, -0.02




Retrieve

In [8]:
retrieval_result = retriever.run(query_embedding=embedding)

print(retrieval_result)

retrieved_documents = retrieval_result["documents"]

for doc in retrieved_documents:
    print(doc.content)

{'documents': [Document(id=f547552c-85a1-44e7-a0ab-1195cf30b70f, content: 'All non-avian dinosaurs and most lineages of birds became extinct in a mass extinction event, called...', meta: {'split_id': 112.0, 'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'file_path': 'dinosaur-page.html'}, score: 0.8353686332702637, embedding: vector of size 1536), Document(id=bda7c905-e2f1-48b5-a2c2-8804c3082bf5, content: 'When dinosaurs appeared, they were not the dominant terrestrial animals. The terrestrial habitats we...', meta: {'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'split_id': 42.0, 'file_path': 'dinosaur-page.html'}, score: 0.8047947883605957, embedding: vector of size 1536), Document(id=47f9c1b0-233a-4f8f-9c3f-12fa3e45c0c1, content: 'The Cretaceous–Paleogene extinction event, which occurred approximately 66 million years ago at the ...', meta: {'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3

In [9]:
retrieved_documents

[Document(id=f547552c-85a1-44e7-a0ab-1195cf30b70f, content: 'All non-avian dinosaurs and most lineages of birds became extinct in a mass extinction event, called...', meta: {'split_id': 112.0, 'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'file_path': 'dinosaur-page.html'}, score: 0.8353686332702637, embedding: vector of size 1536),
 Document(id=bda7c905-e2f1-48b5-a2c2-8804c3082bf5, content: 'When dinosaurs appeared, they were not the dominant terrestrial animals. The terrestrial habitats we...', meta: {'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'split_id': 42.0, 'file_path': 'dinosaur-page.html'}, score: 0.8047947883605957, embedding: vector of size 1536),
 Document(id=47f9c1b0-233a-4f8f-9c3f-12fa3e45c0c1, content: 'The Cretaceous–Paleogene extinction event, which occurred approximately 66 million years ago at the ...', meta: {'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'split

> **Warning:** There is an issue with embeddings. Two chunks with the same content exist. Investigate! (fixed later)

In [10]:
print(document_store.count_documents())

135


> Embeddings were added twice, once in notebook 5_embed_and_store.ipynb and later in 6_indexing_pipeline.ipynb

Generate

In [11]:
context = "\n".join([doc.content for doc in retrieved_documents])

prompt = f"Question: {question}\nContext: {context}\nAnswer:"

generation_result = generator.run(prompt=prompt)

generated_replies = generation_result["replies"]
for reply in generated_replies:
    print("Generated Reply:", reply)

Generated Reply: The extinction of dinosaurs was primarily caused by the Cretaceous–Paleogene (K-Pg) extinction event, which occurred around 66 million years ago. The leading mechanisms responsible for this mass extinction are believed to be an extraterrestrial impact event (specifically a large asteroid impact in the Yucatán Peninsula) and extensive volcanic activity (flood basalt volcanism) in India. These catastrophic events drastically affected the Earth's climate and ecosystems, leading to the abrupt disappearance of non-avian dinosaurs and many other species. This extinction event resulted in the loss of approximately 76% of species on Earth, including various groups of animals such as pterosaurs and marine reptiles. Contributing factors to the dinosaurs' vulnerability included their relatively large size and low diversity, especially of smaller-bodied species, which may have limited their adaptability to the changing environment.


In [12]:
from haystack import Pipeline

query_pipeline = Pipeline()

query_pipeline.add_component("text_embedder", text_embedder)
query_pipeline.add_component("retriever", retriever)
query_pipeline.add_component("prompt_builder", prompt_builder)
query_pipeline.add_component("llm", generator)

query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
query_pipeline.connect("retriever", "prompt_builder.documents")
query_pipeline.connect("prompt_builder", "llm")

<haystack.core.pipeline.pipeline.Pipeline object at 0x794f14df5a30>
🚅 Components
  - text_embedder: OpenAITextEmbedder
  - retriever: WeaviateEmbeddingRetriever
  - prompt_builder: PromptBuilder
  - llm: OpenAIGenerator
🛤️ Connections
  - text_embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> prompt_builder.documents (List[Document])
  - prompt_builder.prompt -> llm.prompt (str)

In [13]:
question = "Why are dinosaurs so fascinating?"

response = query_pipeline.run({"text_embedder": {"text": question}, "prompt_builder": {"question": question}})

print(response["llm"]["replies"][0])


  timestamp = datetime.utcnow().replace(tzinfo=tzutc())
  body["sentAt"] = datetime.utcnow().replace(tzinfo=tzutc()).isoformat()


Dinosaurs are fascinating due to their fantastic appearance, often enormous size, and their ability to capture the popular imagination. Their presence in best-selling books and films, as well as persistent public enthusiasm, contribute to their cultural importance and enduring appeal.


### Experiments

**Exp-1**

The word 'Gymnosperm' occurs only once in the 'Dinosaur' wiki page - it occurs in hte 3rd or 4th sentence of a passage chunk of about 10 sentences. 

Test if the embeddings are able to capture the word in the semantic meaning.

In [14]:
question = "How are gymnosperm plants related to dinosaurs?"

response = query_pipeline.run({"text_embedder": {"text": question}, "prompt_builder": {"question": question}})

print(response["llm"]["replies"][0])

Gymnosperm plants, particularly conifers, served as a potential food source for dinosaurs during the Late Triassic, which played a role in dinosaur evolution.


Not bad. Satisfactory answer!

**Exp-2**



In [15]:
question = "What is Proceratosauridae?"

response = query_pipeline.run({"text_embedder": {"text": question}, "prompt_builder": {"question": question}})

print(response["llm"]["replies"][0])

Proceratosauridae are tyrannosauroids with head crests.


Amazing! The word 'Proceratosauridae' occurs deep inside a very long chunk - was not expecting a pure vector embedding based retrieval to do the job. Suspected that a hybrid strategy (vector + BM25) may be required. 

**Exp-3**

In [16]:
question = "What is Averostra?"

response = query_pipeline.run({"text_embedder": {"text": question}, "prompt_builder": {"question": question}})

print(response["llm"]["replies"][0])

Averostra ("bird snouts") is a group within the theropods characterized by having bird-like snouts.


Ok the answer makes sense. But how did the LLM know that Averostra refers to a clade of theropod dinosaurs? The only reference to Averostra is the following - "Averostra ("bird snouts")". 

Modified the prompt to emphasize LLM not to use external information and to reply with "I don't know" if answer not in context.

Run manually to check which chunks are fetched.

In [17]:
from haystack import Document

question = "What is Averostra?"

document = Document(content=question)
result = embedder.run(documents=[document])
embedded_document = result["documents"][0]
embedding = embedded_document.embedding

Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.67it/s]


In [18]:
retrieval_result = retriever.run(query_embedding=embedding)
print(retrieval_result)
retrieved_documents = retrieval_result["documents"]
for doc in retrieved_documents:
    print(doc.content)

{'documents': [Document(id=d060a7dc-17f6-43e6-9b3a-4365480a5543, content: '- †Ornithischia ("bird-hipped"; diverse bipedal and quadrupedal herbivores) - †Saphornithischia ("tr...', meta: {'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'split_id': 57.0, 'file_path': 'dinosaur-page.html'}, score: 0.6415687799453735, embedding: vector of size 1536), Document(id=294a6ad7-9781-4bfb-9de4-5d1dc44d9c85, content: '- †Sauropodomorpha (herbivores with small heads, long necks, and long tails) - †Unaysauridae (primit...', meta: {'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'split_id': 59.0, 'file_path': 'dinosaur-page.html'}, score: 0.6410965919494629, embedding: vector of size 1536), Document(id=0617d027-ae05-4321-921c-44b5dd2a18f3, content: '- †Alvarezsauroidea (small hunters with reduced forelimbs) - †Alvarezsauridae (insectivores with onl...', meta: {'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c

The chunk which has the answer is ranked 2 out of 3. Also the score of the top ranked chunk ~ 0.64, so not a very confident retrieval(?). What if we could give only one chunk as context?

In [19]:
retrieval_result = retriever.run(query_embedding=embedding, top_k=1)
print(retrieval_result)
retrieved_documents = retrieval_result["documents"]
for doc in retrieved_documents:
    print(doc.content)

{'documents': [Document(id=d060a7dc-17f6-43e6-9b3a-4365480a5543, content: '- †Ornithischia ("bird-hipped"; diverse bipedal and quadrupedal herbivores) - †Saphornithischia ("tr...', meta: {'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'split_id': 57.0, 'file_path': 'dinosaur-page.html'}, score: 0.6415687799453735, embedding: vector of size 1536)]}
- †Ornithischia ("bird-hipped"; diverse bipedal and quadrupedal herbivores) - †Saphornithischia ("true" ornithischians) - †Heterodontosauridae (small herbivores/omnivores with prominent canine-like teeth) - †Genasauria ("cheeked lizards") - †Thyreophora (armored dinosaurs; bipeds and quadrupeds) - †Eurypoda (heavy, quadrupedal thyreophorans) - †Stegosauria (spikes and plates as primary armor) - †Huayangosauridae (small stegosaurs with flank osteoderms and tail clubs) - †Stegosauridae (large stegosaurs) - †Ankylosauria (scutes as primary armor) - †Parankylosauria (small, southern ankylosaurs with macuahuitl-li

The answer does not exist in the top ranked chunk. What does the LLM model say now?

In [20]:

context = "\n".join([doc.content for doc in retrieved_documents])

prompt = (
    f"Answer the question only using the following context. "
    f"Do not use any external information. If the answer is not present in the context, "
    f"please answer with \"I don't know\".\n\n"
    f"Question: {question}\n"
    f"Context:\n{context}\n"
    f"Answer:"
)
generation_result = generator.run(prompt=prompt)

generated_replies = generation_result["replies"]
for reply in generated_replies:
    print("Generated Reply:", reply)

Generated Reply: I don't know.


As expected, the LLM responds with "I don't know" as the answer does not exist in the context of the only chunk that was retrieved.

This is a good opportunity to test BM25 based retrieval.

In [21]:
from haystack_integrations.components.retrievers.weaviate.bm25_retriever import WeaviateBM25Retriever


bm25retriever = WeaviateBM25Retriever(document_store=document_store)

In [22]:
bm25_retrieval_result = bm25retriever.run(query=question, top_k=1)
print(bm25_retrieval_result)
bm25_retrieved_documents = bm25_retrieval_result["documents"]
for doc in bm25_retrieved_documents:
    print(doc.content)

{'documents': [Document(id=a0ec45c3-c0cd-4604-a176-594456f11b36, content: 'Scholarly descriptions of what would now be recognized as dinosaur bones first appeared in the late ...', meta: {'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'split_id': 23.0, 'file_path': 'dinosaur-page.html'}, score: 1.9453471899032593, embedding: vector of size 1536)]}
Scholarly descriptions of what would now be recognized as dinosaur bones first appeared in the late 17th century in England. Part of a bone, now known to have been the femur of a Megalosaurus , was recovered from a limestone quarry at Cornwell near Chipping Norton, Oxfordshire, in 1676. The fragment was sent to Robert Plot, Professor of Chemistry at the University of Oxford and first curator of the Ashmolean Museum, who published a description in his The Natural History of Oxford-shire (1677). He correctly identified the bone as the lower extremity of the femur of a large animal, and recognized that it was to

In [23]:
question

'What is Averostra?'

Unsatisfactory! Expected BM25 based embedding to return the proper chunk which had the term 'Averostra'.

In [24]:
bm25_retrieval_result = bm25retriever.run(query="What is MRCA", top_k=1)
print(bm25_retrieval_result)
bm25_retrieved_documents = bm25_retrieval_result["documents"]
for doc in bm25_retrieved_documents:
    print(doc.content)

{'documents': [Document(id=7ea8178b-0ad5-4b37-80bf-aa925decf8bb, content: 'Under phylogenetic nomenclature, dinosaurs are usually defined as the group consisting of the most r...', meta: {'split_id': 5.0, 'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'file_path': 'dinosaur-page.html'}, score: 2.5518620014190674, embedding: vector of size 1536)]}
Under phylogenetic nomenclature, dinosaurs are usually defined as the group consisting of the most recent common ancestor (MRCA) of Triceratops and modern birds (Neornithes), and all its descendants. It has also been suggested that Dinosauria be defined with respect to the MRCA of Megalosaurus and Iguanodon , because these were two of the three genera cited by Richard Owen when he recognized the Dinosauria. Both definitions cover the same known genera: Dinosauria = Ornithischia + Saurischia. This includes major groups such as ankylosaurians (armored herbivorous quadrupeds), stegosaurians (plated herbivorous qu

But BM25 is able to find the term 'MRCA' as expected. So mixed results!

Next, let's check if 'MRCA' is found in single chunk vector embedding retrieval.

In [25]:
from haystack import Document

question = "What is MRCA"

document = Document(content=question)
result = embedder.run(documents=[document])
embedded_document = result["documents"][0]
embedding = embedded_document.embedding

retrieval_result = retriever.run(query_embedding=embedding, top_k=1)
print(retrieval_result)
retrieved_documents = retrieval_result["documents"]
for doc in retrieved_documents:
    print(doc.content)

Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.57it/s]

{'documents': [Document(id=7ea8178b-0ad5-4b37-80bf-aa925decf8bb, content: 'Under phylogenetic nomenclature, dinosaurs are usually defined as the group consisting of the most r...', meta: {'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'split_id': 5.0, 'file_path': 'dinosaur-page.html'}, score: 0.6315898895263672, embedding: vector of size 1536)]}
Under phylogenetic nomenclature, dinosaurs are usually defined as the group consisting of the most recent common ancestor (MRCA) of Triceratops and modern birds (Neornithes), and all its descendants. It has also been suggested that Dinosauria be defined with respect to the MRCA of Megalosaurus and Iguanodon , because these were two of the three genera cited by Richard Owen when he recognized the Dinosauria. Both definitions cover the same known genera: Dinosauria = Ornithischia + Saurischia. This includes major groups such as ankylosaurians (armored herbivorous quadrupeds), stegosaurians (plated herbivorous qu




Weaviate vector embedding is also able to find the term MRCA in the right chunk.


Results are mixed
- Both vector embedding retrieval and BM25 were **unable** to find the correct chunk for 'What is Averostra?'
- Both vector embedding retrieval and BM25 were **able** to find the correct chunk for 'What is MRCA'

**Exp-4** - Run more tests to check if BM25 retriever would add value

Q - What is Mamenchisaurus?

The term 'Mamenchisaurus' appears only once in the text in a chunk which is reasonably long.

BM25

In [28]:
bm25_retrieval_result = bm25retriever.run(query="What is Mamenchisaurus?", top_k=1)
print(bm25_retrieval_result)
bm25_retrieved_documents = bm25_retrieval_result["documents"]
for doc in bm25_retrieved_documents:
    print(doc.content)

{'documents': [Document(id=a0ec45c3-c0cd-4604-a176-594456f11b36, content: 'Scholarly descriptions of what would now be recognized as dinosaur bones first appeared in the late ...', meta: {'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'split_id': 23.0, 'file_path': 'dinosaur-page.html'}, score: 1.9453471899032593, embedding: vector of size 1536)]}
Scholarly descriptions of what would now be recognized as dinosaur bones first appeared in the late 17th century in England. Part of a bone, now known to have been the femur of a Megalosaurus , was recovered from a limestone quarry at Cornwell near Chipping Norton, Oxfordshire, in 1676. The fragment was sent to Robert Plot, Professor of Chemistry at the University of Oxford and first curator of the Ashmolean Museum, who published a description in his The Natural History of Oxford-shire (1677). He correctly identified the bone as the lower extremity of the femur of a large animal, and recognized that it was to

Vector embedding

In [29]:
from haystack import Document

question = "What is Mamenchisaurus?"

document = Document(content=question)
result = embedder.run(documents=[document])
embedded_document = result["documents"][0]
embedding = embedded_document.embedding

retrieval_result = retriever.run(query_embedding=embedding, top_k=1)
print(retrieval_result)
retrieved_documents = retrieval_result["documents"]
for doc in retrieved_documents:
    print(doc.content)

Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.66it/s]

{'documents': [Document(id=c23833b2-e5bc-42c8-8a9d-7a5ffdf9b33a, content: 'Less well-preserved remains of the sauropodomorphs Jaklapallisaurus and Nambalia , along with the ea...', meta: {'split_id': 41.0, 'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'file_path': 'dinosaur-page.html'}, score: 0.7286020517349243, embedding: vector of size 1536)]}
Less well-preserved remains of the sauropodomorphs Jaklapallisaurus and Nambalia , along with the early saurischian Alwalkeria , are known from the Upper Maleri and Lower Maleri Formations of India. The Carnian-aged Chañares Formation of Argentina preserves primitive, dinosaur-like ornithodirans such as Lagosuchus and Lagerpeton in Argentina, making it another important site for understanding dinosaur evolution. These ornithodirans support the model of early dinosaurs as small, bipedal predators. Dinosaurs may have appeared as early as the Anisian epoch of the Triassic, approximately 243 million years ago, wh




Both vector embedding retriever and BM25 from Weaviate fail to fetch the right chunk.

The hypothesis for these experiments was that BM25 would perform better than vector embedding when the question is short, focussed on just a single term and is not semantically heavy. The expectation was that BM25 would have been able to search the unique terms like 'Mamenchisaurus' or 'Averostra' using BM25 as these terms are uncommon and inverted text based index should perform well here.

Two further questions:
1. Are there more nuances involved in successfully retrieving BM25 based chunks from Weaviate.  
    - Need to delve into Weaviate documentation - concepts, config options etc.
2. Will pure inverted text based solutions like Elasticsearch perform better than Weaviate BM25 (which is primarily a vector db)