In [10]:
from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder
from haystack_integrations.document_stores.weaviate.document_store import WeaviateDocumentStore
from haystack_integrations.components.retrievers.weaviate.embedding_retriever import WeaviateEmbeddingRetriever
from haystack.components.builders import PromptBuilder
from haystack.components.generators import OpenAIGenerator

embedder = OpenAIDocumentEmbedder(model="text-embedding-3-small")
text_embedder = OpenAITextEmbedder(model="text-embedding-3-small")
document_store = WeaviateDocumentStore(url="http://localhost:8088")
retriever = WeaviateEmbeddingRetriever(document_store=document_store, top_k=3)

template = """
Answer the question only using the following context. Do not use any external information.

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{question}}
Answer:
"""
prompt_builder = PromptBuilder(template=template)

generator = OpenAIGenerator(model="gpt-4o-mini")


## Manually embed query and fetch context

Embed

In [6]:
from haystack import Document

question = "Which is the largest known dinosaur?"
document = Document(content=question)

result = embedder.run(documents=[document])

embedded_document = result["documents"][0]
embedding = embedded_document.embedding

print("Embedding:", embedding)


Calculating embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.20s/it]

Embedding: [0.04768488556146622, 0.006433064583688974, 0.022633839398622513, 0.006872556637972593, -0.06908814609050751, 0.050146039575338364, -0.015678878873586655, -0.017777452245354652, -0.0470256470143795, 0.004323503002524376, 0.02931411936879158, -0.015547030605375767, -0.026127802208065987, -0.044960033148527145, -0.010223682969808578, 0.0191508661955595, 0.014250528998672962, 0.000751943385694176, -0.04003772512078285, -0.005133816506713629, 0.0048289187252521515, -0.030368899926543236, 0.011866284534335136, -0.013448456302285194, 0.07967990636825562, -0.01917283982038498, -0.027622073888778687, -0.01485483068972826, 0.006246280390769243, -0.006735215429216623, 0.0011152110528200865, -0.03153355419635773, -0.011976158246397972, 0.030544696375727654, -0.06715438514947891, -0.019436534494161606, -0.006790151819586754, 0.036565735936164856, -0.022699763998389244, -0.02076599933207035, 0.06420978903770447, -0.033445343375205994, -0.020963769406080246, 0.026127802208065987, 0.012097




Retrieve

In [8]:
retrieval_result = retriever.run(query_embedding=embedding)

print(retrieval_result)

retrieved_documents = retrieval_result["documents"]

for doc in retrieved_documents:
    print(doc.content)

{'documents': [Document(id=7f03ef58-8b84-4dd0-aa6c-76f6ada9cb34, content: 'There were larger dinosaurs, but knowledge of them is based entirely on a small number of fragmentar...', meta: {'split_id': 71.0, 'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'file_path': 'dinosaur-page.html'}, score: 0.8639757037162781, embedding: vector of size 1536), Document(id=3f987a96-9b33-4af7-9698-4d834cdf525b, content: 'There were larger dinosaurs, but knowledge of them is based entirely on a small number of fragmentar...', meta: {'split_id': 72.0, 'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'file_path': 'dinosaur-page.html'}, score: 0.8639573454856873, embedding: vector of size 1536), Document(id=35135aef-08b6-4e5f-ac04-6fbdb480d0db, content: 'The largest carnivorous dinosaur was Spinosaurus , reaching a length of 12.6 to 18 meters (41 to 59 ...', meta: {'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c

> **Warning:** There is an issue with embeddings. Two chunks with the same content exist. Investigate!

Generate

In [9]:
context = "\n".join([doc.content for doc in retrieved_documents])

prompt = f"Question: {question}\nContext: {context}\nAnswer:"

generation_result = generator.run(prompt=prompt)

generated_replies = generation_result["replies"]
for reply in generated_replies:
    print("Generated Reply:", reply)

Generated Reply: The largest known dinosaur is likely Bruhathkayosaurus, which recent estimates suggest could reach lengths of up to 44 meters (144 feet) and weigh between 110,000 to 170,000 kilograms (240,000 to 370,000 pounds). If these estimates hold true, Bruhathkayosaurus would rival the blue whale and Perucetus colossus as one of the largest animals to have ever existed. Other contenders for the title of largest dinosaur include Argentinosaurus, Amphicoelias (now referred to as Maraapunisaurus), and Patagotitan.


In [11]:
from haystack import Pipeline

query_pipeline = Pipeline()

query_pipeline.add_component("text_embedder", text_embedder)
query_pipeline.add_component("retriever", retriever)
query_pipeline.add_component("prompt_builder", prompt_builder)
query_pipeline.add_component("llm", generator)

query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
query_pipeline.connect("retriever", "prompt_builder.documents")
query_pipeline.connect("prompt_builder", "llm")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7506c2b67c50>
🚅 Components
  - text_embedder: OpenAITextEmbedder
  - retriever: WeaviateEmbeddingRetriever
  - prompt_builder: PromptBuilder
  - llm: OpenAIGenerator
🛤️ Connections
  - text_embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> prompt_builder.documents (List[Document])
  - prompt_builder.prompt -> llm.prompt (str)

In [12]:
question = "Why are dinosaurs so fascinating?"

response = query_pipeline.run({"text_embedder": {"text": question}, "prompt_builder": {"question": question}})

print(response["llm"]["replies"][0])


  timestamp = datetime.utcnow().replace(tzinfo=tzutc())


Dinosaurs are fascinating due to their fantastic appearance and often enormous size, which capture the popular imagination. Their regular appearances in best-selling books and films, along with the persistent public enthusiasm that generates significant funding for dinosaur science, contribute to their enduring cultural importance. The term "dinosaur" itself has entered common vernacular to describe anything impractically large, obsolete, or bound for extinction, further reflecting their impact on human culture.
