In [1]:
import os

from haystack import Pipeline
from haystack.components.converters import HTMLToDocument
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.fetchers import LinkContentFetcher
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy

# Indexing pipeline: fetch pages -> convert HTML to Documents -> split into
# chunks -> embed each chunk -> write the chunks into the document store.
document_store = InMemoryDocumentStore()

fetcher = LinkContentFetcher()
converter = HTMLToDocument()
# split_length / split_overlap are expressed in the splitter's default unit.
splitter = DocumentSplitter(split_length=100, split_overlap=5)
# NOTE: a later cell rebinds the name `embedder` to the query-side
# SentenceTransformersTextEmbedder; this instance is only used for indexing.
embedder = SentenceTransformersDocumentEmbedder()
# Overwrite on ID collision so that re-running the indexing cell against the
# same store replaces the existing chunks instead of failing on duplicate IDs.
writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.OVERWRITE)

indexing_pipeline = Pipeline()
indexing_pipeline.add_component("fetcher", fetcher)
indexing_pipeline.add_component("converter", converter)
indexing_pipeline.add_component("splitter", splitter)
indexing_pipeline.add_component("embedder", embedder)
indexing_pipeline.add_component("writer", writer)

# Linear chain; each component feeds the next one.
indexing_pipeline.connect("fetcher", "converter")
indexing_pipeline.connect("converter", "splitter")
indexing_pipeline.connect("splitter", "embedder")
indexing_pipeline.connect("embedder", "writer")

In [2]:
# Run the indexing pipeline on two Haystack documentation pages. The URLs are
# passed to the "fetcher" component; the returned dict is shown as cell output.
indexing_pipeline.run(
    data={
        "fetcher": {
            "urls": [
                "https://docs.haystack.deepset.ai/v2.0/docs/sentencetransformerstextembedder",
                "https://docs.haystack.deepset.ai/v2.0/docs/openaidocumentembedder",
            ]
        }
    }
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'writer': {'documents_written': 5}}

In [3]:
from getpass import getpass

# Prefer a key already exported as OPENAI_API_KEY so the notebook can be run
# unattended (Restart & Run All, nbconvert); if it is unset or empty, fall back
# to the interactive, non-echoing prompt.
api_key = os.environ.get("OPENAI_API_KEY") or getpass("Enter OpenAI Api key: ")

In [23]:
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.generators.openai import OpenAIGenerator

# Query-side components; they are registered and wired into a pipeline in a
# separate cell.

# Turns the query text into an embedding for the retriever.
embedder = SentenceTransformersTextEmbedder()
# Reads from the same store the indexing pipeline wrote to.
retriever = InMemoryEmbeddingRetriever(document_store=document_store, top_k=2)
# NOTE(review): newer Haystack releases may expect a `Secret` object rather
# than a plain string for `api_key` -- confirm against the installed version.
generator = OpenAIGenerator(api_key=api_key)

# Jinja template rendered by the prompt builder: lists each retrieved
# document's content and source URL, then asks the model to answer `query`.
prompt_template = """
According to these documents:

{% for document in documents %}
    {{ document.content }} URL: {{ document.meta["url"] }} \n
{% endfor %}

Answer the given query: {{ query }}
Answer:
"""
prompt_builder = PromptBuilder(prompt_template)

In [24]:
# Assemble the query pipeline: register every component under the name that
# the connections and the run() inputs refer to, then wire outputs to inputs.
pipeline = Pipeline()
for component_name, component in [
    ("embedder", embedder),
    ("retriever", retriever),
    ("prompt_builder", prompt_builder),
    ("generator", generator),
]:
    pipeline.add_component(component_name, component)

# query embedding -> retrieved documents -> rendered prompt -> generator
for sender, receiver in [
    ("embedder.embedding", "retriever.query_embedding"),
    ("retriever.documents", "prompt_builder.documents"),
    ("prompt_builder.prompt", "generator.prompt"),
]:
    pipeline.connect(sender, receiver)

In [25]:
# The same query string is fed to two components: the embedder (for retrieval)
# and the prompt builder (to fill the `query` slot of the template).
query = "How do I use the openai embedder?"
result = pipeline.run(
    data={
        "embedder": {"text": query},
        "prompt_builder": {"query": query},
    }
)
# print() rather than a bare expression so the reply renders as plain text.
print(result["generator"]["replies"][0])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

To use the OpenAIDocumentEmbedder, you can follow these steps:

1. Install the necessary libraries and dependencies.
2. Set your OpenAI API key as an environment variable for authentication.
3. Create a list of Documents that you want to embed.
4. Instantiate the OpenAIDocumentEmbedder component.
5. Use the `embed` method of the component to compute the embeddings of the Documents.
6. The obtained embeddings will be stored in the `embedding` field of each Document.
7. You can also add metadata to the Documents to improve retrieval.
8. Optionally, you can specify a different OpenAI embedding model by referring to the OpenAI documentation for compatible models.
9. When embedding a single string instead of a list of Documents, you should use the OpenAITextEmbedder.

For more detailed information and code examples, you can refer to the official documentation of OpenAIDocumentEmbedder at the following URL: [OpenAIDocumentEmbedder Documentation](https://docs.haystack.deepset.ai/v2.0/docs/ope