In [1]:
from haystack.pipeline import Pipeline
from haystack.components.writers import DocumentWriter
from haystack.components.converters import MarkdownToDocument, PyPDFToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack.components.routers import FileTypeRouter
from haystack.components.joiners import DocumentJoiner
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.document_stores.in_memory import InMemoryDocumentStore

# --- Storage and routing ------------------------------------------------
# Every document produced by the indexing pipeline is written to this store.
document_store = InMemoryDocumentStore()
# The router is configured with the plain-text and PDF MIME types only.
file_type_router = FileTypeRouter(mime_types=["text/plain", "application/pdf"])

# --- One converter per input format -------------------------------------
text_file_converter = TextFileToDocument()
markdown_converter = MarkdownToDocument()
pdf_converter = PyPDFToDocument()

# --- Post-conversion processing (listed in the order they are wired) ----
document_joiner = DocumentJoiner(join_mode="concatenate")
document_cleaner = DocumentCleaner(
    remove_empty_lines=True,
    remove_extra_whitespaces=True,
    remove_repeated_substrings=False,
)
# Word-based splitting: split_length=200 with split_overlap=50.
document_splitter = DocumentSplitter(split_by="word", split_length=200, split_overlap=50)

# --- Embedding and persistence ------------------------------------------
document_embedder = SentenceTransformersDocumentEmbedder(
    model="sentence-transformers/all-MiniLM-L6-v2"
)
document_writer = DocumentWriter(document_store)

In [2]:
# Build the indexing pipeline and register each component under the name
# that the connection strings use to refer to it.
preprocessing_pipeline = Pipeline()
for component_name, component_instance in [
    ("file_type_router", file_type_router),
    ("text_file_converter", text_file_converter),
    ("markdown_converter", markdown_converter),
    ("pdf_converter", pdf_converter),
    ("document_splitter", document_splitter),
    ("document_cleaner", document_cleaner),
    ("document_joiner", document_joiner),
    ("document_embedder", document_embedder),
    ("document_writer", document_writer),
]:
    preprocessing_pipeline.add_component(component_name, component_instance)

In [3]:
# Edges of the indexing graph as (sender output, receiver input) pairs,
# in the order they are connected.
pipeline_connections = [
    # Router outputs feed the matching converter.
    ("file_type_router.text/plain", "text_file_converter.sources"),
    ("file_type_router.application/pdf", "pdf_converter.sources"),
    # NOTE(review): the router lists no markdown MIME type, so the markdown
    # converter is fed from the "unclassified" output. Presumably any other
    # unrecognised file would be routed here too -- confirm this is intended.
    ("file_type_router.unclassified", "markdown_converter.sources"),
    # All converters feed the same joiner input.
    ("text_file_converter.documents", "document_joiner.documents"),
    ("pdf_converter.documents", "document_joiner.documents"),
    ("markdown_converter.documents", "document_joiner.documents"),
    # Linear tail: join -> clean -> split -> embed -> write.
    ("document_joiner.documents", "document_cleaner.documents"),
    ("document_cleaner.documents", "document_splitter.documents"),
    ("document_splitter.documents", "document_embedder.documents"),
    ("document_embedder.documents", "document_writer.documents"),
]
for sender, receiver in pipeline_connections:
    preprocessing_pipeline.connect(sender, receiver)

In [4]:
# Draw the indexing pipeline to "preprocessing_pipeline.png". The path is
# relative, so the file is presumably created in the notebook's current
# working directory.
preprocessing_pipeline.draw("preprocessing_pipeline.png")

In [6]:
# Run the indexing pipeline on the three report files (one .txt, one .pdf,
# one .md). The paths are relative to the current working directory.
source_files = [
    "winter_report_one.txt",
    "winter_report_two.pdf",
    "winter_report_three.md",
]
preprocessing_pipeline.run({"file_type_router": {"sources": source_files}})

Converting markdown files to Documents: 100%|██████████| 1/1 [00:00<00:00, 1048.31it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'document_writer': {'documents_written': 9}}

In [7]:
import os
from getpass import getpass

# Use a key already exported as OPENAI_API_KEY when one is available, so the
# notebook can be re-run top to bottom (or executed headlessly) without
# stopping at an interactive prompt. If the variable is unset or empty, fall
# back to getpass, which reads the key without echoing it and keeps it out of
# the notebook source.
api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API Key: ")

In [9]:
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.builders import PromptBuilder
from haystack.components.generators.openai import OpenAIGenerator

# Jinja-style prompt: the retrieved documents are looped over to form the
# context, followed by the user's question.
template = """
You are a wise elf living in the forest with other elves.
You will be provided with some context from Elves' yearly winter reports.
Answer the questions from other elves based on the given context as if you are an elf as well.

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{ question }}
Answer:
"""

# Query-side components. The text embedder is given the same model name as
# the document embedder used at indexing time, and the retriever reads from
# the document store populated by the indexing pipeline.
text_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
retriever = InMemoryEmbeddingRetriever(document_store=document_store)
prompt_builder = PromptBuilder(template=template)
llm = OpenAIGenerator(api_key=api_key)

# Assemble the question-answering pipeline: embed -> retrieve -> prompt -> LLM.
pipe = Pipeline()
pipe.add_component("embedder", text_embedder)
pipe.add_component("retriever", retriever)
pipe.add_component("prompt_builder", prompt_builder)
pipe.add_component("llm", llm)

pipe.connect("embedder.embedding", "retriever.query_embedding")
pipe.connect("retriever", "prompt_builder.documents")
pipe.connect("prompt_builder", "llm")

In [10]:
# Alternative questions to try (uncomment one to replace the query below):
# query = "Give me one example of nice moment that we had in past winters"
# query = "Which foods should we collect?"
query = "What should we do against water scarcity?"

# The same question string is passed twice: to the embedder (to build the
# retrieval query embedding) and to the prompt builder (to fill {{ question }}).
rag_inputs = {
    "embedder": {"text": query},
    "prompt_builder": {"question": query},
}
pipe.run(rag_inputs)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'llm': {'replies': ['In response to water scarcity, we Elves took proactive measures to ensure a stable water supply. We gathered snow and stored it for future use as drinking water. This helped us preserve our water resources during times of uncertainty. Therefore, I suggest that we continue collecting and storing snow to ensure we have enough water throughout the year. Additionally, we can explore other methods of conserving water, such as using it efficiently and being mindful of our usage.'],
  'meta': [{'model': 'gpt-3.5-turbo-0613',
    'index': 0,
    'finish_reason': 'stop',
    'usage': {'completion_tokens': 88,
     'prompt_tokens': 1958,
     'total_tokens': 2046}}]}}