<a href="https://colab.research.google.com/github/jlonge4/gen_ai_utils/blob/main/chroma_haystack_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install haystack-ai chroma-haystack pypdf sentence_transformers

In [None]:
from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.converters import PyPDFToDocument
from haystack.components.converters.txt import TextFileToDocument
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder
from haystack.components.joiners import DocumentJoiner
from haystack.utils import Secret
from pathlib import Path

from haystack.dataclasses import ChatMessage
from haystack.components.generators.chat import OpenAIChatGenerator
import concurrent.futures
import os
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever, ChromaEmbeddingRetriever

In [None]:
# Chroma-backed store for the 'mydocs' collection, persisted under
# /content/vec-index and configured with the cosine distance function.
document_store = ChromaDocumentStore(
    collection_name='mydocs',
    persist_path='/content/vec-index',
    distance_function='cosine',
)

In [None]:
def write_documents(file_path, document_store, split_length=50):
    """Convert a PDF file into embedded chunks and write them to the document store.

    Builds and runs an indexing pipeline with these stages:
    PyPDFToDocument -> DocumentCleaner -> DocumentSplitter (by word)
    -> SentenceTransformersDocumentEmbedder -> DocumentWriter.

    Args:
        file_path: Location of the PDF to index (anything accepted by ``Path``).
        document_store: Document store handed to the ``DocumentWriter``.
        split_length: Number of words per chunk produced by the splitter.
            Defaults to 50, the value that was previously hard-coded.

    Returns:
        The result of ``Pipeline.run`` for the indexing pipeline. It was
        previously discarded; returning it lets callers inspect what the
        writer reported.
    """
    pipeline = Pipeline()
    pipeline.add_component("converter", PyPDFToDocument())
    pipeline.add_component("cleaner", DocumentCleaner())
    pipeline.add_component(
        "splitter", DocumentSplitter(split_by="word", split_length=split_length)
    )

    # The embedder is created with its default model; the query side must
    # embed with the same model for similarity search to be meaningful.
    document_embedder = SentenceTransformersDocumentEmbedder()
    document_embedder.warm_up()
    pipeline.add_component("embedder", document_embedder)
    pipeline.add_component("writer", DocumentWriter(document_store=document_store))

    # Wire the stages in order; only the embedder's "documents" output is
    # forwarded to the writer.
    pipeline.connect("converter", "cleaner")
    pipeline.connect("cleaner", "splitter")
    pipeline.connect("splitter", "embedder")
    pipeline.connect("embedder.documents", "writer")

    return pipeline.run({"converter": {"sources": [Path(file_path)]}})

In [None]:
# Index the sample PDF (expected to be present at this Colab path), then
# display the number of documents the store reports.
path = '/content/Release v1.24.0 · deepset-ai_haystack.pdf'
write_documents(path, document_store)
document_store.count_documents()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

7

In [None]:
def query_pipeline(query, document_store, top_k=4):
    """Retrieve the documents most similar to the query using embedding retrieval.

    The query is embedded with ``SentenceTransformersTextEmbedder`` (default
    model) and the embedding is passed to a ``ChromaEmbeddingRetriever``.
    No keyword/hybrid retrieval or rank fusion is performed.

    Args:
        query: The question or search text to embed.
        document_store: Chroma document store to search.
        top_k: Maximum number of documents to retrieve. Defaults to 4, the
            value that was previously hard-coded.

    Returns:
        The ``documents`` output of the retriever component.
    """
    # Named `pipeline` rather than `query_pipeline` so the local does not
    # shadow this function's own name.
    pipeline = Pipeline()
    pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder())
    pipeline.add_component(
        "retriever",
        ChromaEmbeddingRetriever(document_store=document_store, top_k=top_k),
    )

    pipeline.connect("text_embedder", "retriever")
    result = pipeline.run({"text_embedder": {"text": query}})
    return result["retriever"]["documents"]

In [None]:
# Retrieve the chunks closest to the question from the freshly indexed store.
results = query_pipeline("What is the issue number for AWS embedding models?", document_store)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Show the text of the first retrieved document.
results[0].content

'1/25/24, 10:35 PM Release v1.24.0 · deepset-ai/haystack\nhttps://github.com/deepset-ai/haystack/releases/tag/v1.24.0 1/2Releases v1.24.0\nv1.24.0Latest github-actions released this 11 hours ago · 197 commits to main since this release v1.24.0\nRelease Not es\nHighlights\n🪨 Amazon Bedr ock suppor ts new embedding models ( #6406 )\nYou can now use Titan and Cohere embedding models in your pipelines via the '

In [None]:
# Remove the current store variable; the following cell re-creates it with the
# same persist_path — presumably to show that the index persists on disk.
del document_store

In [None]:
# Re-create the store with the same collection name and persist_path as before.
# (The duplicated `document_store = document_store = ...` target was a typo.)
document_store = ChromaDocumentStore(collection_name='mydocs', persist_path='/content/vec-index', distance_function='cosine')

In [None]:
# Run the same question against the re-created store.
results = query_pipeline("What is the issue number for AWS embedding models?", document_store)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Show the text of the first document retrieved from the re-created store.
results[0].content

'1/25/24, 10:35 PM Release v1.24.0 · deepset-ai/haystack\nhttps://github.com/deepset-ai/haystack/releases/tag/v1.24.0 1/2Releases v1.24.0\nv1.24.0Latest github-actions released this 11 hours ago · 197 commits to main since this release v1.24.0\nRelease Not es\nHighlights\n🪨 Amazon Bedr ock suppor ts new embedding models ( #6406 )\nYou can now use Titan and Cohere embedding models in your pipelines via the '