Test connection to local ChromaDB instance

In [1]:
from dotenv import load_dotenv
import os
import chromadb
import chromadb.utils.embedding_functions as embedding_functions

# Load settings from a local .env file (if one is present) into the process
# environment before anything below reads them.
load_dotenv()

# Open the on-disk Chroma store. The path is relative, so it resolves against
# the kernel's current working directory.
client = chromadb.PersistentClient(
    path="../components/vector_db",
)

In [2]:
# Liveness check: getting a value back (rather than an exception) shows the client is usable.
# NOTE(review): the integer displayed below looks like an epoch timestamp in nanoseconds — confirm.
client.heartbeat()

1766582549613596000

Embedder

In [3]:
# Embedding function backed by Voyage AI. The API key is looked up in the
# environment instead of being hard-coded in the notebook.
voyageai_ef = embedding_functions.VoyageAIEmbeddingFunction(
    model_name="voyage-3-large",
    api_key=os.environ.get("VOYAGE_API_KEY"),  # None when the variable is unset
)

  from .autonotebook import tqdm as notebook_tqdm


Collection

In [6]:
collection_name = "collection"

# Create the collection on the first run; on later runs fall back to loading the
# existing one so the cell can be re-executed safely.
try:
    collection = client.create_collection(
        name=collection_name,
        embedding_function=voyageai_ef,
        configuration={
            "hnsw": {
                "space": "cosine",        # rank neighbours by cosine distance
                "ef_construction": 100
            }
        }
    )
    print(f"Collection {collection_name} created")
except Exception:
    # NOTE(review): the catch stays broad because the exact exception chromadb
    # raises for a duplicate name is not visible here — narrow it once confirmed.
    # If creation failed for another reason and the collection is missing,
    # get_collection is expected to raise, so the failure still surfaces.
    #
    # Pass the same embedder used at creation time. Without it, the handle
    # returned here may embed queries and new documents with a different
    # function than the one that produced the stored vectors.
    collection = client.get_collection(
        name=collection_name,
        embedding_function=voyageai_ef
    )
    print(f"Collection {collection_name} already exists")

Collection collection already exists


Test adding and querying

In [13]:
# Seed the collection with eight short demo documents, each with its own
# metadata record (source / user / date), for the query cell to search over.
collection.add(
    ids=[
        "id1", "id2", "id3", "id4",
        "id5", "id6", "id7", "id8",
    ],
    documents=[
        "This is a document about pineapple",
        "This is a document about oranges",
        "This is a document about dogs and cats",
        "Capital of France is Paris",
        "Capital of Germany is Berlin",
        "Capital of Italy is Rome",
        "Color of the sky is blue",
        "Color of the grass is green",
    ],
    metadatas=[
        {"source": "pineapple", "user": "user1", "date": "2021-01-01"},
        {"source": "oranges", "user": "user2", "date": "2021-01-02"},
        {"source": "dogs and cats", "user": "user3", "date": "2021-01-03"},
        # The three capital-city documents carry identical metadata.
        *(
            {"source": "countries", "user": "user1", "date": "2021-01-04"}
            for _ in range(3)
        ),
        # So do the two colour documents.
        *(
            {"source": "colors", "user": "user2", "date": "2021-01-05"}
            for _ in range(2)
        ),
    ],
)

In [16]:
# Similarity search with a phrase that occurs verbatim in none of the stored
# documents; the five closest matches are returned.
collection.query(
    n_results=5,
    query_texts=["Eiffel Tower"],
    # "embeddings" can be appended to this list as well.
    include=["documents", "metadatas", "distances"],
)

{'ids': [['id4', 'id6', 'id5', 'id7', 'id8']],
 'embeddings': None,
 'documents': [['Capital of France is Paris',
   'Capital of Italy is Rome',
   'Capital of Germany is Berlin',
   'Color of the sky is blue',
   'Color of the grass is green']],
 'uris': None,
 'included': ['documents', 'metadatas', 'distances'],
 'data': None,
 'metadatas': [[{'date': '2021-01-04', 'user': 'user1', 'source': 'countries'},
   {'source': 'countries', 'date': '2021-01-04', 'user': 'user1'},
   {'source': 'countries', 'user': 'user1', 'date': '2021-01-04'},
   {'source': 'colors', 'user': 'user2', 'date': '2021-01-05'},
   {'user': 'user2', 'date': '2021-01-05', 'source': 'colors'}]],
 'distances': [[0.36765164136886597,
   0.46660417318344116,
   0.501257061958313,
   0.5323166847229004,
   0.598609447479248]]}