<h2 align='center'>Codebasics Gen AI Course: Getting Started with Chromadb</h2>

In [20]:
# pip install chromadb==0.5.3
# Pin this exact version: the default (latest) version at the time, 0.5.23, crashed.
# See https://github.com/chroma-core/chroma/issues/2513

import chromadb

### Ephemeral (In Memory) Client

In [2]:
# Client used for the in-memory ("ephemeral") part of this walkthrough.
client = chromadb.Client()

In [21]:
# If the "news" collection already exists in memory (e.g. from an earlier run),
# you can delete it first with delete_collection:
# client.delete_collection("news")

In [6]:
# get_or_create_collection keeps this cell re-runnable: create_collection raises
# when a collection named "news" already exists in this client, whereas this call
# returns the existing collection in that case and creates it otherwise.
collection = client.get_or_create_collection(name="news")

# The collection handle is returned directly by the call above; it can also be
# looked up by name later on:
# collection = client.get_collection("news")

<chromadb.api.models.Collection.Collection at 0x21fe2d9e5f0>

In [8]:
# Store two sentences that share the word "Apple" but are about different topics
# (the company vs. the fruit). Each document is paired with an explicit id.
collection.add(
    ids=["id1", "id2"],
    documents=[
        "Apple reported its quarterly earnings today.",
        "Apple has a lot of vitamin A",
    ],
)

In [13]:
# Fetch a sample of stored records and show the length of the first stored
# embedding vector (384 in the recorded output below).
objects = collection.peek()
len(objects['embeddings'][0])

384

In [9]:
# Query with a technology-flavoured sentence and ask for the two closest documents.
# In the recorded run the earnings sentence (id1) has the smaller distance.
results = collection.query(n_results=2, query_texts=["New Iphone will launch in september"])
results

{'ids': [['id1', 'id2']],
 'distances': [[1.3853877782821655, 1.7049835920333862]],
 'metadatas': [[None, None]],
 'embeddings': None,
 'documents': [['Apple reported its quarterly earnings today.',
   'Apple has a lot of vitamin A']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

In [10]:
# Query with a fruit-flavoured sentence and ask for the two closest documents.
# In the recorded run the vitamin sentence (id2) has the smaller distance.
results = collection.query(n_results=2, query_texts=["My niece loves oranges"])
results

{'ids': [['id2', 'id1']],
 'distances': [[1.4487004280090332, 1.8779758214950562]],
 'metadatas': [[None, None]],
 'embeddings': None,
 'documents': [['Apple has a lot of vitamin A',
   'Apple reported its quarterly earnings today.']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

### Persistent Client

In [14]:
# Second client for the persistent part of the walkthrough; it is pointed at the
# local directory ./news_vector_db.
clientp = chromadb.PersistentClient(path="./news_vector_db")

In [15]:
from chromadb.utils import embedding_functions

# Embed documents and queries with a specific SentenceTransformer model rather
# than the collection's default embedding function.
ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name='all-distilroberta-v1'
)

# The persistent store outlives the kernel, so a plain create_collection would fail
# on every run after the first because "news_v2" already exists on disk.
# get_or_create_collection returns the existing collection in that case and creates
# it otherwise. The embedding function is passed on every call so the returned
# handle always uses `ef`.
collection = clientp.get_or_create_collection(
    name="news_v2",
    embedding_function=ef
)

In [16]:
# Add the same two "Apple" sentences to the persistent collection so the results
# can be compared with the in-memory collection above.
collection.add(
    ids=["id1", "id2"],
    documents=[
        "Apple reported its quarterly earnings today.",
        "Apple has a lot of vitamin A",
    ],
)

In [17]:
# Query the persistent collection for the two documents closest to an iPhone
# launch headline; the distances come from the embedding function set on it.
results = collection.query(n_results=2, query_texts=["New iPhone will launch this September"])
results

{'ids': [['id1', 'id2']],
 'distances': [[1.3624796672368373, 1.6184432245866847]],
 'metadatas': [[None, None]],
 'embeddings': None,
 'documents': [['Apple reported its quarterly earnings today.',
   'Apple has a lot of vitamin A']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

In [18]:
# Simple liveness check on the persistent client; the recorded output is a large
# integer that looks like a nanosecond timestamp.
clientp.heartbeat()

1735577497021421700

In [None]:
# reset() empties and deletes the entire database. It is a destructive, irreversible
# operation, so it is disabled by default: you must explicitly allow it first
# (e.g. via the ALLOW_RESET environment variable / the allow_reset client setting).
# client.reset()