In [1]:
import chromadb
# Default client: the notice it prints reports an embedded DuckDB backend
# without persistence, so everything stored here is transient.
client = chromadb.Client()

Using embedded DuckDB without persistence: data will be transient


In [2]:
import numpy as np
from tqdm import tqdm

In [3]:
from chromadb.api.types import Documents, EmbeddingFunction, Embeddings

class MyEmbeddingFunction(EmbeddingFunction):
    """Placeholder embedder that returns random vectors instead of real embeddings.

    The output does not depend on the text content, only on how many texts
    are passed in: one 512-element row of random floats per text.
    """

    def __call__(self, texts: Documents) -> Embeddings:
        """Return one random 512-dimensional vector (as nested lists) per text."""
        n_texts = len(texts)
        random_matrix = np.random.random(size=(n_texts, 512))
        return random_matrix.tolist()


In [4]:

# Register the collection with the random placeholder embedder as its
# embedding function.
collection = client.create_collection(
    name="clip_image_product",
    embedding_function=MyEmbeddingFunction(),
)

In [5]:
# Sanity check: the collection was just created, so it should report 0 items.
collection.count()

0

In [6]:
# Bulk-load 10000 batches of 100 synthetic records each (1,000,000 total).
# Embeddings are supplied explicitly, so each batch carries its own random
# 100 x 512 matrix.
palette = ["red", "yellow", "blue"]
for chunk in tqdm(range(10000)):
    first = chunk * 100  # global index of the first record in this batch
    record_numbers = range(first, first + 100)
    vectors = np.random.rand(100, 512)
    collection.add(
        documents=[f"This is a document id{number}" for number in record_numbers],
        # Colour cycles on the position within the batch, not the global index.
        metadatas=[{"color": palette[position % 3]} for position in range(100)],
        ids=[f"id{number}" for number in record_numbers],
        embeddings=vectors.tolist(),
    )

100%|██████████| 10000/10000 [5:39:20<00:00,  2.04s/it]  


In [7]:
# Confirm the ingest: 10000 batches x 100 records should give 1,000,000 items.
collection.count()

1000000

In [9]:
# Nearest-neighbour lookup for a single random 512-d query vector,
# asking for the top 1000 matches.
query_vector = np.random.random((1, 512))
hits = collection.query(
    query_embeddings=query_vector.tolist(),
    n_results=1000,
)

In [10]:
import psutil

def convert_bytes(num):
    """Format a byte count as a human-readable string using 1024-based units.

    Args:
        num: Size in bytes (int or float).

    Returns:
        A string with two decimals and a unit suffix, e.g. ``"1.50 KB"``.
        Values of 1024 TB or more are still expressed in TB (the largest
        unit known here) rather than falling off the end of the unit list
        and returning ``None``.
    """
    units = ['bytes', 'KB', 'MB', 'GB', 'TB']
    # Step through every unit except the last; the last one is the catch-all.
    for x in units[:-1]:
        if num < 1024.0:
            return f"{num:.2f} {x}"
        num /= 1024.0
    # Anything that survived all divisions is reported in the largest unit.
    return f"{num:.2f} {units[-1]}"

# Resident set size (RSS) of this kernel process, in bytes.
rss_bytes = psutil.Process().memory_info().rss

# Report it in human-readable units.
usage_text = convert_bytes(rss_bytes)
print(f"Current memory usage: {usage_text}")

Current memory usage: 7.11 GB
