In [1]:
import os

import chromadb
import numpy as np
from chromadb.utils import embedding_functions
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

In [2]:
# Sample tech-news headlines used as the corpus for every example below.
sentence_list = [
    "Meta drops multimodal Llama 3.2 — here's why it's such a big deal",
    "Chip giant Nvidia acquires OctoAI, a Seattle startup that helps companies run AI models",
    "Google is bringing Gemini to all older Pixel Buds",
    "The first Intel Battlmage GPU benchmarks have leaked",
    "Dell partners with Nvidia to accelerate AI adoption in telecoms",
]
# One ID per headline ("id1" .. "idN"), derived from the list length so the
# IDs can never get out of sync with the sentences when headlines are
# added or removed.
ids = [f"id{position}" for position in range(1, len(sentence_list) + 1)]

**Creating a collection**

In [3]:
# Create the Chroma client used by all collection calls below (default
# client, as opposed to the PersistentClient variant shown in the next cell).
chroma_client = chromadb.Client()

In [None]:
# To persist the data to disk instead, use:
# chroma_client = chromadb.PersistentClient(path="chromadb/")

In [4]:
# Fetch the "udacity" collection, creating it on first use. Unlike
# create_collection, this does not fail when the collection already exists,
# so the cell can be re-run without restarting the kernel.
collection = chroma_client.get_or_create_collection(name="udacity")

In [5]:
# No embedding function was passed when the collection was created, so Chroma
# falls back to its default model (Sentence Transformers all-MiniLM-L6-v2)
# to embed the sentences being added here.
collection.add(ids=ids, documents=sentence_list)

In [6]:
# Inspect which embedding function the collection is using.
# NOTE: `_embedding_function` is an underscore-prefixed (private) attribute,
# so it is not a stable public API and may change between Chroma versions.
collection._embedding_function

<chromadb.utils.embedding_functions.onnx_mini_lm_l6_v2.ONNXMiniLM_L6_V2 at 0x12eba4150>

In [7]:
# Number of records currently stored in the collection (one per added sentence).
collection.count()

5

In [8]:
# Preview the first 2 stored records (ids, embeddings, ...) as a sanity check.
collection.peek(2)

{'ids': ['id1', 'id2'],
 'embeddings': array([[ 6.06747121e-02, -3.51287387e-02,  6.06430210e-02,
         -5.11958823e-02,  1.13577358e-01, -1.88812558e-02,
         -2.68563069e-02,  5.48521988e-02,  3.23705971e-02,
          5.42319790e-02, -4.04220782e-02, -1.90565046e-02,
         -5.98006099e-02,  2.56025922e-02,  8.48477483e-02,
          4.12208885e-02,  3.95124070e-02, -4.00038101e-02,
         -7.66580924e-02,  2.78269649e-02,  5.38381748e-02,
         -1.35235973e-02,  9.65652838e-02, -3.04286182e-02,
          6.62264926e-03,  7.21764490e-02, -9.53955278e-02,
         -2.75929421e-02,  7.86578842e-03, -6.68484047e-02,
         -1.27422102e-02,  1.21331684e-01, -6.66161552e-02,
         -3.28697823e-02, -6.49218261e-02, -1.61951613e-02,
         -3.33233248e-03,  8.04133341e-02, -3.84463109e-02,
          1.44819714e-04,  3.71691165e-03,  4.83801402e-02,
         -8.19774050e-06, -4.51294743e-02, -1.37413908e-02,
         -7.15169311e-02,  1.01871518e-02, -4.22974639e-02,
  

In [12]:
# Query by text for the 2 stored sentences closest to "gadget".
# `include` restricts the response to metadatas, documents and distances
# (embeddings are returned as None). No metadata was added to this
# collection, so the `metadatas` entries come back as None.
collection.query(
    query_texts=["gadget"],
    n_results=2,
    include=['metadatas', 'documents', 'distances']
)

{'ids': [['id3', 'id1']],
 'embeddings': None,
 'documents': [['Google is bringing Gemini to all older Pixel Buds',
   "Meta drops multimodal Llama 3.2 — here's why it's such a big deal"]],
 'uris': None,
 'data': None,
 'metadatas': [[None, None]],
 'distances': [[1.5251840353012085, 1.7548894882202148]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

**Choosing other models**

In [13]:
# Decide tokenizer parallelism up front so huggingface/tokenizers does not
# emit its "process just got forked" warning (the warning itself recommends
# setting this variable). setdefault keeps any value already exported.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Build an embedding function backed by a different Sentence Transformers
# model than Chroma's default.
embeddings_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-mpnet-base-v2"
)

  from .autonotebook import tqdm as notebook_tqdm
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [14]:
# Call the embedding function directly on the sentences; it yields one
# embedding per input sentence, so the length matches len(sentence_list).
embeddings = embeddings_fn(sentence_list)
len(embeddings)

5

In [15]:
# Dot product of the embeddings of sentences 1 and 4, used as a similarity
# score, followed by the two sentences being compared (one per line).
print(np.dot(embeddings[1], embeddings[4]))
print(sentence_list[1], sentence_list[4], sep="\n")

0.5583155
Chip giant Nvidia acquires OctoAI, a Seattle startup that helps companies run AI models
Dell partners with Nvidia to accelerate AI adoption in telecoms


In [16]:
# Load variables from a local .env file (if present) into the environment so
# OPENAI_API_KEY can be read below. `load_dotenv` is imported in the
# notebook's top import cell.
load_dotenv()

# Rebind embeddings_fn to an OpenAI-backed embedding function; the API key is
# taken from the environment rather than hard-coded.
embeddings_fn = embedding_functions.OpenAIEmbeddingFunction(
    api_key=os.getenv("OPENAI_API_KEY")
)

In [17]:
# Show which model the OpenAI embedding function uses when no model name is
# passed. NOTE: `_model_name` is an underscore-prefixed (private) attribute.
embeddings_fn._model_name

'text-embedding-ada-002'

In [18]:
# Remove the existing "udacity" collection, then recreate it under the same
# name, this time with the OpenAI embedding function instead of the default.
# NOTE(review): delete_collection presumably raises when the collection does
# not exist, so re-running this cell after a later delete would fail — confirm.
chroma_client.delete_collection(name="udacity")

collection = chroma_client.create_collection(
    name="udacity",
    embedding_function=embeddings_fn
)

In [19]:
# Re-add the same sentences to the recreated collection, which was given the
# OpenAI embedding function when it was created.
collection.add(ids=ids, documents=sentence_list)

In [20]:
# Confirm the recreated collection now uses the OpenAI embedding function.
# NOTE: `_embedding_function` is an underscore-prefixed (private) attribute.
collection._embedding_function

<chromadb.utils.embedding_functions.openai_embedding_function.OpenAIEmbeddingFunction at 0x30ac93590>

In [22]:
# Same "gadget" query as was run against the default-model collection, so the
# returned documents and distances can be compared between the two models.
collection.query(
    query_texts=["gadget"],
    n_results=2,
    include=['metadatas', 'documents', 'distances']
)

{'ids': [['id3', 'id4']],
 'embeddings': None,
 'documents': [['Google is bringing Gemini to all older Pixel Buds',
   'The first Intel Battlmage GPU benchmarks have leaked']],
 'uris': None,
 'data': None,
 'metadatas': [[None, None]],
 'distances': [[0.46601054072380066, 0.48678597807884216]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

**Using with LangChain**

In [23]:
# Drop the "udacity" collection before the LangChain section, which reuses the
# same collection name.
# NOTE(review): presumably done so the LangChain store starts from an empty
# collection — confirm.
chroma_client.delete_collection(name="udacity")

In [24]:
# Make sure the variables from .env are in the environment; calling this again
# is harmless if they were already loaded. `load_dotenv` is imported in the
# notebook's top import cell.
load_dotenv()

# LangChain vector store over a Chroma collection named "udacity", embedding
# with OpenAIEmbeddings.
# NOTE(review): OpenAIEmbeddings() is given no arguments — presumably it picks
# up OPENAI_API_KEY from the environment; confirm.
vector_store = Chroma(
    collection_name="udacity",
    embedding_function=OpenAIEmbeddings(),
)

In [25]:
# The five headlines wrapped as LangChain Documents, each carrying "company"
# and "topic" metadata. Every tuple is (headline, company, topic).
documents = [
    Document(
        page_content=headline,
        metadata={"company": company, "topic": topic},
    )
    for headline, company, topic in [
        (
            "Meta drops multimodal Llama 3.2 — here's why it's such a big deal",
            "Meta",
            "llama",
        ),
        (
            "Chip giant Nvidia acquires OctoAI, a Seattle startup that helps companies run AI models",
            "Nvidia",
            "acquisition",
        ),
        (
            "Google is bringing Gemini to all older Pixel Buds",
            "Google",
            "gemini",
        ),
        (
            "The first Intel Battlmage GPU benchmarks have leaked",
            "Intel",
            "gpu",
        ),
        (
            "Dell partners with Nvidia to accelerate AI adoption in telecoms",
            "Dell",
            "partnership",
        ),
    ]
]

In [26]:
# Store the documents under the same explicit IDs used earlier; the call
# returns the list of IDs that were added.
vector_store.add_documents(documents=documents, ids=ids)

['id1', 'id2', 'id3', 'id4', 'id5']

In [27]:
# Retrieve the 2 documents closest to the query "gpu", each paired with a score,
# and print the content, score (2 decimals) and metadata of every hit.
# NOTE(review): the best match is printed first and has the lower score, so
# the score looks like a distance (lower = closer) — confirm.
results = vector_store.similarity_search_with_score(query="gpu",k=2)
for doc, score in results:
    print(f"-> {doc.page_content}\n   [Score={score:.2f}]\n   [{doc.metadata}]\n\n")

-> The first Intel Battlmage GPU benchmarks have leaked
   [Score=0.35]
   [{'company': 'Intel', 'topic': 'gpu'}]


-> Chip giant Nvidia acquires OctoAI, a Seattle startup that helps companies run AI models
   [Score=0.41]
   [{'company': 'Nvidia', 'topic': 'acquisition'}]


