In [3]:
pip install chromadb langchain-chroma langchain-openai python-dotenv --quiet

In [4]:
import os
import chromadb
from chromadb.utils import embedding_functions
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document
import numpy as np

In [5]:
# Sample tech-news headlines used as the documents of every collection below.
sentence_list = [
    "Meta drops multimodal Llama 3.2 — here's why it's such a big deal",
    "Chip giant Nvidia acquires OctoAI, a Seattle startup that helps companies run AI models",
    "Google is bringing Gemini to all older Pixel Buds",
    "The first Intel Battlmage GPU benchmarks have leaked",
    "Dell partners with Nvidia to accelerate AI adoption in telecoms",
]

# One identifier per headline, numbered from 1 ("id1" ... "id5").
ids = [f"id{position}" for position in range(1, len(sentence_list) + 1)]

**Creating a collection**

In [6]:
# Create the Chroma client that owns every collection used in this notebook.
# NOTE(review): presumably an in-memory client (nothing written to disk) — confirm against the Chroma docs.
chroma_client = chromadb.Client()

In [7]:
# To persist the data on disk, use a persistent client instead:
# chroma_client = chromadb.PersistentClient(path="chromadb/")

In [8]:
# get_or_create_collection keeps this cell re-runnable: create_collection raises
# when a collection named "udacity" already exists on the client (for example
# when the cell is executed a second time on the same kernel).
collection = chroma_client.get_or_create_collection(name="udacity")

In [9]:
# Insert the headlines under their ids. With no embedding function configured,
# Chroma embeds the documents with its default model
# (Sentence Transformers all-MiniLM-L6-v2).
collection.add(ids=ids, documents=sentence_list)

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:01<00:00, 48.2MiB/s]


In [10]:
# Display the embedding function object attached to the collection.
# NOTE(review): `_embedding_function` is a private attribute (leading underscore),
# so it may change between chromadb versions.
collection._embedding_function

<chromadb.utils.embedding_functions.DefaultEmbeddingFunction at 0x7943905ed650>

In [11]:
# Number of records currently stored in the collection.
collection.count()

5

In [12]:
# Preview 2 stored records. The result contains the ids together with the
# full embedding vectors, so the displayed output is very long.
collection.peek(2)

{'ids': ['id1', 'id2'],
 'embeddings': array([[ 6.06655143e-02, -3.51322778e-02,  6.06436618e-02,
         -5.11926189e-02,  1.13580175e-01, -1.88892670e-02,
         -2.68528406e-02,  5.48633598e-02,  3.23644355e-02,
          5.42442687e-02, -4.04198617e-02, -1.90558787e-02,
         -5.97919673e-02,  2.56031975e-02,  8.48459899e-02,
          4.12196591e-02,  3.95206511e-02, -4.00091261e-02,
         -7.66606331e-02,  2.78291814e-02,  5.38355038e-02,
         -1.35247614e-02,  9.65649858e-02, -3.04361209e-02,
          6.61457935e-03,  7.21731111e-02, -9.53866243e-02,
         -2.75959149e-02,  7.86794722e-03, -6.68520033e-02,
         -1.27341738e-02,  1.21337980e-01, -6.66138455e-02,
         -3.28670703e-02, -6.49284497e-02, -1.61902495e-02,
         -3.32964119e-03,  8.04081038e-02, -3.84503826e-02,
          1.37262192e-04,  3.72601603e-03,  4.83831093e-02,
         -3.68634346e-06, -4.51370478e-02, -1.37449540e-02,
         -7.15254843e-02,  1.01805590e-02, -4.23029736e-02,
  

In [13]:
# Semantic search: ask for the 2 stored headlines closest to the text "gadget"
# and include metadata, document text and distance for every hit.
collection.query(
    n_results=2,
    query_texts=["gadget"],
    include=["metadatas", "documents", "distances"],
)

{'ids': [['id3', 'id1']],
 'embeddings': None,
 'documents': [['Google is bringing Gemini to all older Pixel Buds',
   "Meta drops multimodal Llama 3.2 — here's why it's such a big deal"]],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[None, None]],
 'distances': [[1.5251753330230713, 1.7548508644104004]]}

**Choosing other models**

In [14]:
# Build an embedding function backed by the Sentence Transformers model
# "all-mpnet-base-v2" so it can be called directly on a list of texts.
embeddings_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-mpnet-base-v2"
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [15]:
# Embed all headlines in one call; the result holds one embedding per headline.
embeddings = embeddings_fn(sentence_list)
# Display how many embeddings were produced.
len(embeddings)

5

In [16]:
# Dot product between the embeddings of headlines 1 and 4 (both mention Nvidia),
# followed by the two headlines themselves, one per line.
print(np.dot(embeddings[1], embeddings[4]))
print(sentence_list[1], sentence_list[4], sep="\n")

0.5583154
Chip giant Nvidia acquires OctoAI, a Seattle startup that helps companies run AI models
Dell partners with Nvidia to accelerate AI adoption in telecoms


In [17]:
# Credentials: use Colab's secret store when the notebook runs on Colab,
# otherwise fall back to environment variables so the cell does not fail
# with ImportError outside Colab.
# Outside Colab a variable that is not set yields None.
try:
    from google.colab import userdata
    _get_secret = userdata.get
except ImportError:  # google.colab could not be imported: not running on Colab
    _get_secret = os.getenv

HF_TOKEN = _get_secret('HF_TOKEN')
# Base URL passed to the OpenAI embedding clients created later in the notebook.
OPENAI_API_BASE = "https://openai.vocareum.com/v1"
OPENAI_API_KEY = _get_secret('OPENAI_API_KEY')
TAVILY_API_KEY = _get_secret('TAVILY_API_KEY')

In [19]:
from dotenv import load_dotenv
# NOTE(review): load_dotenv() presumably loads a local .env file into the
# process environment; the endpoint and key used below are taken from the
# OPENAI_API_BASE / OPENAI_API_KEY variables, not read from the environment here.
load_dotenv()

# Rebind `embeddings_fn` to Chroma's OpenAI embedding function, pointed at the
# configured API base URL and authenticated with the configured key.
embeddings_fn = embedding_functions.OpenAIEmbeddingFunction(
    api_base=OPENAI_API_BASE,
    api_key=OPENAI_API_KEY
)

In [21]:
# Display the model name the embedding function is configured with.
embeddings_fn.model_name

'text-embedding-ada-002'

In [22]:
# Drop the existing "udacity" collection and recreate it under the same name,
# this time with `embeddings_fn` as its embedding function.
chroma_client.delete_collection(name="udacity")
collection = chroma_client.create_collection(embedding_function=embeddings_fn, name="udacity")

In [23]:
# Insert the same headlines and ids into the current collection.
collection.add(ids=ids, documents=sentence_list)

In [24]:
# Display the embedding function object attached to the current collection.
# NOTE(review): `_embedding_function` is a private attribute (leading underscore),
# so it may change between chromadb versions.
collection._embedding_function

<chromadb.utils.embedding_functions.openai_embedding_function.OpenAIEmbeddingFunction at 0x7942213d07d0>

In [25]:
# Run the "gadget" search against the current collection: top 2 hits with
# metadata, document text and distance for each.
collection.query(
    n_results=2,
    query_texts=["gadget"],
    include=["metadatas", "documents", "distances"],
)

{'ids': [['id3', 'id4']],
 'embeddings': None,
 'documents': [['Google is bringing Gemini to all older Pixel Buds',
   'The first Intel Battlmage GPU benchmarks have leaked']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[None, None]],
 'distances': [[0.46601054072380066, 0.48678600788116455]]}

**Using with LangChain**

In [27]:
# Remove the "udacity" collection from the Chroma client.
# NOTE(review): presumably done so the LangChain vector store created next
# starts from an empty collection of the same name — confirm.
chroma_client.delete_collection(name="udacity")

In [28]:
from dotenv import load_dotenv

load_dotenv()

# LangChain vector store over a Chroma collection named "udacity"; texts are
# embedded with OpenAIEmbeddings using the configured key and base URL.
vector_store = Chroma(
    embedding_function=OpenAIEmbeddings(
        base_url=OPENAI_API_BASE,
        api_key=OPENAI_API_KEY,
    ),
    collection_name="udacity",
)

In [29]:
# LangChain documents: each headline becomes the page content, and its
# metadata records the company and the topic the headline is about.
documents = [
    Document(page_content=headline, metadata={"company": company, "topic": topic})
    for headline, company, topic in (
        (
            "Meta drops multimodal Llama 3.2 — here's why it's such a big deal",
            "Meta",
            "llama",
        ),
        (
            "Chip giant Nvidia acquires OctoAI, a Seattle startup that helps companies run AI models",
            "Nvidia",
            "acquisition",
        ),
        (
            "Google is bringing Gemini to all older Pixel Buds",
            "Google",
            "gemini",
        ),
        (
            "The first Intel Battlmage GPU benchmarks have leaked",
            "Intel",
            "gpu",
        ),
        (
            "Dell partners with Nvidia to accelerate AI adoption in telecoms",
            "Dell",
            "partnership",
        ),
    )
]

In [30]:
# Store the documents in the vector store under the explicit ids;
# the call returns the list of ids that were used.
vector_store.add_documents(documents=documents, ids=ids)

['id1', 'id2', 'id3', 'id4', 'id5']

In [31]:
# Retrieve the 2 documents most similar to the query "gpu" together with a score.
# NOTE(review): the hit printed first has the lower score, so the score appears
# to be a distance (lower = closer) — confirm against the LangChain Chroma docs.
results = vector_store.similarity_search_with_score(query="gpu",k=2)
# Print each hit: its text, the score rounded to 2 decimals, and its metadata.
for doc, score in results:
    print(f"-> {doc.page_content}\n   [Score={score:.2f}]\n   [{doc.metadata}]\n\n")

-> The first Intel Battlmage GPU benchmarks have leaked
   [Score=0.35]
   [{'company': 'Intel', 'topic': 'gpu'}]


-> Chip giant Nvidia acquires OctoAI, a Seattle startup that helps companies run AI models
   [Score=0.41]
   [{'topic': 'acquisition', 'company': 'Nvidia'}]


