# Chroma DB

In [1]:
import chromadb
chroma_client = chromadb.Client()


In [2]:
collection = chroma_client.create_collection(name="new_collection")


# Generate a embedding

In [3]:
from dotenv import load_dotenv

load_dotenv()

True

In [4]:
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

In [5]:
emb1 = embeddings.embed_query("This is a document about pineapple")
emb2 = embeddings.embed_query("This is a document about oranges")

In [6]:
collection.add(
    embeddings=[emb1, emb2],
    documents=[
        "This is a document about pineapple",
        "This is a document about oranges"
    ],
    ids=["id1", "id2"]
)


In [7]:
emb_query = embeddings.embed_query("This is a query document about hawaii")

results = collection.query(
    query_embeddings=emb_query,
    query_texts=["This is a query document about hawaii"],
    n_results=2
)
print(results)


{'ids': [['id1', 'id2']], 'embeddings': None, 'documents': [['This is a document about pineapple', 'This is a document about oranges']], 'uris': None, 'data': None, 'metadatas': [[None, None]], 'distances': [[0.2704945206642151, 0.3485858142375946]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


# Let's persist the DB

In [8]:
import chromadb

client = chromadb.PersistentClient(path="chroma")


In [9]:
collection = chroma_client.get_or_create_collection(name="facts-2")


In [10]:
collection.count()

0

In [11]:
statements = [
    "The sun rises in the east.", "The sun appears from the eastern horizon in the morning.",
    "Water boils at 100°C at sea level.", "At sea level, water reaches its boiling point at 100 degrees Celsius.",
    "Dogs are loyal animals.", "Canines are known for their loyalty.",
    "Paris is the capital of France.", "The capital city of France is Paris.",
    "Exercising regularly improves health.", "Regular physical activity benefits overall well-being."
]

In [12]:
metadatas = [{"len": len(s)} for s in statements]

In [13]:
embds = embeddings.embed_documents(texts=statements)

In [14]:
import uuid

collection.upsert(
    embeddings=embds,
    documents=statements,
    ids=[str(uuid.uuid4()) for _ in range(len(statements))],
    metadatas=metadatas,
)

In [15]:
emb_query = embeddings.embed_query("countries")

results = collection.query(
    query_embeddings=emb_query,
    query_texts=["country"],
    n_results=2
)
print(results)

{'ids': [['2aff2f09-8ff3-454a-a23f-dc249a290443', 'a076120b-e9c6-4080-907a-31c6d1faec4f']], 'embeddings': None, 'documents': [['The capital city of France is Paris.', 'Paris is the capital of France.']], 'uris': None, 'data': None, 'metadatas': [[{'len': 36}, {'len': 31}]], 'distances': [[0.4284917712211609, 0.4303634762763977]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


In [16]:
filter = {"$and": [{"len": {"$gte": 32}}, {"len": {"$lte": 40}}]}

In [17]:
results = collection.query(
    query_embeddings=emb_query,
    query_texts=["country"],
    n_results=1,
    where=filter,
)
print(results)

{'ids': [['2aff2f09-8ff3-454a-a23f-dc249a290443']], 'embeddings': None, 'documents': [['The capital city of France is Paris.']], 'uris': None, 'data': None, 'metadatas': [[{'len': 36}]], 'distances': [[0.4284917712211609]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


# Let's use langchain

In [18]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [19]:
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",
)

In [20]:
from uuid import uuid4

from langchain_core.documents import Document

document_1 = Document(
    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},
    id=1,
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "news"},
    id=2,
)

document_3 = Document(
    page_content="Building an exciting new project with LangChain - come check it out!",
    metadata={"source": "tweet"},
    id=3,
)

document_4 = Document(
    page_content="Robbers broke into the city bank and stole $1 million in cash.",
    metadata={"source": "news"},
    id=4,
)

document_5 = Document(
    page_content="Wow! That was an amazing movie. I can't wait to see it again.",
    metadata={"source": "tweet"},
    id=5,
)

document_6 = Document(
    page_content="Is the new iPhone worth the price? Read this review to find out.",
    metadata={"source": "website"},
    id=6,
)

document_7 = Document(
    page_content="The top 10 soccer players in the world right now.",
    metadata={"source": "website"},
    id=7,
)

document_8 = Document(
    page_content="LangGraph is the best framework for building stateful, agentic applications!",
    metadata={"source": "tweet"},
    id=8,
)

document_9 = Document(
    page_content="The stock market is down 500 points today due to fears of a recession.",
    metadata={"source": "news"},
    id=9,
)

document_10 = Document(
    page_content="I have a bad feeling I am going to get deleted :(",
    metadata={"source": "tweet"},
    id=10,
)

documents = [
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
]
uuids = [str(uuid4()) for _ in range(len(documents))]

vector_store.add_documents(documents=documents, ids=uuids)

['45f6d652-cd5b-4502-aabb-047ae70e15d2',
 '2e7e0455-84c2-4bbf-8cc5-4869bb947d35',
 'f2ca43a7-b81d-4933-a51c-24d1a6e7b7d0',
 '1df3c5ae-8047-474e-ac89-5e8791f85dc6',
 '03b1c3ac-25f0-440e-ab2b-335c69ca6c3f',
 'bfff8f5b-b529-4026-b303-6327c5ca01ea',
 'c57a93da-024d-4c65-82ac-cc3ad7c2abb2',
 '2f4fc9bd-e131-4368-8117-97d1de0ad4e9',
 '7ebe2428-9aae-44f7-91ae-0e4defc5eac1',
 '87b17923-3e4a-4577-a1e4-0ae5e4a3f5b8']

In [21]:
results = vector_store.similarity_search(
    "LangChain provides abstractions to make working with LLMs easy",
    k=2,
    filter={"source": "tweet"},
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* Building an exciting new project with LangChain - come check it out! [{'source': 'tweet'}]
* LangGraph is the best framework for building stateful, agentic applications! [{'source': 'tweet'}]
