## Chroma DB

- Chroma is an AI-native open source vector database focused on developer productivity and happiness. Chroma is licensed under apache 2.0
- Need to install "chroma DB"

In [18]:
## Building a sample vector DB
## With langchain_chroma installation , no need to install chromaDB now separately

from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [19]:
loader = TextLoader("speech.txt")
data = loader.load()
data

[Document(metadata={'source': 'speech.txt'}, page_content='"The moonlight danced across the rippling water, casting silver shadows."\n"A forgotten book lay dust-covered on the attic shelf."\n"In the quiet of the forest, every sound seemed magnified."\n"She wore a smile like a secret, hinting at mysteries untold."\n"The scent of fresh bread filled the air as the bakery opened."\n"He found solace in the rhythmic ticking of the old grandfather clock."\n"The city streets buzzed with the hum of a thousand stories waiting to be told."\n"Among the stars, dreams seemed to shimmer just a little brighter."\n"An old photograph slipped from the pages of a long-abandoned journal."\n"The train whistle echoed through the valley, a distant call to adventure."')]

In [20]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100 , chunk_overlap=0)
splits = text_splitter.split_documents(data)
splits


[Document(metadata={'source': 'speech.txt'}, page_content='"The moonlight danced across the rippling water, casting silver shadows."'),
 Document(metadata={'source': 'speech.txt'}, page_content='"A forgotten book lay dust-covered on the attic shelf."'),
 Document(metadata={'source': 'speech.txt'}, page_content='"In the quiet of the forest, every sound seemed magnified."'),
 Document(metadata={'source': 'speech.txt'}, page_content='"She wore a smile like a secret, hinting at mysteries untold."'),
 Document(metadata={'source': 'speech.txt'}, page_content='"The scent of fresh bread filled the air as the bakery opened."'),
 Document(metadata={'source': 'speech.txt'}, page_content='"He found solace in the rhythmic ticking of the old grandfather clock."'),
 Document(metadata={'source': 'speech.txt'}, page_content='"The city streets buzzed with the hum of a thousand stories waiting to be told."'),
 Document(metadata={'source': 'speech.txt'}, page_content='"Among the stars, dreams seemed to sh

In [21]:
embeddings = OllamaEmbeddings()
vectordb = Chroma.from_documents(documents=splits,embedding=embeddings)
vectordb

<langchain_chroma.vectorstores.Chroma at 0x1d3adce90a0>

In [22]:
## Query from Chroma vectorstore DB 

query = "what happened with photograph?"
docs = vectordb.similarity_search(query=query)
docs[0].page_content

'"Among the stars, dreams seemed to shimmer just a little brighter."'

In [23]:
## Saving DB to disk
## Provide persist_directory parameter with location to store DB 

vectordb = Chroma.from_documents(documents=splits,embedding=embeddings,persist_directory="./chroma_db")


In [24]:
## Load from DIsk and query
db2 = Chroma(persist_directory="./chroma_db",embedding_function=embeddings)
docs = db2.similarity_search(query=query)
docs[0].page_content

'"Among the stars, dreams seemed to shimmer just a little brighter."'

In [25]:
## To Get similarity scrore by manhatten distance 

docs = db2.similarity_search_with_score(query)
docs

[(Document(metadata={'source': 'speech.txt'}, page_content='"Among the stars, dreams seemed to shimmer just a little brighter."'),
  9332.266779786962),
 (Document(metadata={'source': 'speech.txt'}, page_content='"Among the stars, dreams seemed to shimmer just a little brighter."'),
  9332.266779786962),
 (Document(metadata={'source': 'speech.txt'}, page_content='"Among the stars, dreams seemed to shimmer just a little brighter."'),
  9332.266779786962),
 (Document(metadata={'source': 'speech.txt'}, page_content='"A forgotten book lay dust-covered on the attic shelf."'),
  10191.86555179553)]

In [26]:
### Retriever Options 

retriever = vectordb.as_retriever()
retriever.invoke(query)[0]


Document(metadata={'source': 'speech.txt'}, page_content='"Among the stars, dreams seemed to shimmer just a little brighter."')