ChromaDB

In [1]:
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain.docstore.document import Document

In [2]:
# Read the speech transcript from disk; `doc` is whatever TextLoader.load() returns.
loader = TextLoader(file_path='speech.txt')
doc = loader.load()

In [3]:
# Break the loaded document into overlapping chunks (size 256, overlap 32)
# so each piece can be embedded and retrieved on its own.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=256,
    chunk_overlap=32,
)
chunked_doc = text_splitter.split_documents(doc)

In [4]:
# Embed every chunk with OllamaEmbeddings (constructed with its default settings)
# and index the result in a Chroma vector store. No persist_directory is passed here.
embedding = OllamaEmbeddings()
db = Chroma.from_documents(
    documents=chunked_doc,
    embedding=embedding,
)
db

<langchain_chroma.vectorstores.Chroma at 0x2b70c825dd0>

In [5]:
# Ask a question about the speech and fetch the chunks the store ranks closest to it.
query = "How does the speaker describe the desired outcome of the war?"
search_result = db.similarity_search(query=query)
search_result

[Document(metadata={'source': 'speech.txt'}, page_content='to prove that friendship in our daily attitude and actions toward the millions of men and women of German birth and native sympathy who live among us and share our life, and we shall be proud to prove it toward all who are in fact loyal to their neighbors'),
 Document(metadata={'source': 'speech.txt'}, page_content='no material compensation for the sacrifices we shall freely make. We are but one of the champions of the rights of mankind. We shall be satisfied when those rights have been made as secure as the faith and the freedom of nations can make them.'),
 Document(metadata={'source': 'speech.txt'}, page_content='for the principles that gave her birth and happiness and the peace which she has treasured. God helping her, she can do no other.'),
 Document(metadata={'source': 'speech.txt'}, page_content='fact loyal to their neighbors and to the government in the hour of test. They are, most of them, as true and loyal Americans 

In [6]:
# Show only the text of the first (top-ranked) hit, without its metadata.
top_hit = search_result[0]
top_hit.page_content

'to prove that friendship in our daily attitude and actions toward the millions of men and women of German birth and native sympathy who live among us and share our life, and we shall be proud to prove it toward all who are in fact loyal to their neighbors'

As a retriever

In [7]:
# Wrap the vector store in a retriever and run the same question through it;
# the recorded output matches the direct similarity search on `db`.
retriever = db.as_retriever()
retriever_result = retriever.invoke(input=query)
retriever_result

[Document(metadata={'source': 'speech.txt'}, page_content='to prove that friendship in our daily attitude and actions toward the millions of men and women of German birth and native sympathy who live among us and share our life, and we shall be proud to prove it toward all who are in fact loyal to their neighbors'),
 Document(metadata={'source': 'speech.txt'}, page_content='no material compensation for the sacrifices we shall freely make. We are but one of the champions of the rights of mankind. We shall be satisfied when those rights have been made as secure as the faith and the freedom of nations can make them.'),
 Document(metadata={'source': 'speech.txt'}, page_content='for the principles that gave her birth and happiness and the peace which she has treasured. God helping her, she can do no other.'),
 Document(metadata={'source': 'speech.txt'}, page_content='fact loyal to their neighbors and to the government in the hour of test. They are, most of them, as true and loyal Americans 

Add documents to ChromaDB

In [8]:
# Two extra facts, unrelated to the speech, to add to the existing store.
texts_to_add = [
    "A transformer is a deep learning architecture developed by researchers at Google and based on the multi-head attention mechanism, proposed in a 2017 paper [Attention Is All You Need].",
    "BERT, GPT are both based on transformer architecture",
]

# Wrap each string in a Document with empty metadata, as add_documents is given Documents.
doc_to_add = []
for text in texts_to_add:
    doc_to_add.append(Document(page_content=text, metadata={}))

# The call's return value (the ids shown in the cell output) is displayed.
db.add_documents(doc_to_add)

['92934650-10c5-4747-a287-82e5ab94f633',
 '6ca984fb-ff31-443e-9dec-3bc269a6e354']

In [9]:
# Query for the newly added facts; in the recorded output they rank ahead of the speech chunks.
query2 = "What technique is GPT based on?"
search_result2 = db.similarity_search(query=query2)
search_result2

[Document(metadata={}, page_content='A transformer is a deep learning architecture developed by researchers at Google and based on the multi-head attention mechanism, proposed in a 2017 paper [Attention Is All You Need].'),
 Document(metadata={}, page_content='BERT, GPT are both based on transformer architecture'),
 Document(metadata={'source': 'speech.txt'}, page_content='to prove that friendship in our daily attitude and actions toward the millions of men and women of German birth and native sympathy who live among us and share our life, and we shall be proud to prove it toward all who are in fact loyal to their neighbors'),
 Document(metadata={'source': 'speech.txt'}, page_content='no material compensation for the sacrifices we shall freely make. We are but one of the champions of the rights of mankind. We shall be satisfied when those rights have been made as secure as the faith and the freedom of nations can make them.')]

In [10]:
# Same query, but each hit is paired with its score.
# NOTE(review): in the recorded output the best match has the smallest score,
# so the score presumably is a distance (lower = closer) - confirm.
distance2 = db.similarity_search_with_score(query=query2)
distance2

[(Document(metadata={}, page_content='A transformer is a deep learning architecture developed by researchers at Google and based on the multi-head attention mechanism, proposed in a 2017 paper [Attention Is All You Need].'),
  15013.22265625),
 (Document(metadata={}, page_content='BERT, GPT are both based on transformer architecture'),
  16415.634765625),
 (Document(metadata={'source': 'speech.txt'}, page_content='to prove that friendship in our daily attitude and actions toward the millions of men and women of German birth and native sympathy who live among us and share our life, and we shall be proud to prove it toward all who are in fact loyal to their neighbors'),
  18973.54296875),
 (Document(metadata={'source': 'speech.txt'}, page_content='no material compensation for the sacrifices we shall freely make. We are but one of the champions of the rights of mankind. We shall be satisfied when those rights have been made as secure as the faith and the freedom of nations can make them.'

Save and Load ChromaDB

In [11]:
# Build a store from the same chunks, this time written to disk.
# The persistence directory is supplied when the Chroma object is created.
#
# Each chunk gets a stable, position-based id. Without explicit ids every run
# of this cell stores the chunks again under fresh ids, so the on-disk
# collection grows by one full copy per run and searches against the reloaded
# store return the same chunk more than once. With fixed ids a re-run
# overwrites the existing entries instead of appending to them.
# NOTE(review): entries written by earlier runs (under other ids) stay in the
# directory; delete ./speech_chromaDB once to start from a clean store.
chunk_ids = [f"speech-chunk-{position}" for position in range(len(chunked_doc))]
db = Chroma.from_documents(
    chunked_doc,
    embedding,
    ids=chunk_ids,
    persist_directory="./speech_chromaDB",
)

In [12]:
# Open the store saved under ./speech_chromaDB, using the same embedding
# object for queries that was used when the chunks were indexed.
db2 = Chroma(
    embedding_function=embedding,
    persist_directory="./speech_chromaDB",
)

In [13]:
# Repeat the first question against the reloaded store.
search_result1 = db2.similarity_search(query=query)
search_result1

[Document(metadata={}, page_content='A transformer is a deep learning architecture developed by researchers at Google and based on the multi-head attention mechanism, proposed in a 2017 paper [Attention Is All You Need].'),
 Document(metadata={}, page_content='BERT, GPT are both based on transformer architecture'),
 Document(metadata={'source': 'speech.txt'}, page_content='to prove that friendship in our daily attitude and actions toward the millions of men and women of German birth and native sympathy who live among us and share our life, and we shall be proud to prove it toward all who are in fact loyal to their neighbors'),
 Document(metadata={'source': 'speech.txt'}, page_content='to prove that friendship in our daily attitude and actions toward the millions of men and women of German birth and native sympathy who live among us and share our life, and we shall be proud to prove it toward all who are in fact loyal to their neighbors')]

In [14]:
# Repeat the GPT question against the reloaded store.
# query2 is assigned again here, so this cell does not depend on the earlier query2 cell.
query2 = "What technique is GPT based on?"
search_result2 = db2.similarity_search(query=query2)
search_result2

[Document(metadata={}, page_content='A transformer is a deep learning architecture developed by researchers at Google and based on the multi-head attention mechanism, proposed in a 2017 paper [Attention Is All You Need].'),
 Document(metadata={}, page_content='BERT, GPT are both based on transformer architecture'),
 Document(metadata={'source': 'speech.txt'}, page_content='to prove that friendship in our daily attitude and actions toward the millions of men and women of German birth and native sympathy who live among us and share our life, and we shall be proud to prove it toward all who are in fact loyal to their neighbors'),
 Document(metadata={'source': 'speech.txt'}, page_content='to prove that friendship in our daily attitude and actions toward the millions of men and women of German birth and native sympathy who live among us and share our life, and we shall be proud to prove it toward all who are in fact loyal to their neighbors')]