# Document Question Answering with local persistence

An example of using Chroma DB and LangChain to do question answering over documents, with a locally persisted database. 
You can store embeddings and documents, then use them again later.

In [1]:
from dotenv import load_dotenv 

# Load the environment variables defined in the local .env file.
# The call is the cell's last expression, so its return value is shown as the
# cell output (True in the saved run).
load_dotenv()

True

In [2]:
#Text Loader
from langchain.document_loaders import TextLoader

#Text Splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter


#Embeddings model Hugging Face Transformers
from langchain_huggingface.embeddings.huggingface import HuggingFaceEmbeddings #Ejecución local
#from langchain_community.embeddings import HuggingFaceHubEmbeddings #Legacy
from langchain_huggingface.embeddings import HuggingFaceEndpointEmbeddings #Ejecución en servidores HuggingFace

#vector store
from langchain_chroma import Chroma



## Load and process documents

Load documents to do question answering over. If you want to do this over your own documents, this is the section you should replace.

Next we split documents into small chunks. This is so we can find the most relevant chunks for a query and pass only those into the LLM.

In [3]:
# Read the source text file from disk as LangChain documents.
loader = TextLoader(
    'data/state_of_the_union.txt',
    encoding='UTF-8',
)
documents = loader.load()

# Split the documents into chunks: target size 500, with an overlap of 100
# shared between neighbouring chunks.
text_splitter = CharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
)
texts = text_splitter.split_documents(documents)

# Number of chunks produced (displayed as the cell output).
len(texts)

95

In [4]:
# Show the first two chunks, each prefixed by its index and followed by its
# length in characters.
for idx in range(2):
    chunk_text = texts[idx].page_content
    print(idx, chunk_text, len(chunk_text))

# Print the third entry as a whole Document object rather than just its text.
print(texts[2])

0 Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  

Last year COVID-19 kept us apart. This year we are finally together again. 

Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. 

With a duty to one another to the American people to the Constitution. 

And with an unwavering resolve that freedom will always triumph over tyranny. 490
1 And with an unwavering resolve that freedom will always triumph over tyranny. 

Six days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. 

He thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. 

He met the Ukrainian people. 403
page_content='He met the Ukrainian people. 

From President Zelenskyy to every Ukrainian, the

In [5]:

# Re-split the same documents with the recursive splitter, keeping the same
# size and overlap settings. Note that this rebinds `text_splitter` and `texts`:
# from here on `texts` holds the chunks produced by this splitter.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
)
texts = text_splitter.split_documents(documents)

# Number of chunks produced by this splitter (displayed as the cell output).
len(texts)

96

In [6]:
# Print the first five chunks: index, text, and length in characters.
for i in range(5):
    chunk_text = texts[i].page_content
    print(i, chunk_text, len(chunk_text))


0 Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  

Last year COVID-19 kept us apart. This year we are finally together again. 

Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. 

With a duty to one another to the American people to the Constitution. 

And with an unwavering resolve that freedom will always triumph over tyranny. 490
1 And with an unwavering resolve that freedom will always triumph over tyranny. 

Six days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. 

He thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. 

He met the Ukrainian people. 403
2 He met the Ukrainian people. 

From President Zelenskyy to every Ukrainian, their fearlessn

## Initialize persisted ChromaDB

Create embeddings for each chunk and insert them into the Chroma vector database. The `persist_directory` argument tells ChromaDB where to store the database when it's persisted.

In [9]:
# Embed the first 30 chunks and store them in a Chroma collection.
# Supplying a persist_directory will store the embeddings on disk.
persist_directory = 'ChromaDB'
model_name = "sentence-transformers/all-mpnet-base-v2"

# Endpoint embeddings are computed on Hugging Face servers rather than locally
# (presumably authenticated through a token loaded from .env — verify).
embedding = HuggingFaceEndpointEmbeddings(model=model_name)

initial_chunks = texts[0:30]

# Deterministic ids keep this cell idempotent. Because the collection is
# persisted on disk, re-running the cell without ids would append the same
# chunks again under fresh random ids; with stable ids the existing entries
# are updated instead of duplicated.
initial_ids = [f"chunk-{position}" for position in range(len(initial_chunks))]

vectordb = Chroma.from_documents(
    documents=initial_chunks,
    embedding=embedding,
    ids=initial_ids,
    persist_directory=persist_directory,
)

In [None]:
# Query the vector store with a plain similarity search; the result is the
# cell's last expression, so it is displayed as the cell output.
search_query = "How can we react?"
vectordb.similarity_search(search_query)


In [None]:
# Run the query through max-marginal-relevance search instead of plain
# similarity search; the result is displayed as the cell output.
search_query = "How can we react?"
vectordb.max_marginal_relevance_search(search_query)

In [None]:
# Add the next batch of chunks to the existing store. The initial load used
# texts[0:30] (indices 0-29), so this batch must start at index 30; starting
# at 31 would silently leave chunk 30 out of the database.
vectordb.add_documents(texts[30:60])

### Maximum tokens per embedded text

In [10]:
from sentence_transformers import SentenceTransformer
# Load the checkpoint named by model_name as a SentenceTransformer so that its
# sequence limit and tokenizer can be inspected in the following cells.
# The variable keeps its original spelling because later cells reference it.
embeding_model : SentenceTransformer = SentenceTransformer(model_name)




In [11]:
# Maximum sequence length the model accepts (384 in the saved output).
# NOTE(review): longer inputs are presumably truncated by the model — confirm.
embeding_model.get_max_seq_length()

384

In [12]:
# Inspect the tokenizer bundled with the model; in the saved output its repr
# shows an MPNetTokenizerFast with model_max_length=384.
embeding_model.tokenizer

MPNetTokenizerFast(name_or_path='sentence-transformers/all-mpnet-base-v2', vocab_size=30527, model_max_length=384, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '[UNK]', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	104: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	30526: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, norm

In [16]:
from transformers import MPNetTokenizer

# Load the MPNet tokenizer for the same checkpoint (model_name) so chunk text
# can be tokenized directly.
# NOTE(review): the model itself reports an MPNetTokenizerFast, while this loads
# the MPNetTokenizer class — presumably the tokenization is equivalent; confirm.
tokenizer = MPNetTokenizer.from_pretrained(model_name)



In [20]:
# Tokenize the text of the first chunk to see how many tokens it produces.
first_chunk_text = texts[0].page_content
tokens = tokenizer.tokenize(first_chunk_text)

# Print the token count followed by the first 30 tokens.
print(len(tokens), tokens[:30])

96 ['madam', 'speaker', ',', 'madam', 'vice', 'president', ',', 'our', 'first', 'lady', 'and', 'second', 'gentleman', '.', 'members', 'of', 'congress', 'and', 'the', 'cabinet', '.', 'justices', 'of', 'the', 'supreme', 'court', '.', 'my', 'fellow', 'americans']
