# Document Question Answering with local persistence

An example of using Chroma DB and LangChain to do question answering over documents, with a locally persisted database. 
You can store embeddings and documents, then use them again later.

In [None]:
from dotenv import load_dotenv

# Load the environment variables from .env
# NOTE(review): presumably this provides the Hugging Face API token needed by
# the hosted embeddings endpoint used further down — confirm against the .env file.
load_dotenv()

: 

In [2]:
#Text Loader
# NOTE(review): newer LangChain releases expose TextLoader from
# langchain_community.document_loaders — confirm against the installed version.
from langchain.document_loaders import TextLoader

#Text Splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter


#Embeddings model Hugging Face Transformers
from langchain_huggingface.embeddings.huggingface import HuggingFaceEmbeddings #Local execution
#from langchain_community.embeddings import HuggingFaceHubEmbeddings #Legacy
from langchain_huggingface.embeddings import HuggingFaceEndpointEmbeddings #Execution on Hugging Face servers

#vector store
from langchain_chroma import Chroma



## Load and process documents

Load the documents to run question answering over. If you want to do this over your own documents, this is the section you should replace.

Next we split documents into small chunks. This is so we can find the most relevant chunks for a query and pass only those into the LLM.

In [None]:
# Read the source document from disk, decoding it as UTF-8.
loader = TextLoader(
    'state_of_the_union.txt',
    encoding='UTF-8',
)
documents = loader.load()

# First pass: split the loaded documents with CharacterTextSplitter,
# configured with chunk_size=500 and chunk_overlap=100.
text_splitter = CharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=500,
)
texts = text_splitter.split_documents(documents)

# Last expression of the cell: the notebook shows how many chunks were produced.
len(texts)

In [None]:
# Print the first two chunks, each tagged with its index and followed by the
# character length of its text, to inspect what the splitter produced.
for label in ("0", "1"):
    chunk_text = texts[int(label)].page_content
    print(label, chunk_text, len(chunk_text))

# Print the third element itself (the whole object, not just its page_content).
print(texts[2])

In [None]:

# Second pass: re-split the same documents with RecursiveCharacterTextSplitter
# (chunk_size=500, chunk_overlap=100). This rebinds `text_splitter` and `texts`,
# so later cells work on the recursive splitter's chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=500,
)
texts = text_splitter.split_documents(documents)

# Last expression of the cell: the notebook shows the new chunk count.
len(texts)

In [None]:
# Preview up to the first five chunks: index, text, and character length.
# Slicing (rather than indexing with range(5)) keeps the cell from raising
# IndexError when the document yields fewer than five chunks.
for i, chunk in enumerate(texts[:5]):
    print(i, chunk.page_content, len(chunk.page_content))


## Initialize Persisted ChromaDB

Create embeddings for each chunk and insert into the Chroma vector database. The `persist_directory` argument tells ChromaDB where to store the database when it's persisted. 

In [9]:
# Directory handed to Chroma as `persist_directory`, so the embeddings are
# stored on disk.
persist_directory = 'ChromaDB'

# Model identifier passed to the Hugging Face endpoint embeddings client.
model_name = "sentence-transformers/all-mpnet-base-v2"
embedding = HuggingFaceEndpointEmbeddings(model=model_name)

# Embed the first 30 chunks and build the vector store from them.
vectordb = Chroma.from_documents(
    documents=texts[:30],
    embedding=embedding,
    persist_directory=persist_directory,
)

In [None]:
# Query the vector store for the stored chunks most similar to the question.
# As the cell's last expression, the result is displayed by the notebook.
vectordb.similarity_search("How can we react?")


In [None]:
# Run the same question through max-marginal-relevance search.
# NOTE(review): presumably this balances similarity to the query against
# diversity among the returned chunks — confirm in the Chroma/LangChain docs.
vectordb.max_marginal_relevance_search("How can we react?")

In [None]:
# Add the chunks that were not part of the initial store, which was built from
# the first 30 chunks. The slice starts at 30 and is open-ended so every
# remaining chunk is indexed; a slice whose start exceeds its stop (such as
# 31:2) is always empty and would add nothing.
vectordb.add_documents(texts[30:])

### Max tokens per vector

In [None]:
# Load the model named by `model_name` locally through sentence-transformers
# so its properties can be inspected in the following cells.
from sentence_transformers import SentenceTransformer
embeding_model : SentenceTransformer = SentenceTransformer(model_name)


In [None]:
# Display the maximum sequence length reported by the loaded model.
embeding_model.get_max_seq_length()

In [None]:
# Display the tokenizer object attached to the loaded model.
embeding_model.tokenizer

In [16]:
# Load an MPNet tokenizer for the checkpoint named by `model_name`, so chunks
# can be tokenized directly.
from transformers import MPNetTokenizer

tokenizer = MPNetTokenizer.from_pretrained(model_name)

In [None]:
# Tokenize the text of the first chunk to see how many tokens it occupies.
first_chunk_text = texts[0].page_content
tokens = tokenizer.tokenize(first_chunk_text)

# Show the total token count followed by a preview of the first 30 tokens.
print(len(tokens), tokens[:30])