# https://python.langchain.com/docs/use_cases/question_answering/how_to/vector_db_qa

In [11]:
from langchain.chains import RetrievalQA
# from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
# from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.chat_models import AzureChatOpenAI
#
from get_env_config import *
# SECURITY: an API key used to be hard-coded on this line. Secrets must never
# be stored in a notebook; treat the old key as leaked and rotate it.
# Prefer a key that is already present in the environment; otherwise prompt
# for it without echoing, so the value is never saved in the notebook source
# or its outputs.
import os
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")

In [4]:
# --- Load the source PDF and cut it into overlapping text chunks ---
loader = PyPDFLoader("Data/ThermoFisher_1.pdf")
documents = loader.load()

text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
docs = text_splitter.split_documents(documents)
print(len(docs))  # how many chunks the splitter produced

# --- Build the FAISS vector store that the retrievers below query ---
# NOTE(review): chunk_size=1 presumably restricts each embedding request to a
# single text (a common Azure OpenAI constraint) -- confirm before changing.
embeddings = OpenAIEmbeddings(chunk_size=1)
docsearch = FAISS.from_documents(docs, embeddings)

96


In [10]:
def load_new_llm(temperature=0):
    """Create a fresh Azure chat-model client.

    The deployment and model names are read from the ``deployment`` and
    ``model`` environment variables each time the function is called.

    Args:
        temperature: Value forwarded as the model's ``temperature``
            (defaults to 0).

    Returns:
        A new ``AzureChatOpenAI`` instance constructed with ``verbose=True``.
    """
    return AzureChatOpenAI(  # type: ignore
        temperature=temperature,
        deployment_name=os.getenv('deployment'),
        model_name=os.getenv('model'),
        verbose=True,
    )

### chain_type = "stuff"

Stuffing is the simplest method. You simply stuff all data into the prompt as context to pass to the language model.

Pros: It makes a single call to the LLM. The LLM has access to all data.
Cons: LLMs have a limited context length, so for large documents or many documents this will not work, because the resulting prompt is larger than the context length.


In [12]:
# "stuff" chain: the documents returned by the retriever are placed into a
# single prompt for the LLM.
qa = RetrievalQA.from_chain_type(
    llm=load_new_llm(),
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
)
print(qa)  # show the assembled chain configuration

# Ask the question; the bare last expression displays the answer as cell output.
query = "What did the president say about ensuring safe drinking water?"
qa.run(query)

memory=None callbacks=None callback_manager=None verbose=False tags=None metadata=None combine_documents_chain=StuffDocumentsChain(memory=None, callbacks=None, callback_manager=None, verbose=False, tags=None, metadata=None, input_key='input_documents', output_key='output_text', llm_chain=LLMChain(memory=None, callbacks=None, callback_manager=None, verbose=False, tags=None, metadata=None, prompt=ChatPromptTemplate(input_variables=['context', 'question'], output_parser=None, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], output_parser=None, partial_variables={}, template="Use the following pieces of context to answer the users question. \nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n----------------\n{context}", template_format='f-string', validate_template=True), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], output_parser

"The president mentioned that the conflict in Ukraine has devastated a significant part of the country's infrastructure and left many families struggling to access necessities like clean and safe water. Thermo Fisher is helping to address this challenge by providing the Ukraine Ministry of Health with ion chromatography instruments, autosamplers, and water purification systems. These will be used in remote monitoring stations across the country to ensure the safety of drinking water. The president also highlighted that Thermo Fisher is helping to ensure the quality of water resources worldwide by providing government and industrial laboratories with solutions for environmental water testing, as water is vital to life."

### chain_type = "map_reduce"

This method passes each chunk, together with the question, to a language model, collects the individual responses, and then uses another language model call to summarize all of those responses into a final answer. This is powerful because it can operate over any number of documents, and because the individual per-chunk calls can run in parallel. However, it takes many more LLM calls, and it treats all the documents as independent of one another, which may not always be desirable.

In [15]:
# "map_reduce" chain, with the retriever switched to search_type="mmr".
qa = RetrievalQA.from_chain_type(
    llm=load_new_llm(),
    chain_type="map_reduce",
    retriever=docsearch.as_retriever(search_type="mmr"),
)
print(qa)  # show the assembled chain configuration

# Same question as before; the bare last expression displays the answer.
query = "What did the president say about ensuring safe drinking water?"
qa.run(query)

memory=None callbacks=None callback_manager=None verbose=False tags=None metadata=None combine_documents_chain=MapReduceDocumentsChain(memory=None, callbacks=None, callback_manager=None, verbose=False, tags=None, metadata=None, input_key='input_documents', output_key='output_text', llm_chain=LLMChain(memory=None, callbacks=None, callback_manager=None, verbose=False, tags=None, metadata=None, prompt=ChatPromptTemplate(input_variables=['context', 'question'], output_parser=None, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], output_parser=None, partial_variables={}, template='Use the following portion of a long document to see if any of the text is relevant to answer the question. \nReturn any relevant text verbatim.\n______________________\n{context}', template_format='f-string', validate_template=True), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], output_parser=None,

'The president said that Thermo Fisher is helping to address the challenge of safe drinking water in Ukraine by providing the Ukraine Ministry of Health with ion chromatography instruments, autosamplers, and water purification systems, which will be used in remote monitoring stations across the country to ensure the safety of drinking water.'

### load_qa_chain

In [14]:
from langchain.chains.question_answering import load_qa_chain
# Build the combine-documents chain explicitly, then pass it to RetrievalQA
# directly instead of going through RetrievalQA.from_chain_type.
qa_chain = load_qa_chain(
    llm=load_new_llm(),
    chain_type="map_reduce",
)
qa = RetrievalQA(
    combine_documents_chain=qa_chain,
    retriever=docsearch.as_retriever(),
)
print(qa)  # show the assembled chain configuration

# Same question as before; the bare last expression displays the answer.
query = "What did the president say about ensuring safe drinking water?"
qa.run(query)

memory=None callbacks=None callback_manager=None verbose=False tags=None metadata=None combine_documents_chain=MapReduceDocumentsChain(memory=None, callbacks=None, callback_manager=None, verbose=False, tags=None, metadata=None, input_key='input_documents', output_key='output_text', llm_chain=LLMChain(memory=None, callbacks=None, callback_manager=None, verbose=False, tags=None, metadata=None, prompt=ChatPromptTemplate(input_variables=['context', 'question'], output_parser=None, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], output_parser=None, partial_variables={}, template='Use the following portion of a long document to see if any of the text is relevant to answer the question. \nReturn any relevant text verbatim.\n______________________\n{context}', template_format='f-string', validate_template=True), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], output_parser=None,

'The president said that Thermo Fisher is helping to address the challenge of safe drinking water in Ukraine by providing the Ukraine Ministry of Health with ion chromatography instruments, autosamplers, and water purification systems, which will be used in remote monitoring stations across the country to ensure the safety of drinking water.'