# Document Question Answering with local persistence

An example of using Chroma DB and LangChain to do question answering over documents, with a locally persisted database. 
You can store embeddings and documents, then use them again later.

In [17]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import VectorDBQA
from langchain.document_loaders import PyPDFLoader, csv_loader, DirectoryLoader
from api import apikey

## Load and process documents

Load documents to do question answering over. If you want to do this over your documents, this is the section you should replace.

Next we split documents into small chunks. This is so we can find the most relevant chunks for a query and pass only those into the LLM.

In [44]:
a = DirectoryLoader('pdfs/')

In [45]:
a

<langchain.document_loaders.directory.DirectoryLoader at 0x15e0591f390>

In [46]:
pdfss = []
csvs = []
# def vactorDB():
import os
for file in os.listdir('pdfs/'):
    if file.endswith((".pdf")):
        file_path = os.path.join('pdfs/', file)
        # print(file_path)
        from langchain.document_loaders import PyPDFLoader

        loader = PyPDFLoader(file_path)
        pages = loader.load()
        pdfss.append(pages)
    elif file.endswith((".csv")):
        file_path = os.path.join('pdfs/', file)
        # import pandas as pd
        # df = pd.read_csv(file_path)
        # df = df.to_json(orient='records', force_ascii=False, lines=True, default_handler=str)
        loader = csv_loader.CSVLoader(file_path)
        # print(loader)
        pages = loader.load()
        csvs.append(pages)
        # csvs.append(df)

In [47]:
final = pdfss + csvs

In [48]:
from api import apikey
embeddings = OpenAIEmbeddings(openai_api_key=apikey)
for i in range(len(final)):
    db = Chroma.from_documents(final[i], embeddings, persist_directory="mydb")
    db.persist()
    print(i)

# # db.get()

0
1


In [49]:
print(len(pdfss))
print(len(csvs))

2
0


In [50]:

print(len(final))

2


In [51]:
embeddings = OpenAIEmbeddings(openai_api_key=apikey)
db = Chroma(persist_directory="mydb", embedding_function=embeddings)

In [52]:
def retrieve_combined_documents(query, max_combined_docs=2):
    retriever = db.as_retriever(search_type="mmr")

    rev_doc = retriever.get_relevant_documents(query)
    lim_rev_doc = rev_doc[:max_combined_docs]

    docs = db.similarity_search(query)
    lim_docs = docs[:max_combined_docs]

    combined_docs = str(lim_rev_doc) + str(lim_docs)


    return combined_docs

In [53]:
retrieve_combined_documents('can you send me location of Bounce KHI?')

"[Document(page_content='Joytainment  \\nBooking Procedures for Corporate Events : \\n\\uf0b7 Process : For entertainment services tailored  to corporate events, requests should be forwarded \\nto the Joytainment team.  \\n\\uf0b7 Contact Information : Team members  can be reached at +923092228910 for bookings and \\ninquiries.  \\nAdditional Charges for Customizing Services : \\n\\uf0b7 Charges Communication : \\n \\nBounce (Ocean Mal l, Karachi) Overview:  \\nLocation:  \\nhttps://goo.gl/maps/LCA6E1iLLwmDL6GA8  \\nAddress:  2nd Floor, Ocean Towers, Karachi, Block 9 Clifton, Karachi, Karachi City, Sindh 75600  \\nOperating Hours:  \\n\\uf0b7 Monday -Thursday: 2 PM – 11 PM  \\n\\uf0b7 Friday: 3 PM – 11 PM  \\n\\uf0b7 Saturday: 12 PM – 12 AM  \\n\\uf0b7 Sunday: 12 PM – 11 PM  \\nAge Limit:  \\n\\uf0b7 Acceptable age for participation: 2.5 years and above.  \\nGeneral Customer Instructions:  \\n\\uf0b7 Mandatory use of grip socks in the arena.  \\n\\uf0b7 No gum, candies, food, or drinks

## Persist the Database
In a notebook, we should call `persist()` to ensure the embeddings are written to disk.
This isn't necessary in a script - the database will be automatically persisted when the client object is destroyed.

## Load the Database from disk, and create the chain
Be sure to pass the same `persist_directory` and `embedding_function` as you did when you instantiated the database. Initialize the chain we will use for question answering.

## Ask questions!

Now we can use the chain to ask questions!

## Cleanup

When you're done with the database, you can delete it from disk. You can delete the specific collection you're working with (if you have several), or delete the entire database by nuking the persistence directory.

In [10]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
def generate_prompt(prompt,system_prompt):
    
    system_prompt = """
I want you to act as Metutors platform Assistant, you job is to answer users query by using given context.
    if user ask in Arabic language your answers must be in Arabic otherwise in english. use the given data to generate answer.
    if you cant find any relevent information
    inside the context just ask dont tell him you dont have the data given user send contact on support on support@metutors.com.
        if user ask in Arabic language your answers should be in Arabic otherwise in english.
    This is important that your answers should be in detailed.
      use the given data to generate answer.
      
    """.strip()
    return f"""
    [INST] <>
    {system_prompt}
    <>

    {prompt} [/INST]
    """.strip()

In [63]:
SYSTEM_PROMPT = """
I want you to act as Metutors platform Assistant, you job is to answer users query by using given context.
if user ask in Arabic language your answers must be in Arabic otherwise in english. use the given data to generate answer.
if you cant find any relevent information
inside the context just ask dont tell him you dont have the data given user send contact on support on support@metutors.com.
    if user ask in Arabic language your answers should be in Arabic otherwise in english.
This is important that your answers should be in detailed.
    use the given data to generate answer.

    if user ask about any pricing find cost in the given context and tell him
    

"""
llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY)
llm.model_name = "gpt-4-1106-preview"
template = generate_prompt(
    """
{context}

Question: {question}
""",
    system_prompt=SYSTEM_PROMPT,
)
# load from disk
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
db3 = Chroma(persist_directory="db", embedding_function=embeddings)
db3.get()
prompt = PromptTemplate(template=template, input_variables=["context", "question"])
# path = os.getcwd()+"//new//"
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db3.as_retriever(search_kwargs={"k": 2}),
    return_source_documents=True
)

In [11]:
# import
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

# loader = PyPDFLoader("/content/merged_output.pdf")
# documents = loader.load()

# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
# docs = text_splitter.split_documents(documents)

# embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# db = Chroma.from_documents(db, embedding)

def retrieve_combined_documents(query, max_combined_docs=2):
    retriever = db.as_retriever(search_type="mmr")

    rev_doc = retriever.get_relevant_documents(query)
    lim_rev_doc = rev_doc

    docs = db.similarity_search(query)
    lim_docs = docs

    combined_docs = lim_rev_doc + lim_docs

    return combined_docs

In [12]:
a = retrieve_combined_documents('Bounce (Ocean Mall, Karachi) timing')

In [14]:
len(str(a))

8968