# 2024-03-06 
# Question-Answering System on private documents using openai, pinecone and langchain - OPL/APL

GPT models can answer questions based on the data they were trained on. But what if the data is private, or the model was never trained on it?
How can LLMs learn new knowledge? 
1. Fine-tuning on a training set (expensive and time consuming)
2. Model inputs - pass the new knowledge in the prompt (ideal and simple, but limited by the model's maximum input size, e.g. about 4000 tokens)

Question-Answering pipeline 
1. Prepare the document (once per document)
   - Load the data into Langchain Documents
   - Split the documents into chunks
   - Embed the chunks into numeric vectors
   - Save the chunks and the embeddings to a vector database (Pinecone, Milvus or Qdrant)
  
2. Search (once per query)
   - Embed the user's question
   - Using the question's embeddings and the chunk embeddings, rank the vectors by similarity to the question's embedding. The nearest vectors represent the chunks similar to the question.
  
3. Ask (once per query)
   - Insert the question and the most relevant chunks into a message to a GPT model.
   - Return GPT's answer.
   - 

In [3]:
# Locate a .env file and load its variables into the process environment.
# override=True lets values from the .env file replace variables that are already set.
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)


True

In [None]:
# pip install -q pypdf

In [None]:
# pip install jq -q

In [None]:
# pip install unstructured -q


In [None]:
def load_document(file):
    """Load a document and return the result of the matching LangChain loader.

    The loader is chosen from the file extension, matched case-insensitively:
    ``.pdf`` (PyPDFLoader), ``.docx`` (Docx2txtLoader), ``.md``
    (UnstructuredMarkdownLoader) and ``.json`` (JSONLoader).

    Args:
        file: Path of the document to load.

    Returns:
        Whatever the selected loader's ``load()`` returns (for PDFs, one
        document per page with its content and metadata), or ``None`` when
        the extension is not supported.
    """
    import os

    _, extension = os.path.splitext(file)
    kind = extension.lower()  # '.PDF' and '.pdf' must select the same loader
    print(f'Loading {file}')

    if kind == '.pdf':
        # pip install pypdf
        from langchain_community.document_loaders import PyPDFLoader
        loader = PyPDFLoader(file)
    elif kind == '.docx':
        from langchain_community.document_loaders import Docx2txtLoader
        loader = Docx2txtLoader(file)
    elif kind == '.md':
        # pip install unstructured
        from langchain_community.document_loaders import UnstructuredMarkdownLoader
        loader = UnstructuredMarkdownLoader(file)
    elif kind == '.json':
        # pip install jq
        from langchain_community.document_loaders import JSONLoader
        # '.' is jq's identity filter, so the whole file becomes the content.
        # text_content=False is required because that content is usually a
        # dict/list rather than a plain string. Use a narrower jq_schema
        # (e.g. '.[]') to get one document per array element.
        loader = JSONLoader(
            file_path=file,
            jq_schema='.',
            text_content=False
        )
    else:
        print(f'Document format {extension} is not supported')
        return None

    return loader.load()


Display data / Running Code

In [None]:
# Load the sample PDF and print the text of the document at index 1.
data = load_document('files/ghana_constitution.pdf')
print(data[1].page_content)
# print(data[10].metadata)
# print(len(data))
# print(f'There are {len(data[10].page_content)} characters in page 10')
# NOTE: a URL of a PDF can also be passed to the function, and it will retrieve the data as needed.

In [None]:
# data = load_document('files/sample_docx_file.docx')
# print(data[0].page_content)

In [None]:
# data = load_document('files/chat_history.json')
# print(data[0]) ## still not working, take some time to adjust jq_schema above

In [None]:
# data = load_document('README.md')
# print(data[0])

In [None]:
# loading wikipedia external loaders
def load_from_wikipedia(query, lang='en', load_max_docs=2):
    """Load Wikipedia content for ``query`` through LangChain's WikipediaLoader.

    Args:
        query: Search text passed to the loader.
        lang: Language code passed to the loader.
        load_max_docs: Maximum number of documents passed to the loader.

    Returns:
        Whatever ``WikipediaLoader.load()`` returns.
    """
    # Imported from langchain_community, like the other loaders in this file;
    # the langchain.document_loaders path is deprecated.
    from langchain_community.document_loaders import WikipediaLoader

    loader = WikipediaLoader(query=query, lang=lang, load_max_docs=load_max_docs)
    return loader.load()



In [None]:
# data = load_from_wikipedia('GPT-4')
# print(data[0])

# Chunking
- is the process of breaking down large pieces of text into smaller segments.
- It's an essential technique that helps optimize the relevance of the content we get back from a vector database.
- 

In [None]:
def chunk_data(data, chunk_size=256, chunk_overlap=0):
    """Split documents into chunks with RecursiveCharacterTextSplitter.

    Args:
        data: Documents to split, as accepted by ``split_documents``.
        chunk_size: Chunk size passed to the splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter. Defaults to 0, the value that used to be hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)

    

In [None]:
# try loading data from file and chunking all at once
# data = load_document('files/ghana_constitution.pdf')
# chunks = chunk_data(data)
# print(f'We have {len(chunks)} in ghana constitution file of {len(data)} pages')


In [None]:
#print(chunks[0].page_content)

# Check and Print Embedding cost

In [None]:
# check embedding costs
def print_embedding_costs(texts, model='text-embedding-ada-002', price_per_1k_tokens=0.0004):
    """Print the token count of ``texts`` and the estimated embedding cost.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        model: Model name used to pick the tiktoken encoding.
        price_per_1k_tokens: Price in USD per 1000 tokens.
            NOTE(review): the default keeps the previously hard-coded value;
            check it against the provider's current price list.
    """
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * price_per_1k_tokens:.6f}')



In [None]:
# print_embedding_costs(chunks)

Embedding and Uploading to a Vector Database (PineCone)

In [None]:
# inserting of embeddings method
def insert_or_fetch_embeddings(index_name, chunks):
    """Return a LangChain Pinecone vector store backed by ``index_name``.

    If the index already exists it is opened as-is and ``chunks`` is not
    used. Otherwise the index is created (1536 dimensions, cosine metric,
    'gcp-starter' pod environment) and ``chunks`` are embedded into it.
    """
    import pinecone
    from pinecone import PodSpec
    from langchain_community.vectorstores import Pinecone
    from langchain_openai import OpenAIEmbeddings

    embedder = OpenAIEmbeddings()
    client = pinecone.Pinecone()

    # Existing index: reuse the stored embeddings.
    if index_name in client.list_indexes().names():
        print(f'Index {index_name} already exists. Loading embeddings ...', end='')
        store = Pinecone.from_existing_index(index_name, embedder)
        print('Ok\n')
        return store

    # New index: create it, then embed and upload the chunks.
    print(f'Creating index {index_name} and embeddings ...', end='')
    client.create_index(
        name=index_name,
        dimension=1536,
        metric='cosine',
        spec=PodSpec(environment='gcp-starter'),
    )
    print(f'Index_name: {index_name}, Chunks: {len(chunks)}...', end='')
    store = Pinecone.from_documents(chunks, embedder, index_name=index_name)
    print('Ok\n')
    return store


Delete Index Function

In [4]:
# delete index function
def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every listed index when ``index_name`` is 'all'."""
    from pinecone import Pinecone

    client = Pinecone()

    # Single-index case first, so the "delete everything" path reads on its own.
    if index_name != 'all':
        print(f'\nDeleting index {index_name} ...', end='')
        client.delete_index(index_name)
        print('Ok\n')
        return

    names = client.list_indexes().names()
    print(names)
    print('Deleting all indexes...')
    for name in names:
        client.delete_index(name)
    print('Ok\n')
        

Ask and Getting Answers

In [None]:
# ask and get answer function
def ask_and_get_answer(vector_store, q, k=3):
    """Answer ``q`` with a RetrievalQA chain over ``vector_store``.

    Args:
        vector_store: Object exposing ``as_retriever``.
        q: The question passed to the chain.
        k: Number of similar chunks the retriever is asked for. Defaults
            to 3, the value that used to be hard-coded.

    Returns:
        Whatever ``chain.invoke(q)`` returns.
    """
    from langchain.chains import RetrievalQA
    from langchain_openai import ChatOpenAI

    # NOTE(review): temperature=1 is kept as-is; the other chains in this file use 0.
    llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=1)
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': k})

    chain = RetrievalQA.from_chain_type(llm=llm, chain_type='stuff', retriever=retriever)
    return chain.invoke(q)
    

Running Code

In [5]:
# WARNING: called without arguments, index_name defaults to 'all', so every listed index is deleted.
delete_pinecone_index()

[]
Deleting all indexes...
Ok



In [None]:
# test_pincecone_usage_q-n_a():
def test_pinecone_usage_q_n_a():
    """Index the sample PDF in Pinecone and print the answers to two questions."""
    # Build (or reuse) the 'askadocument' index from the chunked PDF.
    index_name = 'askadocument'
    pages = load_document('files/ghana_constitution.pdf')
    store = insert_or_fetch_embeddings(index_name, chunk_data(pages))

    # Query the vector store, one question at a time.
    questions = (
        'What is the whole document about?',
        'What is the legal age requirements of President of Ghana?',
    )
    for question in questions:
        print(ask_and_get_answer(store, question))


In [None]:
# test_pinecone_usage_q_n_a()

# Continuous Questions until quit

In [None]:
def ask_question_until_quit(vector_store=None):
    """Read questions from the user and print answers until 'quit' or 'exit'.

    Args:
        vector_store: Vector store handed to ``ask_and_get_answer``. When
            omitted, the module-level ``vector_store`` is used, so callers
            that set that global beforehand keep working.

    Raises:
        NameError: If no vector store is passed and no module-level
            ``vector_store`` exists.
    """
    if vector_store is None:
        # Fail before prompting instead of after the first question is typed.
        try:
            vector_store = globals()['vector_store']
        except KeyError:
            raise NameError(
                "name 'vector_store' is not defined; pass a vector store to "
                'ask_question_until_quit()'
            ) from None

    i = 1
    print('Write/input Quit or Exit to quit.')
    while True:
        # Strip so that ' quit ' is recognised and not sent to the model.
        q = input(f'Question #{i}: ').strip()
        if q.lower() in ('quit', 'exit'):
            print('Quiting ... bye bye!')
            break
        if not q:
            # Nothing was typed: ask again rather than query the model.
            continue
        i += 1
        answer = ask_and_get_answer(vector_store, q)
        print(f'\nAnswer: {answer}')
        print(f'\n {"-" * 50} \n')


In [None]:
# ask_question_until_quit()


In [None]:
def load_wiki_data(topic):
    """Rebuild the 'wikipedia' Pinecone index from Wikipedia content on ``topic``.

    Calls ``delete_pinecone_index()`` with its default argument first, then
    loads, chunks and embeds the English Wikipedia results for the topic.
    Returns the resulting vector store.
    """
    delete_pinecone_index()

    articles = load_from_wikipedia(topic, 'en')
    pieces = chunk_data(articles)
    print(f'\nData: {len(articles)}, Chunks: {len(pieces)}\n', end='')

    return insert_or_fetch_embeddings('wikipedia', pieces)


In [None]:
# ask wikipedia a question
# topic = 'French History'
# vector_store = load_wiki_data(topic)
# q = f'What is {topic}?'
# answer = ask_and_get_answer(vector_store, q)

In [None]:
#print(answer)

# RAG - Retrieval Augmented Generation
- helps overcome knowledge limits, makes answers more factual, and lets the model handle complex questions.
- 

# Using Chroma as a Vector DB
- install using pip install -q chromadb


In [None]:
# pip install -q chromadb

In [None]:
# Creating Embeddings - Chroma
def create_embeddings_chroma(chunks, persist_directory='./chroma_db'):
    """Embed ``chunks`` into a Chroma vector store and return the store.

    Uses OpenAI's 'text-embedding-3-small' model with 1536 dimensions and
    hands ``persist_directory`` to ``Chroma.from_documents``.
    """
    from langchain_openai import OpenAIEmbeddings
    from langchain_community.vectorstores import Chroma

    return Chroma.from_documents(
        chunks,
        OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536),
        persist_directory=persist_directory,
    )

    

In [None]:
# Loading Embeddings - Chroma
def load_embeddings_chroma(persist_directory='./chroma_db'):
    """Open the Chroma vector store located at ``persist_directory``.

    The store is constructed with the same embedding settings used when
    creating it ('text-embedding-3-small', 1536 dimensions).
    """
    from langchain_openai import OpenAIEmbeddings
    from langchain_community.vectorstores import Chroma

    return Chroma(
        persist_directory=persist_directory,
        embedding_function=OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536),
    )



In [None]:
# testing chroma usage
def test_chroma_usage():
    """Ask one question against the Chroma store of the sample PDF.

    The store in './chroma_db' is reused when that directory exists.
    Otherwise the PDF is loaded, chunked and embedded first, so the
    function no longer loads the PDF only to throw the chunks away.
    """
    import os

    persist_directory = './chroma_db'
    if os.path.isdir(persist_directory):
        vector_store = load_embeddings_chroma(persist_directory)
    else:
        # First run: build the store (this calls the embedding API).
        data = load_document('files/ghana_constitution.pdf')
        chunks = chunk_data(data)
        vector_store = create_embeddings_chroma(chunks, persist_directory)

    query = 'Who are the next inline to govern the country after the Vice-President?'
    answer = ask_and_get_answer(vector_store, query)
    print(answer)



In [None]:
# pip install chromadb -q

In [None]:
# test_chroma_usage()


# Memory- Adding Memory (Chat history)



In [None]:
def add_memory_to_chat(vector_store=None):
    """Build a ConversationalRetrievalChain that keeps the chat history in memory.

    Args:
        vector_store: Vector store to retrieve from. When omitted, the
            module-level ``vector_store`` is used, so callers that set that
            global beforehand keep working.

    Returns:
        The chain created by ``ConversationalRetrievalChain.from_llm``.

    Raises:
        NameError: If no vector store is passed and no module-level
            ``vector_store`` exists.
    """
    if vector_store is None:
        # Resolve the store up front so a missing one is reported clearly.
        try:
            vector_store = globals()['vector_store']
        except KeyError:
            raise NameError(
                "name 'vector_store' is not defined; pass a vector store to "
                'add_memory_to_chat()'
            ) from None

    from langchain_openai import ChatOpenAI
    from langchain.chains import ConversationalRetrievalChain
    from langchain.memory import ConversationBufferMemory

    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=0)
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 5})
    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

    crc = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        memory=memory,
        chain_type='stuff',
        verbose=True
    )
    return crc


In [None]:
def ask_memorized_question(q, chain):
    """Send question ``q`` to ``chain`` and return the chain's result."""
    return chain.invoke({'question': q})



In [None]:
def test_chroma_memorized_usage():
    """Ask one question through the memory-enabled chain over the Chroma store.

    The store in './chroma_db' is reused when that directory exists.
    Otherwise the sample PDF is loaded, chunked and embedded first.
    """
    import os

    # add_memory_to_chat() reads the module-level `vector_store`, so the store
    # has to be bound globally, and before the chain is built.
    global vector_store

    persist_directory = './chroma_db'
    if os.path.isdir(persist_directory):
        vector_store = load_embeddings_chroma(persist_directory)
    else:
        # First run: build the store (this calls the embedding API).
        data = load_document('files/ghana_constitution.pdf')
        chunks = chunk_data(data)
        vector_store = create_embeddings_chroma(chunks, persist_directory)

    chain = add_memory_to_chat()
    query = 'Who are the next inline to govern the country after the Vice-President?'
    answer = ask_memorized_question(query, chain)
    print(answer)



In [None]:
#test_chroma_memorized_usage()

# 20240307
# Using a Custom Prompt

In [None]:
def using_custom_prompts(vector_store=None):
    """Build a ConversationalRetrievalChain that answers with a custom prompt.

    Args:
        vector_store: Vector store to retrieve from. When omitted, the
            module-level ``vector_store`` is used, so callers that set that
            global beforehand keep working.

    Returns:
        The chain created by ``ConversationalRetrievalChain.from_llm``.

    Raises:
        NameError: If no vector store is passed and no module-level
            ``vector_store`` exists.
    """
    if vector_store is None:
        # Resolve the store up front so a missing one is reported clearly.
        try:
            vector_store = globals()['vector_store']
        except KeyError:
            raise NameError(
                "name 'vector_store' is not defined; pass a vector store to "
                'using_custom_prompts()'
            ) from None

    from langchain_openai import ChatOpenAI
    from langchain.chains import ConversationalRetrievalChain
    from langchain.memory import ConversationBufferMemory
    from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

    # Model name fixed: it used to contain a stray dot ('gpt-3.5.-turbo').
    llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 5})
    # Keyword names fixed: the memory class expects memory_key / return_messages.
    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

    system_template = r'''
    Use the following pieces of context to answer the user's question.
    --------------------
    Context: ```{context}```

    '''

    user_template = '''
    Question: ```{question}```
    Chat History: ```{chat_history}```
    '''

    messages = [
        SystemMessagePromptTemplate.from_template(system_template),
        HumanMessagePromptTemplate.from_template(user_template)
    ]

    qa_prompt = ChatPromptTemplate.from_messages(messages)

    crc = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        memory=memory,
        chain_type='stuff',
        combine_docs_chain_kwargs={'prompt': qa_prompt},
        verbose=True
    )
    # The chain used to be built and then dropped; hand it to the caller.
    return crc



In [None]:
# Bind the store to the module-level name `vector_store`, which
# using_custom_prompts() reads when building its retriever.
vector_store = load_embeddings_chroma()
# NOTE: this requires using_custom_prompts() to return the chain it builds.
crc = using_custom_prompts()
q = 'How many pairs of questions and answers has the StackOverfolw dataset?'
# ask_memorized_question is the helper defined in this file; `ask_question` does not exist.
result = ask_memorized_question(q, crc)
print(result)