# Project: Question-Answering on Private Documents (RAG)

In [1]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

False

In [2]:
! pip install pypdf

Collecting pypdf
  Downloading pypdf-5.4.0-py3-none-any.whl.metadata (7.3 kB)
Downloading pypdf-5.4.0-py3-none-any.whl (302 kB)
Installing collected packages: pypdf
Successfully installed pypdf-5.4.0


In [3]:
! pip install docx2txt

Collecting docx2txt
  Downloading docx2txt-0.9-py3-none-any.whl.metadata (529 bytes)
Downloading docx2txt-0.9-py3-none-any.whl (4.0 kB)
Installing collected packages: docx2txt
Successfully installed docx2txt-0.9


In [4]:
! pip install wikipedia



In [5]:
! pip install openai langchain tiktoken



## Loading Documents

In [6]:
# loading PDF, DOCX and TXT files as LangChain Documents
def load_document(file):
    """Load a PDF, DOCX or TXT file through the matching LangChain loader.

    The loader is selected from the file extension. The extension is compared
    case-insensitively, so ``REPORT.PDF`` is handled like ``report.pdf``.

    Args:
        file: Path of the document to load.

    Returns:
        Whatever the selected loader's ``load()`` returns, or ``None`` (after
        printing a message) when the extension is not ``.pdf``, ``.docx`` or
        ``.txt``.
    """
    import os

    # Only the extension is needed; lower-case it so '.PDF' / '.Docx' match too.
    extension = os.path.splitext(file)[1].lower()

    # Loader classes are imported lazily so only the branch that is used is imported.
    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        loader_cls = PyPDFLoader
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        loader_cls = Docx2txtLoader
    elif extension == '.txt':
        from langchain.document_loaders import TextLoader
        loader_cls = TextLoader
    else:
        print('Document format is not supported!')
        return None

    # Same progress message for every supported format (the .txt branch used to be silent).
    print(f'Loading {file}')
    return loader_cls(file).load()


  

In [8]:
# loading Wikipedia articles as LangChain Documents
def load_from_wikipedia(query, lang='en', load_max_docs=2):
    """Load Wikipedia content for `query` through LangChain's `WikipediaLoader`.

    Args:
        query: Search text handed to the loader.
        lang: Language code handed to the loader (default ``'en'``).
        load_max_docs: Forwarded to the loader unchanged (default 2).

    Returns:
        The result of the loader's ``load()`` call.
    """
    from langchain.document_loaders import WikipediaLoader

    return WikipediaLoader(
        query=query,
        lang=lang,
        load_max_docs=load_max_docs,
    ).load()
  

In [9]:
# Try the Wikipedia loader; the returned value is shown as the cell output.
load_from_wikipedia("who is elon musk?")

[Document(metadata={'title': 'Wealth of Elon Musk', 'summary': 'Elon Musk is the wealthiest person in the world, with an estimated net worth of US$330 billion as of March 2025, according to the Bloomberg Billionaires Index, and $359.5 billion according to Forbes, primarily from his ownership stakes in Tesla and SpaceX. \nHaving been first listed on the Forbes Billionaires List in 2012, around 75% of Musk\'s wealth was derived from Tesla stock in November 2020. Describing himself as "cash poor", he became the first person in the world to have a net worth above $300 billion a year later. By December 2024, he became the first person to reach a net worth of $400 billion.', 'source': 'https://en.wikipedia.org/wiki/Wealth_of_Elon_Musk'}, page_content='Elon Musk is the wealthiest person in the world, with an estimated net worth of US$330 billion as of March 2025, according to the Bloomberg Billionaires Index, and $359.5 billion according to Forbes, primarily from his ownership stakes in Tesla

In [4]:
# Try the file loader on a local PDF; the returned value is shown as the cell output.
load_document('react.pdf')

Loading react.pdf


[Document(metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-03-13T00:09:11+00:00', 'author': '', 'keywords': '', 'moddate': '2023-03-13T00:09:11+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'react.pdf', 'total_pages': 33, 'page': 0, 'page_label': '1'}, page_content='Published as a conference paper at ICLR 2023\nREAC T: S YNERGIZING REASONING AND ACTING IN\nLANGUAGE MODELS\nShunyu Yao∗*,1, Jeffrey Zhao2, Dian Yu2, Nan Du2, Izhak Shafran2, Karthik Narasimhan1, Yuan Cao2\n1Department of Computer Science, Princeton University\n2Google Research, Brain team\n1{shunyuy,karthikn}@princeton.edu\n2{jeffreyzhao,dianyu,dunan,izhak,yuancao}@google.com\nABSTRACT\nWhile large language models (LLMs) have demonstrated impressive performance\nacross tasks in language understanding and interactive decision making, their\nabilities 

## Chunking Data

In [10]:
def chunk_data(data, chunk_size=256, chunk_overlap=0):
    """Split loaded documents into chunks with `RecursiveCharacterTextSplitter`.

    Args:
        data: Documents to split; passed to the splitter's ``split_documents``.
        chunk_size: ``chunk_size`` given to the splitter (default 256).
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to 0,
            which is the value this function previously hard-coded, so existing
            callers get the same result.

    Returns:
        The value returned by ``split_documents(data)``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)
    

## Calculating Cost

In [11]:
def print_embedding_cost(texts):
    """Print the token count of `texts` and the resulting embedding cost.

    Tokens are counted with the tiktoken encoding for
    ``text-embedding-3-small``. The cost line applies the rate written into
    the expression below: 0.00002 USD per 1000 tokens.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
    """
    import tiktoken

    encoder = tiktoken.encoding_for_model('text-embedding-3-small')

    # Accumulate the token count document by document.
    total_tokens = 0
    for doc in texts:
        total_tokens += len(encoder.encode(doc.page_content))

    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * 0.00002:.6f}')

## Embedding and Uploading to a Vector Database (Pinecone)

In [12]:
def insert_or_fetch_embeddings(index_name, chunks):
    """Return a Pinecone vector store for `index_name`, creating the index when missing.

    If an index with that name already exists, a vector store is built on top
    of it and `chunks` is not used. Otherwise a new index is created, `chunks`
    are embedded with OpenAI's ``text-embedding-3-small`` model and inserted.

    Args:
        index_name: Name of the Pinecone index to load or create.
        chunks: Documents to embed when the index has to be created.

    Returns:
        The LangChain Pinecone vector store object.
    """
    # importing the necessary libraries and initializing the Pinecone client
    import pinecone
    from langchain_community.vectorstores import Pinecone
    from langchain_openai import OpenAIEmbeddings
    from pinecone import PodSpec

    pc = pinecone.Pinecone()

    # One value shared by the embedding model and the index so the two stay in sync.
    dimension = 1536
    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=dimension)

    # Compare against the index *names*, read the same way delete_pinecone_index()
    # reads them. The raw list_indexes() result is a listing object rather than a
    # list of name strings (see the Pinecone client docs), so a plain `in` test on
    # it does not detect an existing index.
    if index_name in pc.list_indexes().names():
        # loading from existing index
        print(f'Index {index_name} already exists. Loading embeddings ... ', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('Ok')
    else:
        # creating the index and embedding the chunks into the index
        print(f'Creating index {index_name} and embeddings ...', end='')

        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric='cosine',
            spec=PodSpec(
                environment='gcp-starter'
            )
        )

        # Embed the chunks with the `OpenAIEmbeddings` instance, insert them into
        # the new index and get back a vector store bound to that index.
        vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
        print('Ok')

    return vector_store
    

In [13]:
def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every index when `index_name` is ``'all'``.

    Args:
        index_name: Name of the index to delete. The default ``'all'`` deletes
            each index name reported by ``list_indexes().names()``.
    """
    import pinecone

    client = pinecone.Pinecone()

    # Single-index case first, then fall through to the delete-everything path.
    if index_name != 'all':
        print(f'Deleting index {index_name} ...', end='')
        client.delete_index(index_name)
        print('Ok')
        return

    names = client.list_indexes().names()
    print('Deleting all indexes ... ')
    for name in names:
        client.delete_index(name)
    print('Ok')
    

## Asking and Getting Answers

In [14]:
def ask_and_get_answer(vector_store, q, k=3):
    """Answer question `q` with a `RetrievalQA` chain backed by `vector_store`.

    Args:
        vector_store: Object exposing ``as_retriever``; used as the retrieval source.
        q: The question passed to the chain's ``invoke``.
        k: Value placed in the retriever's ``search_kwargs`` under ``'k'`` (default 3).

    Returns:
        Whatever the chain's ``invoke(q)`` returns.
    """
    from langchain.chains import RetrievalQA
    from langchain_openai import ChatOpenAI

    # Build the chain in one expression: chat model first, then the similarity retriever.
    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(model='gpt-3.5-turbo', temperature=1),
        chain_type="stuff",
        retriever=vector_store.as_retriever(
            search_type='similarity',
            search_kwargs={'k': k},
        ),
    )
    return qa_chain.invoke(q)
    

## Using Chroma as a Vector DB

In [15]:
! pip install chromadb

Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6.tar.gz (32 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.18.1-py2.py3-none-any.whl.metadata (2.9 kB)
Collecting onnxruntime>=1.14

  error: subprocess-exited-with-error
  
  × Building wheel for chroma-hnswlib (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [5 lines of output]
      running bdist_wheel
      running build
      running build_ext
      building 'hnswlib' extension
      error: Microsoft Visual C++ 14.0 or greater is required. Get it with "Microsoft C++ Build Tools": https://visualstudio.microsoft.com/visual-cpp-build-tools/
      [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
  ERROR: Failed building wheel for chroma-hnswlib
ERROR: Could not build wheels for chroma-hnswlib, which is required to install pyproject.toml-based projects


In [34]:
def create_embeddings_chroma(chunks, persist_directory='./chroma_db'):
    """Embed `chunks` with OpenAI and store them in a Chroma vector store.

    Args:
        chunks: Documents handed to ``Chroma.from_documents``.
        persist_directory: Directory passed to Chroma as ``persist_directory``
            (default ``'./chroma_db'``).

    Returns:
        The vector store returned by ``Chroma.from_documents``.

    NOTE(review): presumably every call adds `chunks` again to whatever the
    directory already holds - confirm before re-running on the same directory.
    """
    from langchain.vectorstores import Chroma
    from langchain_openai import OpenAIEmbeddings

    # The embedding model is created inline and handed straight to Chroma.
    return Chroma.from_documents(
        chunks,
        OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536),
        persist_directory=persist_directory,
    )


In [35]:
def load_embeddings_chroma(persist_directory='./chroma_db'):
    """Open the Chroma vector store located at `persist_directory`.

    The store is opened with an ``OpenAIEmbeddings`` instance configured with
    the same model name and dimensions that ``create_embeddings_chroma`` uses.

    Args:
        persist_directory: Directory passed to Chroma as ``persist_directory``
            (default ``'./chroma_db'``).

    Returns:
        The constructed ``Chroma`` vector store.
    """
    from langchain.vectorstores import Chroma
    from langchain_openai import OpenAIEmbeddings

    return Chroma(
        persist_directory=persist_directory,
        embedding_function=OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536),
    )


#### Running Code

In [36]:
# Loading the pdf document into LangChain
data = load_document('files/rag_powered_by_google_search.pdf')

# Splitting the document into chunks
chunks = chunk_data(data, chunk_size=256)

# Creating a Chroma vector store from the chunks; create_embeddings_chroma() embeds them with
# text-embedding-3-small and uses its default persist directory ./chroma_db
vector_store = create_embeddings_chroma(chunks)

Loading files/rag_powered_by_google_search.pdf


In [37]:
# Asking a question against the vector store built above and printing the full returned value
q = 'What is Vertex AI Search?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

{'query': 'What is Vertex AI Search?', 'result': "Vertex AI Search is a component of Google's Vertex AI platform that offers customizable answers, search tuning, vector search, grounding, and compliance updates tailored for enterprises. It provides new generative AI capabilities and enterprise-ready features to enhance search functionalities and information retrieval for businesses."}


In [38]:
# Print only the answer text stored under the 'result' key
print(answer['result'])

Vertex AI Search is a component of Google's Vertex AI platform that offers customizable answers, search tuning, vector search, grounding, and compliance updates tailored for enterprises. It provides new generative AI capabilities and enterprise-ready features to enhance search functionalities and information retrieval for businesses.


In [39]:
# Load a Chroma vector store from the specified directory (default ./chroma_db)
db = load_embeddings_chroma()
q = 'How many pairs of questions and answers had the StackOverflow dataset?'
# Query the store that was just loaded from disk (`db`), not the in-memory `vector_store`;
# otherwise this cell never exercises the persisted embeddings.
answer = ask_and_get_answer(db, q)
print(answer)

{'query': 'How many pairs of questions and answers had the StackOverflow dataset?', 'result': 'The StackOverflow dataset had 8 million pairs of questions and answers.'}


In [40]:
# We can't ask follow-up questions. There is no memory (chat history) available:
# ask_and_get_answer() builds a new chain on every call and is given only the current question.
q = 'Multiply that number by 2.'
answer = ask_and_get_answer(vector_store, q)
print(answer['result'])

I'm sorry, but I don't have a specific number from the provided context to multiply by 2.


## Adding Memory (Chat History)

In [41]:
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain  # chain class that combines a retriever with chat history
from langchain.memory import ConversationBufferMemory  # memory class used to store the conversation history

# Instantiate a ChatGPT LLM (temperature controls randomness)
llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)

# Configure the vector store to act as a retriever (similarity search, k=5).
# `vector_store` must already exist from an earlier cell.
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 5})


# Create a memory buffer to track the conversation; the history is kept under the 'chat_history' key
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# Wire the LLM, the retriever and the memory into one conversational retrieval chain
crc = ConversationalRetrievalChain.from_llm(
    llm=llm,  # Link the ChatGPT LLM
    retriever=retriever,  # Link the vector store based retriever
    memory=memory,  # Link the conversation memory
    chain_type='stuff',  # Specify the chain type
    verbose=False  # Set to True to enable verbose logging for debugging
)


In [42]:
# create a function to ask questions
def ask_question(q, chain):
    """Send question `q` to `chain` and return the chain's response.

    Args:
        q: The question text; passed to the chain under the ``'question'`` key.
        chain: Object exposing ``invoke``.

    Returns:
        Whatever ``chain.invoke`` returns.
    """
    return chain.invoke({'question': q})

In [43]:
# Reload the document and its chunks so `data` and `chunks` are available in this section.
data = load_document('files/rag_powered_by_google_search.pdf')
chunks = chunk_data(data, chunk_size=256)
# Reuse the embeddings already persisted in ./chroma_db instead of calling
# create_embeddings_chroma() again: embedding the same chunks a second time into the same
# directory stores every chunk twice (the retriever then returns repeated passages) and
# pays for the embeddings again.
vector_store = load_embeddings_chroma()

Loading files/rag_powered_by_google_search.pdf


In [44]:
# First question through the conversational chain; the printed result is the full value returned by the chain
q = 'How many pairs of questions and answers had the StackOverflow dataset?'
result = ask_question(q, crc)
print(result)

{'question': 'How many pairs of questions and answers had the StackOverflow dataset?', 'chat_history': [HumanMessage(content='How many pairs of questions and answers had the StackOverflow dataset?'), AIMessage(content='The StackOverflow dataset had 8 million pairs of questions and answers.')], 'answer': 'The StackOverflow dataset had 8 million pairs of questions and answers.'}


In [45]:
# Print only the answer text stored under the 'answer' key
print(result['answer'])

The StackOverflow dataset had 8 million pairs of questions and answers.


In [46]:
# Follow-up question that refers back to the previous answer ("that number")
q = 'Multiply that number by 10.'
result = ask_question(q, crc)

In [47]:
# Print the answer to the follow-up question
print(result['answer'])

The result of multiplying 8 million pairs of questions and answers by 10 would be 80 million pairs of questions and answers.


In [48]:
# Second follow-up, referring to "the result" of the previous turn
q = 'Devide the result by 80.'
result = ask_question(q, crc)
print(result['answer'])


The result of dividing 80 million pairs of questions and answers by 80 is 1 million.


In [49]:
# Walk through the chat history returned with the last result and print each entry
for item in result['chat_history']:
    print(item)

content='How many pairs of questions and answers had the StackOverflow dataset?'
content='The StackOverflow dataset had 8 million pairs of questions and answers.'
content='Multiply that number by 10.'
content='The result of multiplying 8 million pairs of questions and answers by 10 would be 80 million pairs of questions and answers.'
content='Devide the result by 80.'
content='The result of dividing 80 million pairs of questions and answers by 80 is 1 million.'


### Loop for asking questions

In [50]:
# Interactive question loop: type exit, quit or bye (any letter case) to stop.
while True:
    q = input('Your question: ').strip()
    if not q:
        # Nothing typed: ask again instead of sending an empty question to the chain.
        continue
    # Compare the whole input against the exit words. The previous substring test
    # (`q.lower() in 'exit quit bye'`) also ended the loop on empty input and on
    # fragments such as 'it' or 'e'.
    if q.lower() in ('exit', 'quit', 'bye'):
        print('Bye bye!')
        break
    result = ask_question(q, crc)
    print(result['answer'])
    print('-' * 100)
    

Your question:  Tell me about Google Search technologies as described in the document.


The document mentions that Google Search technologies have enabled search experiences for public or internal websites, mobile applications, and various enterprise search services. It also highlights that the product is the result of deep collaboration between Google Search and other entities.
----------------------------------------------------------------------------------------------------


Your question:  Is Vertex AI Search a fully-managed platform?


Yes, Vertex AI Search is a fully-managed platform that offers customizable answers, search tuning, vector search, grounding, and compliance updates for enterprises.
----------------------------------------------------------------------------------------------------


Your question:  What advancements have been made in semantic search?


Advancements in semantic search include the ability to understand the context and intent behind a user's query, providing more relevant search results. This is achieved through natural language processing, entity recognition, and machine learning algorithms that help search engines understand the meaning of words and phrases in a query. Additionally, semantic search allows for better understanding of user intent, enabling search engines to deliver more accurate and personalized results.
----------------------------------------------------------------------------------------------------


Your question:  bye


## Using a Custom Prompt

In [68]:
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

# Fresh LLM, retriever and (empty) memory for the custom-prompt chain.
# `vector_store` must already exist from an earlier cell.
llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 5})
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)


# System message: answer from the supplied context only, in Spanish, and reply
# "I don't know." when the context has no answer. {context} is a template placeholder.
system_template = r'''
Use the following pieces of context to answer the user's question.
Before answering translate your response to Spanish.
If you don't find the answer in the provided context, just respond "I don't know."
---------------
Context: ```{context}```
'''

# User message: carries the question through the {question} placeholder.
user_template = '''
Question: ```{question}```
'''

# Combine the two templates into one chat prompt
messages= [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template(user_template)
]

qa_prompt = ChatPromptTemplate.from_messages(messages)

# Rebuild the chain (rebinding `crc`) with the custom prompt passed through
# combine_docs_chain_kwargs; verbose=True prints the formatted prompts.
crc = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    chain_type='stuff',
    combine_docs_chain_kwargs={'prompt': qa_prompt },
    verbose=True
)

In [69]:
# Inspect the assembled prompt object (its input variables and message templates)
print(qa_prompt)

input_variables=['context', 'question'] messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template='\nUse the following pieces of context to answer the user\'s question.\nBefore answering translate your response to Spanish.\nIf you don\'t find the answer in the provided context, just respond "I don\'t know."\n---------------\nContext: ```{context}```\n')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], template='\nQuestion: ```{question}```\n'))]


In [70]:
# NOTE(review): `db` is loaded here but never passed on; `crc` answers through the
# retriever that was built from `vector_store` when the chain was created.
db = load_embeddings_chroma()
q = 'How many pairs of questions and answers had the StackOverflow dataset?'
result = ask_question(q, crc)
print(result)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: 
Use the following pieces of context to answer the user's question.
Before answering translate your response to Spanish.
If you don't find the answer in the provided context, just respond "I don't know."
---------------
Context: ```simple similarity search was highly e ective because the dataset had 8
million pairs of questions and answers. However, datasets do not
usually contain pre-existing question-and-answer or query-and-

simple similarity search was highly e ective because the dataset had 8
million pairs of questions and answers. However, datasets do not
usually contain pre-existing question-and-answer or query-and-

simple similarity search was highly e ective because the dataset had 8
million pairs of questions and answers. However, datasets do not
usually contain pre-existing question-and-answer or query-and-

simple similarity search 

In [71]:
# Question unrelated to the loaded PDF; the system prompt tells the model to reply
# "I don't know." when the context does not contain the answer.
q = 'When was Elon Musk born?'
result = ask_question(q, crc)
print(result)



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:

Human: How many pairs of questions and answers had the StackOverflow dataset?
Assistant: El conjunto de datos de StackOverflow tenía 8 millones de pares de preguntas y respuestas.

Contexto: ```la búsqueda de similitud simple fue altamente efectiva porque el conjunto de datos tenía 8 millones de pares de preguntas y respuestas. Sin embargo, los conjuntos de datos generalmente no contienen preguntas y respuestas preexistentes o consultas y respuestas preexistentes.```
Follow Up Input: When was Elon Musk born?
Standalone question:[0m

[1m> Finished chain.[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: 
Use the following pieces of context to answer t

In [72]:
# Another question unrelated to the loaded PDF, asked through the same chain
q = 'When was Bill Gates born?'
result = ask_question(q, crc)
print(result)



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:

Human: How many pairs of questions and answers had the StackOverflow dataset?
Assistant: El conjunto de datos de StackOverflow tenía 8 millones de pares de preguntas y respuestas.

Contexto: ```la búsqueda de similitud simple fue altamente efectiva porque el conjunto de datos tenía 8 millones de pares de preguntas y respuestas. Sin embargo, los conjuntos de datos generalmente no contienen preguntas y respuestas preexistentes o consultas y respuestas preexistentes.```
Human: When was Elon Musk born?
Assistant: I don't know.
Follow Up Input: When was Bill Gates born?
Standalone question:[0m

[1m> Finished chain.[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;

In [73]:
# Print only the answer text of the last result
print(result['answer'])

No sé.
