# Build a RAG vector database using FAISS and query the documents

This notebook walks you through the step-by-step process of creating a RAG-based chatbot.

In [2]:
! pip install -U langchain langchain-community openai langchain_openai faiss-cpu pypdf # (newest versions required for multi-modal)



In [3]:
# Import all necessary libraries
import requests
from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain_community.document_loaders import PyPDFLoader,TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [4]:
import getpass
import os

# Ask for the OpenAI key interactively only when the environment does not
# already provide a non-empty one; getpass keeps the typed key off the screen.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")


Enter API key for OpenAI:  ········


In [5]:
# Notebook-wide state shared by the helper functions.
# `vectorstore` starts empty and is filled in by process_documents();
# re-running this cell resets it to None.
vectorstore = None
embeddings = OpenAIEmbeddings()

# Prompt with two placeholders: the retrieved context and the user's question.
# NOTE(review): get_qa_chain() as written pulls "rlm/rag-prompt" from the hub
# and does not reference this template — confirm which prompt is intended.
prompt_template = PromptTemplate(
    template="""
    You are an intelligent assistant. Use the following context to answer the user's question accurately:

    Context: {context}

    Question: {question}

    Answer: """,
    input_variables=["context", "question"],
)

In [6]:
# Write the helper functions
def process_documents(file_paths):
    """Load files, split them into chunks, and add the chunks to the vector store.

    Files whose name ends in ".pdf" (case-insensitive) are read with
    PyPDFLoader; everything else is read with TextLoader. A file that fails
    to load is reported and skipped so the remaining files are still indexed.

    Args:
        file_paths: Iterable of paths to index. A single path given as a
            string is treated as a one-element list.

    Returns:
        None. The module-level ``vectorstore`` is created on first use and
        extended on later calls; it is left untouched when nothing was loaded.
    """
    global vectorstore

    # A bare string would otherwise be iterated character by character.
    if isinstance(file_paths, str):
        file_paths = [file_paths]
    print(file_paths)

    documents = []
    for file_path in file_paths:
        print(f"Processing file: {file_path}")

        try:
            is_pdf = str(file_path).lower().endswith(".pdf")
            loader = PyPDFLoader(file_path) if is_pdf else TextLoader(file_path)
            documents.extend(loader.load())
        except Exception as e:
            # Best effort: report the failure and keep going with the other files.
            print(f"Error processing file {file_path}: {e}")
            continue

    # Building a FAISS index needs at least one chunk to embed, so stop here
    # instead of failing inside the vector store when nothing was loaded.
    if not documents:
        print("No documents could be loaded; vector store left unchanged.")
        return

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    docs = text_splitter.split_documents(documents)
    if not docs:
        print("Loaded documents produced no text chunks; vector store left unchanged.")
        return

    # Create or update vector store
    if vectorstore is None:
        vectorstore = FAISS.from_documents(docs, embeddings)
    else:
        vectorstore.add_documents(docs)
    # Total number of vectors currently held in the FAISS index.
    print(vectorstore.index.ntotal)


def format_docs(docs):
    """Merge the ``page_content`` of each document into one string.

    Consecutive documents are separated by a blank line; an empty input
    yields an empty string.
    """
    page_texts = [chunk.page_content for chunk in docs]
    separator = "\n\n"
    return separator.join(page_texts)


def get_qa_chain(prompt=None):
    """Build a retrieval-augmented question-answering chain over the vector store.

    The result is a piped runnable (not the legacy ``RetrievalQA`` class):
    the incoming question is sent to the retriever, the retrieved documents
    are merged by ``format_docs`` into the ``context`` input, and context plus
    question are passed through the prompt, ``ChatOpenAI`` and
    ``StrOutputParser`` in turn.

    Args:
        prompt: Optional prompt that accepts ``context`` and ``question``
            inputs (for example the notebook's ``prompt_template``). When
            omitted, "rlm/rag-prompt" is pulled from the LangChain hub.

    Returns:
        A runnable; call ``.invoke(question)`` with the question string.

    Raises:
        ValueError: If no documents have been indexed yet
            (``vectorstore`` is still None).
    """
    if vectorstore is None:
        raise ValueError("Vector store is not initialized. Upload documents first.")

    # Only fetch the hub prompt when the caller did not supply one.
    if prompt is None:
        prompt = hub.pull("rlm/rag-prompt")

    retriever = vectorstore.as_retriever()
    qa_chain = (
        {
            # The question is routed to the retriever; its hits become the context.
            "context": retriever | format_docs,
            # The question itself is forwarded unchanged.
            "question": RunnablePassthrough(),
        }
        | prompt
        | ChatOpenAI()
        | StrOutputParser()
    )

    return qa_chain

In [7]:
# Download sample files for RAG
def download_pdf(url, filename, timeout=30):
    """Download the file at ``url`` and save it as ``filename``.

    Args:
        url: Address of the file to fetch.
        filename: Local path the response body is written to (binary mode).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely.

    Returns:
        True if the file was written, False otherwise. Failures are reported
        with a printed message rather than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection problems and timeouts are reported like HTTP errors below.
        print(f"Failed to download PDF. Error: {exc}")
        return False

    if response.status_code != 200:
        print(f"Failed to download PDF. Status code: {response.status_code}")
        return False

    with open(filename, 'wb') as file:
        file.write(response.content)
    print(f"PDF downloaded successfully as {filename}")
    return True

# Source documents for the demo, paired by position with the local names below.
# NOTE(review): both URLs point at Q3 releases while the local file names say
# "q4" — confirm which quarter is intended.
download_urls = [
    'https://s1.q4cdn.com/806093406/files/doc_financials/2024/q3/FY24-Q3-Combined-NIKE-Press-Release-Schedules-FINAL.pdf',
    'https://about.puma.com/sites/default/files/financial-report/2024/puma-q3-2024-release-english-final.pdf',
]

filepaths = ['Nike_q4_report.pdf', 'puma_q4_report.pdf']

# Fetch each report and store it under its matching local name.
for url, destination in zip(download_urls, filepaths):
    download_pdf(url, destination)


PDF downloaded successfully as Nike_q4_report.pdf
PDF downloaded successfully as puma_q4_report.pdf


In [9]:
# Load the downloaded files, split them into chunks, and embed the chunks
# into the FAISS vector store (prints the resulting number of vectors).
process_documents(filepaths)

['Nike_q4_report.pdf', 'puma_q4_report.pdf']
Processing file: Nike_q4_report.pdf
Processing file: puma_q4_report.pdf
49


In [10]:
# Build the QA chain; this raises ValueError if the vector store has not
# been initialized by process_documents() yet.
qa_chain = get_qa_chain()



In [15]:
# Query the indexed documents: the chain takes the question string as its
# only input and returns the model's answer as text.
question = 'Which company is performing better in between Nike and Puma'
response = qa_chain.invoke(question)
print(response)

Based on the provided context, PUMA appears to be performing better than Nike. PUMA has achieved various successes in athletics and sports partnerships, as well as positive feedback from consumers and retail partners. The brand's momentum and growth are highlighted, indicating a strong performance compared to Nike.
