In [None]:
from langchain_community.document_loaders import PyPDFLoader
from dotenv import load_dotenv
import os

# Read settings from the local .env file; override=True asks python-dotenv to
# replace variables that are already present in the process environment.
load_dotenv('.env', override=True)

# Never echo the key itself: cell output is stored inside the notebook file and
# would leak the secret to anyone the notebook is shared with or committed for.
print("OPENAI_API_KEY is set" if os.getenv('OPENAI_API_KEY') else "OPENAI_API_KEY is NOT set")

### Load a single PDF file

In [None]:
# Load one PDF and keep the resulting documents in `docs`.
loader = PyPDFLoader(
    "../data/user-guide/Collection Manager User Guide.pdf"
)
docs = loader.load()

# Quick sanity check: document count, then a 100-character preview and the
# metadata of the first document.
print(len(docs))
first_doc = docs[0]
print(first_doc.page_content[:100])
print(first_doc.metadata)

### Load multiple PDFs (recursively, through nested subfolders)

In [None]:
# Module-level accumulator: load_pdfs() appends here when no explicit list is given.
docs = []


def load_pdfs(directory, collected=None, loader_cls=None):
    """Recursively load every PDF found under `directory`.

    Args:
        directory: Folder to scan; sub-folders are visited recursively.
        collected: Optional list to append the loaded documents to. When
            omitted, the module-level `docs` list is extended, which is the
            behavior a plain `load_pdfs(directory)` call relies on.
        loader_cls: Optional loader class, used as `loader_cls(path).load()`.
            Defaults to `PyPDFLoader`.

    Returns:
        The list the loaded documents were appended to.
    """
    target = docs if collected is None else collected
    if loader_cls is None:
        loader_cls = PyPDFLoader

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # resulting document order reproducible between runs and machines.
    for entry in sorted(os.listdir(directory)):
        entry_path = os.path.join(directory, entry)
        # Compare the lower-cased name so files such as "REPORT.PDF" are not
        # silently skipped.
        if os.path.isfile(entry_path) and entry.lower().endswith(".pdf"):
            target.extend(loader_cls(entry_path).load())
        elif os.path.isdir(entry_path):
            load_pdfs(entry_path, target, loader_cls)
    return target

# Walk the whole data folder; the loaded documents accumulate in `docs`.
load_pdfs(directory='../data')

# Report how many documents are now held in `docs`.
doc_count = len(docs)
print(doc_count)

In [None]:
from langchain.document_loaders import DirectoryLoader

def pdf_filter(file_path):
    """Return True when `file_path` names a PDF file.

    The extension is compared case-insensitively, so "guide.PDF" matches as
    well as "guide.pdf". Both plain strings and path-like objects (for example
    `pathlib.Path`) are accepted.
    """
    return os.fspath(file_path).lower().endswith('.pdf')

# DirectoryLoader does not accept a `file_filter` argument (passing one raises
# TypeError); files are selected with a glob pattern instead. With
# recursive=True the pattern is applied in every sub-folder of '../data'.
loader = DirectoryLoader('../data', glob="*.pdf", recursive=True)

docs = loader.load()
print(len(docs))

### Building the RAG pipeline

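Build the RAG pipeline with LangChain's `create_retrieval_chain` and a pre-trained OpenAI chat model (`gpt-4o`). No model is trained here: the documents are split, embedded and indexed, and the retrieved chunks are passed to the model as context.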

In [None]:
from langchain_openai import ChatOpenAI

# Chat model used to generate the answers.
model = ChatOpenAI(
    model="gpt-4o",
)

In [None]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Cut the loaded documents into overlapping chunks (size 1000, overlap 200).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

# Embed the chunks with OpenAI embeddings and index them in a Chroma store.
vectorstore = Chroma.from_documents(
    embedding=OpenAIEmbeddings(),
    documents=splits,
)

# Retriever view over the store, used by the RAG chain.
retriever = vectorstore.as_retriever()

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System message template: four instruction sentences separated by single
# spaces, a blank line, then the `{context}` placeholder.
system_prompt = " ".join(
    [
        "You are an assistant for question-answering tasks.",
        "Use the following pieces of retrieved context to answer the question.",
        "If you don't know the answer, say that you don't know.",
        "Use three sentences maximum and keep the answer concise.",
    ]
) + "\n\n{context}"

# Two-message chat prompt: the system instructions (with their `{context}`
# slot) followed by the user's question, supplied as `{input}`.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

# `llm` is simply another name for the chat model created earlier.
llm = model

# Answering step: combines the prompt with the language model.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full pipeline: the retriever feeds the answering step.
rag_chain = create_retrieval_chain(
    retriever,
    question_answer_chain,
)

# First sample query; the cell displays the full result as its output.
results = rag_chain.invoke(
    {"input": "UiPartMetadata"},
)

results

In [None]:
results = rag_chain.invoke({"input": "where to find pip config debug"})

# A bare `results` expression in the middle of a cell is never displayed (only
# a cell's last expression is), so print the generated answer explicitly.
print(results["answer"])

# Inspect the first retrieved chunk. The retriever may return nothing, so
# guard the index instead of letting the cell fail with IndexError.
retrieved = results["context"]
if retrieved:
    print(retrieved[0].page_content)
    print(retrieved[0].metadata)
else:
    print("No context documents were retrieved for this query.")