<h1>PDF RAG</h1>

In [None]:
# Use the %pip magic rather than a shell escape so the listing reflects the
# environment of the running kernel, not whichever `pip` is first on PATH.
%pip list

Import Libraries

In [8]:
import os, sys
import pymupdf
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_ollama import ChatOllama
from langchain.schema import Document
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


from IPython.display import display as Markdown

# Resolve the directory one level above the working directory and make sure
# it is importable, adding it to sys.path only when it is not already there.
parent_dir = os.path.abspath("..")
already_importable = parent_dir in sys.path
if not already_importable:
    sys.path.append(parent_dir)

<h2>Load PDF</h2>

In [None]:
# file_name = 'developer_job.pdf'
# folder_path = 'data'
# file_path = parent_dir + os.sep + folder_path + os.sep + file_name

# # PDF file uploads
# if file_path:
#   loader = UnstructuredPDFLoader(file_path) #Wheel issue with 3.13
#   docs = loader.load()
# else:
#   print("Upload a PDF file")

# Location of the PDF to index: <current working directory>/data/developer_job.pdf
folder_path = "data"
file_name = "developer_job.pdf"
file_path = os.path.join(
    os.getcwd(),
    folder_path,
    file_name,
)

def load_pdf_with_pymupdf(file_path):
    """
    Load text from a PDF file using PyMuPDF.

    Args:
        file_path: Path of the PDF file on disk.

    Returns:
        The text of every page concatenated in page order, or None if the
        document could not be opened or read (the error is printed).
    """
    try:
        # Pass the path as the filename argument. The `stream` keyword is for
        # in-memory PDF bytes; giving it a path string makes the open fail,
        # which previously meant no text was ever extracted.
        # The context manager closes the document even if a page read fails.
        with pymupdf.open(file_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        # Best-effort loader: report the problem and let the caller decide.
        print(f"Error loading PDF: {e}")
        return None
    
# PDF file uploads
# Reset first so that a missing file or a failed load can never leave
# `pdf_text` undefined, or holding stale text from an earlier run of this
# cell, for the cells that follow.
pdf_text = None
if os.path.exists(file_path):
    pdf_text = load_pdf_with_pymupdf(file_path)
    if pdf_text:
        # Preview only the first 500 characters to keep the output short.
        print(f"PDF loaded successfully! Extracted text:\n{pdf_text[:500]}...")
    else:
        print("Failed to extract text.")
else:
    print("Upload a valid PDF file.")


List the local LLMs available through Ollama

In [None]:
# Shell out to the Ollama CLI to list the models available locally.
# NOTE(review): the model tags used in later cells ("mistral:latest",
# "llama3.1:latest") presumably need to appear in this list — confirm.
!ollama list

In [None]:
# Wrap the extracted text in a single Document, then cut it into
# overlapping chunks ready for embedding.
documents = [
    Document(page_content=pdf_text, metadata={}),
]
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=7500,
    chunk_overlap=100,
)
chunks = text_splitter.split_documents(documents)

In [21]:
# Embed every chunk with an Ollama model and index the vectors in a Chroma
# collection named "local-rag".
embeddings = OllamaEmbeddings(model="mistral:latest")
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    collection_name="local-rag",
)

In [22]:
# Chat model served through Ollama; `local_model` is the model tag to load.
local_model: str = "llama3.1:latest"
llm = ChatOllama(
    model=local_model,
)

In [28]:
# Instruction that asks the LLM to rewrite the user's question into five
# newline-separated variations used for retrieval.
_MULTI_QUERY_TEMPLATE = """You are an AI language model assistant. Your task is to generate five question-answering variations of the given user question to retrieve relevant documents from a vector database. By framing the query as potential answers to a question, your goal is to identify documents that directly address the user's information need. Provide these alternative questions separated by newlines.
    Original question: {question}"""

QUERY_PROMPT = PromptTemplate(
    template=_MULTI_QUERY_TEMPLATE,
    input_variables=["question"],
)

In [29]:
# Build a multi-query retriever on top of the vector store: the LLM expands
# each question using QUERY_PROMPT before documents are fetched.
base_retriever = vector_db.as_retriever()
retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
    prompt=QUERY_PROMPT,
)

# RAG prompt: tells the model to answer using only the supplied context.
template = (
    "Answer the question based ONLY on the following context:\n"
    "{context}\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template)

In [30]:
def _format_docs(docs):
    """Join the text of the retrieved documents into one plain-text context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# The retriever yields a list of Document objects. Interpolating that list
# straight into the prompt would insert its Python repr (metadata, escaped
# newlines), so flatten it to plain text before it reaches {context}.
# The user's question is passed through unchanged.
chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Ask the RAG chain about required skills; the answer is shown as the cell output.
skills_question = "What are the skills needed for the job?"
chain.invoke(skills_question)

In [None]:
# Ask the RAG chain about the minimum experience; the answer is shown as the cell output.
experience_question = "What is the least amount of year experience needed for the job role?"
chain.invoke(experience_question)