<h1>PDF RAG</h1>

In [1]:
!pip list

Package                                  Version
---------------------------------------- --------------
aiohappyeyeballs                         2.4.4
aiohttp                                  3.11.10
aiosignal                                1.3.2
altair                                   5.5.0
annotated-types                          0.7.0
anyio                                    4.7.0
argon2-cffi                              23.1.0
argon2-cffi-bindings                     21.2.0
arrow                                    1.3.0
asgiref                                  3.8.1
asttokens                                3.0.0
async-lru                                2.0.4
attrs                                    24.3.0
babel                                    2.16.0
backoff                                  2.2.1
bcrypt                                   4.2.1
beautifulsoup4                           4.12.3
bleach                                   6.2.0
blinker                                  1

Import Libraries

In [2]:
import os, sys
import pymupdf
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_ollama import ChatOllama
from langchain.schema import Document
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


from IPython.display import display as Markdown

# Set directory paths
parent_dir = os.path.abspath("..")
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

<h2>Load PDF</h2>

In [3]:
# file_name = 'developer_job.pdf'
# folder_path = 'data'
# file_path = parent_dir + os.sep + folder_path + os.sep + file_name

# # PDF file uploads
# if file_path:
#   loader = UnstructuredPDFLoader(file_path) #Wheel issue with 3.13
#   docs = loader.load()
# else:
#   print("Upload a PDF file")

file_name = 'developer_job.pdf'
folder_path = 'data'
file_path = os.path.join(os.getcwd(), folder_path, file_name)

def load_pdf_with_pymupdf(file_path):
    """
    Load text from a PDF file using PyMuPDF (fitz).
    Returns the extracted text as a string.
    """
    try:
        doc = pymupdf.open(file_path)
        text = ""
        for page in doc:
            text += page.get_text()
        doc.close()
        return text
    except Exception as e:
        print(f"Error loading PDF: {e}")
        return None
    
# PDF file uploads
if os.path.exists(file_path):
    pdf_text = load_pdf_with_pymupdf(file_path)
    if pdf_text:
        print(f"PDF loaded successfully! Extracted text:\n{pdf_text[:500]}...")  
    else:
        print("Failed to extract text.")
else:
    print("Upload a valid PDF file.")


PDF loaded successfully! Extracted text:
Search...
 Search results
ome
>
Information Technology
>
Developer
>
Fullstack Developer
> Job details
Apply now
Register and upload your CV to apply with just one click
View all jobs
Senior Full Stack Developer
Featured
Senior Full Stack Developer
Posted 3 days ago by Kensington Mortgage Company
Work from home
Salary
negotiable
London, South East
England
Permanent,
full-time
Register CV
Sign in
Saved jobs
Senior Full Stack Developer in London - Reed.co.uk
https://www.reed.co.uk/jobs/senior-full...


List of local LLMS for Ollama

In [4]:
!ollama list

NAME                ID              SIZE      MODIFIED   
mistral:latest      f974a74358d6    4.1 GB    3 days ago    
tinyllama:latest    2644915ede35    637 MB    5 days ago    


In [5]:
# Split and chunk 
documents = [Document(page_content=pdf_text, metadata={})]
text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=100)
chunks = text_splitter.split_documents(documents)

In [6]:
# Create vector database
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=OllamaEmbeddings(model="mistral:latest"),
    collection_name="local-rag"
)

In [12]:
# LLM from Ollama
local_model = "mistral:latest"
llm = ChatOllama(model=local_model)

In [13]:
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five question-answering variations of the given user question to retrieve relevant documents from a vector database. By framing the query as potential answers to a question, your goal is to identify documents that directly address the user's information need. Provide these alternative questions separated by newlines.
    Original question: {question}""",
)

In [14]:
retriever = MultiQueryRetriever.from_llm(
    vector_db.as_retriever(), 
    llm,
    prompt=QUERY_PROMPT
)

# RAG prompt
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

In [15]:
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [16]:
chain.invoke("What are the skills needed for the job?")

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2
Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2
Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2
Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2
Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


'1. C#\n   2. Python\n   3. Azure\n   4. Cloud technology\n   5. Senior developer\n   6. Full stack'

In [12]:
chain.invoke("What is the least amount of year experience needed for the job role?")

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2
Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2
Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2
Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2
Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


' The job listing does not specify a minimum number of years of experience required. However, it is indicated that the applicant should have a strong background in software development and experience developing C# models.'