<h1>PDF RAG</h1>

In [16]:
!pip list

Package                                  Version
---------------------------------------- ------------------
aiofiles                                 24.1.0
aiohappyeyeballs                         2.4.4
aiohttp                                  3.11.10
aiosignal                                1.3.2
altair                                   5.5.0
annotated-types                          0.7.0
anyio                                    4.6.2.post1
argon2-cffi                              23.1.0
argon2-cffi-bindings                     21.2.0
arrow                                    1.3.0
asgiref                                  3.8.1
asttokens                                2.4.1
async-lru                                2.0.4
attrs                                    24.2.0
babel                                    2.16.0
backoff                                  2.2.1
bcrypt                                   4.2.1
beautifulsoup4                           4.12.3
bleach                         

Import Libraries

In [17]:
import os, sys
import pymupdf
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_ollama import ChatOllama
from langchain.schema import Document
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


from IPython.display import display as Markdown

# Put the notebook's parent folder on the import path (only once), so that
# modules living one level up can be imported from this notebook.
parent_dir = os.path.abspath("..")
if sys.path.count(parent_dir) == 0:
    sys.path.append(parent_dir)

<h2>Load PDF</h2>

In [18]:
# file_name = 'developer_job.pdf'
# folder_path = 'data'
# file_path = parent_dir + os.sep + folder_path + os.sep + file_name

# # PDF file uploads
# if file_path:
#   loader = UnstructuredPDFLoader(file_path) #Wheel issue with 3.13
#   docs = loader.load()
# else:
#   print("Upload a PDF file")

# Location of the PDF to index, resolved against the current working directory
folder_path, file_name = 'data', 'developer_job.pdf'
file_path = os.path.join(os.getcwd(), folder_path, file_name)

def load_pdf_with_pymupdf(file_path):
    """
    Extract the plain text of every page of a PDF using PyMuPDF.

    Args:
        file_path: Path of the PDF file on disk.

    Returns:
        The text of all pages concatenated in page order (an empty string
        if no page yields any text), or None if the document could not be
        opened or read. In the failure case the error is printed.
    """
    try:
        # The path goes in as the filename argument; `stream=` is meant for
        # in-memory PDF bytes, not for a path string.
        # The context manager closes the document even if a page fails.
        with pymupdf.open(file_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        # Best-effort loader: report the problem and let the caller decide.
        print(f"Error loading PDF: {e}")
        return None
    
# Load the PDF and preview the first 500 characters of the extracted text.
# Reset `pdf_text` first: without this, a missing file would leave whatever
# value an earlier run of this cell produced, and later cells would silently
# work on stale text.
pdf_text = None
if os.path.exists(file_path):
    pdf_text = load_pdf_with_pymupdf(file_path)
    if pdf_text:
        print(f"PDF loaded successfully! Extracted text:\n{pdf_text[:500]}...")
    else:
        # Covers both a loader error (None) and a PDF with no extractable text ("")
        print("Failed to extract text.")
else:
    print("Upload a valid PDF file.")


PDF loaded successfully! Extracted text:
Search...
 Search results
ome
>
Information Technology
>
Developer
>
Fullstack Developer
> Job details
Apply now
Register and upload your CV to apply with just one click
View all jobs
Senior Full Stack Developer
Featured
Senior Full Stack Developer
Posted 3 days ago by Kensington Mortgage Company
Work from home
Salary
negotiable
London, South East
England
Permanent,
full-time
Register CV
Sign in
Saved jobs
Senior Full Stack Developer in London - Reed.co.uk
https://www.reed.co.uk/jobs/senior-full...


List of local LLMs available in Ollama

In [19]:
# Shell escape: ask the Ollama CLI which models are installed locally
!ollama list

NAME                   ID              SIZE      MODIFIED     
llama3.1:latest        f66fc8dc39ea    4.7 GB    3 months ago    
phi3:medium            cf611a26b048    7.9 GB    3 months ago    
mistral-nemo:latest    994f3b8b7801    7.1 GB    4 months ago    
mistral:latest         2ae6f6dd7a3d    4.1 GB    6 months ago    


In [20]:
# Wrap the extracted text in a single Document, then cut it into
# overlapping chunks (size 7500, overlap 100) ready for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=7500,
    chunk_overlap=100,
)
documents = [Document(page_content=pdf_text, metadata={})]
chunks = text_splitter.split_documents(documents)

In [21]:
# Embed the chunks with the Ollama-served "mistral:latest" model and index
# them in a Chroma collection.
vector_db = Chroma.from_documents(
    collection_name="local-rag",
    embedding=OllamaEmbeddings(model="mistral:latest"),
    documents=chunks,
)

In [22]:
# Chat model, served locally through Ollama, used for query expansion and answering
local_model = "llama3.1:latest"
llm = ChatOllama(
    model=local_model,
)

In [28]:
# Prompt that asks the LLM to rephrase the user's question several ways;
# its only placeholder is {question}.
QUERY_PROMPT = PromptTemplate(
    template="""You are an AI language model assistant. Your task is to generate five question-answering variations of the given user question to retrieve relevant documents from a vector database. By framing the query as potential answers to a question, your goal is to identify documents that directly address the user's information need. Provide these alternative questions separated by newlines.
    Original question: {question}""",
    input_variables=["question"],
)

In [29]:
# RAG prompt: the model is told to answer from the supplied context only
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Retriever that uses the LLM and QUERY_PROMPT to expand the question
# before searching the vector store
retriever = MultiQueryRetriever.from_llm(
    retriever=vector_db.as_retriever(),
    llm=llm,
    prompt=QUERY_PROMPT,
)

In [30]:
# Pipeline: fetch context for the question and pass the question through
# unchanged -> fill the RAG prompt -> call the LLM -> parse the reply to a string
chain = {"context": retriever, "question": RunnablePassthrough()} | prompt | llm | StrOutputParser()

In [31]:
# Run the RAG chain on a question about required skills; as the cell's last
# expression, the returned answer string is displayed as the cell output.
chain.invoke("What are the skills needed for the job?")

'According to the job description, the required skills are:\n\n* Strong background in software development building cloud based applications\n* Experience developing c# models (yes or no question)\n* Strong C# / .NET Framework Programming skills\n* Strong knowledge of design patterns and experienced in designing software components\n* Strong experience in Microsoft .NET Parallel programming\n* Experience working with Azure batch is desirable\n* Demonstrable experience using python. Experience with any of the following is beneficial: numpy, pandas\n* Programming with C# / Microsoft Excel\n* Proficient in Database Development on MS SQL Server with T-SQL (not explicitly stated but can be inferred)\n* Cloud technology and senior developer skills for a Full Stack Developer role.'

In [32]:
# Run the RAG chain on a question about minimum experience; as the cell's last
# expression, the returned answer string is displayed as the cell output.
chain.invoke("What is the least amount of year experience needed for the job role?")

'2 years. The job description specifies "2+ years demonstrable experience with Microsoft Azure" as one of the requirements.'