# Simple Rag Pipeline

### 1. Load the Requirements

In [54]:
!pip install langchain
!pip install chromadb
!pip install pypdf
!pip install pytest

[0m

### 2. Load the PDF

In [1]:
from langchain_community.document_loaders import PyPDFLoader
def load_doc(DATA_PATH):
    doc_loader = PyPDFLoader(DATA_PATH)
    return doc_loader.load_and_split()

In [2]:
doc = load_doc('')# Path to your pdf
doc

[Document(page_content='RAUL GONZALEZ RAMOS\n@raul_phys@hotmail.com ♂phone+44 7734625661 /linkedinraul-gonzalez-19275201phy\nPROFILE\nI am a highly motivated MSc Physics graduate from the University of Bristol specialising in Quantum Thermodynamics\nand driven by a profound interest in Quantum Information Theory. I am proﬁcient in problem-solving, data analysis, and\nstatistical techniques developed through coursework and a research project. On a personal level, I enjoy spending time\nwith people, doing sports, reading books (specially if written by Carlos Ruiz Zafon) and playing video games. I consider\nhumour to be a fundamental part of life.\nEDUCATION\nM.Sc. in Physics (2:1)\nUniversity of Bristol\nὌ5Oct 2019 – 2023\nCourses include: Computational Physics; Quantum Information Theory; Advanced Quantum Mechanics; Relativistic Field Theory;\nGeneral Relativity and Cosmology; Methods for Theoretical Physicist; Analytical Mechanics; Solid State Physics; Thermal Physics;\nProjects\nExplo

### 3.Split the document

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document

def split_doc(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)


In [4]:
chunks = split_doc(doc)
chunks[1]

Document(page_content='EDUCATION\nM.Sc. in Physics (2:1)\nUniversity of Bristol\nὌ5Oct 2019 – 2023\nCourses include: Computational Physics; Quantum Information Theory; Advanced Quantum Mechanics; Relativistic Field Theory;\nGeneral Relativity and Cosmology; Methods for Theoretical Physicist; Analytical Mechanics; Solid State Physics; Thermal Physics;\nProjects\nExploring Quantum Non-Equilibrium Steady States as Thermal Machines\nQuantum Thermodynamics. M.Sc. Project.\n•Aim: To test the validity regime of the Thermodynamic Uncertainty Relation (TUR) by simulating the interaction between\ntwo qubits performing work on a load.\n•Methods: Producing two codes with diﬀerent approaches, Schrödinger and Heisenberg pictures, to calculate numerically', metadata={'source': 'RAUL GONZALEZ RAMOS (1).pdf', 'page': 0})

### 3. Get the Embeddings

In [5]:
from langchain_community.embeddings import OllamaEmbeddings

def get_embedding_func():
    embeddings = OllamaEmbeddings()
    return embeddings



In [25]:
#source = chunks[0].metadata.get('source')
#print(source)

### 4. creating unique chunk IDs

In [8]:
last_page_id = None
current_chunk_index = 0
for chunk in chunks:
    source = chunk.metadata.get('source')
    page =  chunk.metadata.get('page')
    current_page_id = f"{source}:{page}"

    if current_page_id == last_page_id:
        current_chunk_index += 1
    else:
        current_chunk_index = 0

    chunk_id = f"{current_page_id}:{current_chunk_index}"
    last_page_id = current_page_id

    chunk.metadata['id'] = chunk_id

    
    

### 5. Creating and Adding to the db

In [13]:
from langchain.vectorstores.chroma import Chroma
CHROMA_PATH = "chroma3"


db = Chroma(
        persist_directory=CHROMA_PATH, embedding_function=get_embedding_func()
    )
existing_items = db.get(include=[])
existing_ids = set(existing_items['ids'])
print(f"number of existing documents in DB: {len(existing_ids)}")

new_chunks = []
for chunk in chunks:
    if chunk.metadata['id'] not in existing_ids: 
        new_chunks.append(chunk)

new_chunk_ids = [chunk.metadata['id'] for chunk in new_chunks]
db.add_documents(new_chunks, ids=new_chunk_ids)
db.persist

number of existing documents in DB: 0


<bound method Chroma.persist of <langchain_community.vectorstores.chroma.Chroma object at 0x7ff7fde4b6a0>>

### 6. Querying

In [22]:
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""
def query_rag(query_text: str):
    embedding_function = get_embedding_func()
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    results = db.similarity_search_with_score(query_text, k=3)
    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])

    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    model = Ollama(model= "llama2")
    response_text = model.invoke(prompt)

    source = [doc.metadata.get('id') for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSource: {source}"
    
    return formatted_response

query_rag("where did Raul work as a waiter?")
    

"Response:  Based on the information provided in the context, Raul worked as a waiter at Tapas Revolution Restaurant in Bath, UK.\nSource: ['RAUL GONZALEZ RAMOS (1).pdf:0:3', 'RAUL GONZALEZ RAMOS (1).pdf:0:0', 'RAUL GONZALEZ RAMOS (1).pdf:0:1']"