In [46]:
# RAG Dependencies
import os
import fitz  # PyMuPDF
import chromadb
from tqdm import tqdm
from spacy.lang.en import English
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
from chromadb.utils import embedding_functions
from typing import List
from IPython.display import display, Markdown

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [47]:
# Device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cpu' and torch.backends.mps.is_available():
    device = torch.device('mps')

print(f"Using device: {device}")

Using device: mps


In [48]:
### Loading and Chunking the Text
def text_formatter(text: str) -> str:
    return text.replace("\n", " ").strip()

def open_and_read_pdf(pdf_path: str) -> str:
    doc = fitz.open(pdf_path)
    full_text = ""
    for page in tqdm(doc, desc="Reading PDF"):
        text = page.get_text()
        full_text += text_formatter(text) + " "
    return full_text.strip()

# Load the document
pdf_path = "nutrition_handbook.pdf"
document = open_and_read_pdf(pdf_path)

# Initialize spaCy for sentence splitting
nlp = English()
nlp.add_pipe("sentencizer")

# Split into sentences
doc = nlp(document)
sentences = [str(sent) for sent in doc.sents]

# Create chunks of sentences
def create_chunks(sentences, chunk_size=10, overlap=5):
    chunks = []
    for i in range(0, len(sentences), chunk_size - overlap):
        chunk = " ".join(sentences[i:i + chunk_size])
        if len(chunk) > 25:  # Minimum chunk size filter
            chunks.append(chunk)
    return chunks

chunks = create_chunks(sentences)
print(f"Number of chunks: {len(chunks)}\n")
print(f"First chunk sample:\n{chunks[0][:500]}")


Reading PDF: 100%|██████████| 100/100 [00:00<00:00, 551.55it/s]


Number of chunks: 203

First chunk sample:
FOOD AND NUTRITION HANDBOOK MINISTRY OF AGRICULTURE, ANIMAL INDUSTRY AND FISHERIES FOR EXTENSION WORKERS OCTOBER 2015 Fo o d  a n d  N u t ri t i o n  H a n d b o o k  fo r E xten s i o n  Wo rkers i MINISTRY OF AGRICULTURE, ANIMAL INDUSTRY AND FISHERIES FOOD AND NUTRITION HANDBOOK FOR EXTENSION WORKERS OCTOBER 2015  FANTA III F O O D  A N D  N U T R I T I O N T E C H N I C A L  A S S I S TA N C E Fo o d  a n d  N u t ri t i o n  H a n d b o o k  fo r E xten s i o n  Wo rkers ii   Foreword T he 


In [49]:
### Indexing into ChromaDB
path = "vector_db"
client = chromadb.PersistentClient(path=path)

In [50]:
# Initialize embedding function
embedding_model_name = 'all-MiniLM-L6-v2'
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=embedding_model_name)

ties_collection = client.get_or_create_collection(
    name='ties_collection',
    embedding_function=embedding_function
)

# Add documents to collection
for i, chunk in enumerate(chunks):
    ties_collection.add(
        documents=[chunk],
        ids=[f"chunk_{i}"]
    )

# Test query
query = "What is a healthy breakfast?"
results = ties_collection.query(
    query_texts=[query],
    n_results=5
)

for i in range(len(results['documents'][0])):
    print("-------\n")
    print("Retrieved Doc: ", i+1)
    print("Chunk ID: ", results['ids'][0][i])
    print("Distance: ", results['distances'][0][i])
    print("Text Snippet: ", f"{results['documents'][0][i][:500]}...")

-------

Retrieved Doc:  1
Chunk ID:  chunk_64
Distance:  1.0473971366882324
Text Snippet:  Fo o d  a n d  N u t ri t i o n  H a n d b o o k  fo r E xten s i o n  Wo rkers 28 CHAPTER THREE MEAL PLANNING M eal planning is a process of determining, selecting and preparing  foods to ensure a balanced diet for target groups or individ­ uals. People will always want to eat food that is acceptable, therefore  knowing the consumers’ characteristics, food preferences and location  is important in proper meal planning. Common terms used in meal planning A meal is a selection of foods prepared a...
-------

Retrieved Doc:  2
Chunk ID:  chunk_69
Distance:  1.069419503211975
Text Snippet:  	 Within a week or the day take advantage of opportunities when  food supply is plentiful and cheap. For instance on market days or  times of the day when farmers are selling directly to urban consum­ ers. Selection of foods to achieve a balanced diet Energy giving foods the “GO” foods (carbohydrates) As much as

In [53]:
### LLM & Prompt Setup
# Initialize Llama model
model_id = "meta-llama/Llama-3.1-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id, token=True)
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id, 
                                                 torch_dtype=torch.float16, 
                                                 low_cpu_mem_usage=False,
                                                 token=True)
llm_model.to(device)

def prompt_formatter(query: str, context_documents: List[str]) -> str:
    context = "\n".join([f"[{i+1}] {doc}" for i, doc in enumerate(context_documents)])
    base_prompt = f"""You are a knowledgeable assistant. Based on the following context, please answer the query concisely. If the information is not available in the context, please state that you don't have enough information to answer accurately.

Context:
{context}

Query: {query}

Answer:"""
    return base_prompt

Loading checkpoint shards: 100%|██████████| 4/4 [00:28<00:00,  7.08s/it]


KeyboardInterrupt: 

In [None]:
### Full RAG Pipeline
def rag_query(query: str, temperature=0.8, max_new_tokens=200) -> str:
    try:
        # Retrieve relevant documents
        results = ties_collection.query(query_texts=[query], n_results=5)
        documents = results['documents'][0]
        
        # Format prompt with retrieved context
        prompt = prompt_formatter(query, documents)
        
        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
        
        # Generate response
        with torch.no_grad():
            outputs = llm_model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                temperature=temperature,
                do_sample=True,
                max_new_tokens=max_new_tokens,
                top_p=0.95,
                top_k=50,
                repetition_penalty=1.2
            )
            
        # Decode and format response
        output_text = tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
        answer = output_text[len(prompt):].strip()
        return answer
        
    except Exception as e:
        return f"An error occurred: {str(e)}. Please try again or rephrase your question."


In [None]:
# Test the pipeline
query = "What is a healthy breakfast?"
response = rag_query(query)
display(Markdown(response))