# Simple Free RAG Implementation
## Minimal Dependencies Version

This notebook provides a simpler RAG implementation that avoids complex dependency issues:
- Uses basic langchain components
- Minimal dependencies
- Works on Windows without compilation issues

In [None]:
# Install minimal required packages
# Run this cell if packages are not installed
%pip install langchain langchain-community pypdf chromadb

In [2]:
# Import minimal required libraries
import os
from dotenv import load_dotenv
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings.base import Embeddings
import hashlib
import numpy as np

# Load environment variables
load_dotenv()
print("✅ Libraries imported successfully!")

✅ Libraries imported successfully!


In [3]:
# Simple custom embeddings class (for demonstration)
# This creates basic embeddings without external dependencies
class SimpleEmbeddings(Embeddings):
    """Simple embeddings using hash-based vectors for demonstration"""
    
    def __init__(self, embedding_dim=384):
        self.embedding_dim = embedding_dim
    
    def embed_documents(self, texts):
        """Embed a list of documents"""
        return [self._embed_text(text) for text in texts]
    
    def embed_query(self, text):
        """Embed a query text"""
        return self._embed_text(text)
    
    def _embed_text(self, text):
        """Create a simple embedding from text using hashing"""
        # This is a simple demonstration - in production use real embeddings
        # Create deterministic "embedding" from text
        hash_digest = hashlib.sha256(text.encode()).digest()
        # Convert to float array
        embedding = np.frombuffer(hash_digest, dtype=np.uint8).astype(np.float32)
        # Normalize and resize to embedding_dim
        embedding = embedding / 255.0
        # Repeat or truncate to match embedding_dim
        if len(embedding) < self.embedding_dim:
            embedding = np.pad(embedding, (0, self.embedding_dim - len(embedding)))
        else:
            embedding = embedding[:self.embedding_dim]
        return embedding.tolist()

# Initialize simple embeddings
embeddings = SimpleEmbeddings()
print("✅ Simple embeddings initialized!")
print(f"📊 Embedding dimension: {embeddings.embedding_dim}")

✅ Simple embeddings initialized!
📊 Embedding dimension: 384


In [4]:
# Document loading functions
def read_doc(directory):
    """Load PDF documents from directory"""
    file_loader = PyPDFDirectoryLoader(directory)
    documents = file_loader.load()
    return documents

def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    """Split documents into chunks"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_documents(docs)
    return chunks

print("📚 Document processing functions ready!")

📚 Document processing functions ready!


In [5]:
# Load and process documents
print("📚 Loading documents...")
doc = read_doc('documents/')
print(f"✅ Loaded {len(doc)} documents")

print("✂️ Splitting documents into chunks...")
documents = chunk_data(docs=doc)
print(f"✅ Created {len(documents)} document chunks")

📚 Loading documents...
✅ Loaded 28 documents
✂️ Splitting documents into chunks...
✅ Created 45 document chunks


In [6]:
# Create local vector store using Chroma
print("🔄 Creating local vector store...")
print("⏳ This may take a few minutes for large documents...")

# Chroma will store vectors locally
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    persist_directory="./chroma_db"
)

print("✅ Vector store created successfully!")
print(f"📊 Indexed {len(documents)} document chunks")

🔄 Creating local vector store...
⏳ This may take a few minutes for large documents...
✅ Vector store created successfully!
📊 Indexed 45 document chunks


In [8]:
# Simple retrieval function
def retrieve_similar_chunks(query, k=3):
    """Retrieve k most similar document chunks"""
    results = vectorstore.similarity_search(query, k=k)
    return results

# Simple answer generation (without LLM)
def generate_simple_answer(query, relevant_chunks):
    """Generate a simple answer by extracting relevant sentences"""
    print(f"\n🔍 Query: {query}")
    print(f"📚 Found {len(relevant_chunks)} relevant chunks\n")
    
    # Extract sentences containing query keywords
    query_words = query.lower().split()
    relevant_sentences = []
    
    for chunk in relevant_chunks:
        sentences = chunk.page_content.split('.')
        for sentence in sentences:
            if any(word in sentence.lower() for word in query_words):
                relevant_sentences.append(sentence.strip())
    
    # Return unique sentences
    unique_sentences = list(dict.fromkeys(relevant_sentences))
    return '. '.join(unique_sentences[:3]) + '.'

print("🛠️ Retrieval functions ready!")

🛠️ Retrieval functions ready!


In [9]:
# Test the simple RAG system
test_query = "What are the available products?"

print("🚀 Testing Simple RAG System")
print("=" * 50)

# Retrieve relevant chunks
relevant_chunks = retrieve_similar_chunks(test_query)

# Generate simple answer
answer = generate_simple_answer(test_query, relevant_chunks)

print("\n📋 ANSWER:")
print("-" * 20)
print(answer)

print("\n📚 SOURCE CHUNKS:")
print("-" * 30)
for i, chunk in enumerate(relevant_chunks, 1):
    print(f"\n📄 Chunk {i}:")
    print(f"Source: {chunk.metadata.get('source', 'Unknown')}")
    print(f"Page: {chunk.metadata.get('page', 'Unknown')}")
    print(f"Preview: {chunk.page_content[:200]}...")

🚀 Testing Simple RAG System

🔍 Query: What are the available products?
📚 Found 3 relevant chunks


📋 ANSWER:
--------------------
Given the increasing demand for healthcare services and the inadequacy of Government's intervention in 
the sector, private sector participation has been on the rise across the entire value chain of the sector. Consequently, greater emphasis is placed on the growing private healthcare  market, which makes it a 
focal point for prospective investors to examine. Our goal in the sector is to:
HEALTH
Improve healthcare delivery 
infrastructure through Equipment 
Financing
Improve access to medical technology that also improves 
health business by providing access and information to 
practitioners in the sector.

📚 SOURCE CHUNKS:
------------------------------

📄 Chunk 1:
Source: documents\At-The-HEART-of-Sterling.pdf
Page: 17
Preview: Given the increasing demand for healthcare services and the inadequacy of Government's intervention in 
the sector, private secto

In [13]:
# Interactive query function
def ask_question(question):
    """Ask a question and get relevant information"""
    chunks = retrieve_similar_chunks(question, k=3)
    answer = generate_simple_answer(question, chunks)
    
    print(f"\n❓ Question: {question}")
    print("=" * 60)
    print(f"\n💡 Answer: {answer}")
    print(f"\n📊 Based on {len(chunks)} relevant chunk(s)")
    
    return answer, chunks

# Test with different questions
questions = [
    "Which bank supports the agriculture sector?",
    "What is the agriculture credit target?",
    "What initiatives are mentioned for farmers?",
    "Tell me about fisheries"
]

print("🎯 Testing multiple queries:")
for q in questions:
    ask_question(q)
    print("\n" + "="*80 + "\n")

🎯 Testing multiple queries:

🔍 Query: Which bank supports the agriculture sector?
📚 Found 3 relevant chunks


❓ Question: Which bank supports the agriculture sector?

💡 Answer: 2006 - 2010 
THE BIRTHING PROCESS
OUR HERITAGE
Establish a foothold
to gain market share
Create a distinct 
brand identity. Grow our retail footprint
by investing in technology
and service channel growth
Adopt social media 
to deepen customer 
interactions
Integrate our people
following the M&A
Sustainable solutions 
to reposition us
as a key competitor
Beef up capital
We navigated through these years to:
Sterling Bank was born out of a merger of five 
Nigerian banks in a bid to achieve compliance 
with the regulatory requirement mandating a 
N25 billion capital base for Nigerian banks. These banks were predominantly investment 
banks with little retail footprint.

📊 Based on 3 relevant chunk(s)



🔍 Query: What is the agriculture credit target?
📚 Found 3 relevant chunks


❓ Question: What is the agriculture cre

## 🎉 Simple RAG System Summary

### ✅ What This Version Provides:
- **Minimal Dependencies**: Only uses basic langchain and chromadb
- **No Compilation Issues**: Works on Windows without Rust/C++ requirements
- **Local Storage**: Uses ChromaDB for local vector storage
- **Simple Embeddings**: Basic demonstration embeddings (replace with real ones in production)
- **No LLM Required**: Simple keyword-based answer extraction

### 🚀 Upgrading to Production:

1. **Better Embeddings**: Replace SimpleEmbeddings with:
   ```python
   # Option 1: Use OpenAI (requires API key)
   from langchain.embeddings import OpenAIEmbeddings
   embeddings = OpenAIEmbeddings()
   
   # Option 2: Use local embeddings (when dependencies work)
   from langchain.embeddings import HuggingFaceEmbeddings
   embeddings = HuggingFaceEmbeddings()
   ```

2. **Add LLM for Better Answers**:
   ```python
   # Local LLM (when dependencies work)
   from langchain.llms import LlamaCpp
   llm = LlamaCpp(model_path="./models/llama-2-7b.bin")
   ```

3. **Use Pinecone for Cloud Storage**:
   ```python
   from langchain.vectorstores import Pinecone
   vectorstore = Pinecone.from_documents(documents, embeddings)
   ```

### 📝 Notes:
- This is a simplified version for development/testing
- The embeddings are basic - use real embeddings for better results
- Answer generation is keyword-based - add LLM for natural responses
- ChromaDB stores vectors locally in `./chroma_db` directory