In [None]:
# Step 1: Install required packages
# llama-index: The main library for data parsing and vector indexing
# pymongo: MongoDB Python driver to connect to MongoDB Atlas from Colab
# llama-index-embeddings-huggingface: Hugging Face embedding integration
#   (only referenced by the commented-out local-embedding alternative further down)
# llama-index-llms-openrouter: provides the OpenRouter LLM class imported in a later cell
# NOTE(review): `openai` and `certifi` are imported later but not installed here;
#   presumably they arrive as dependencies or are preinstalled in Colab - confirm.
# NOTE(review): versions are not pinned, so a fresh runtime may pull different releases.
!pip install llama-index pymongo
!pip install llama-index-embeddings-huggingface
!pip install llama-index-llms-openrouter

In [None]:
# Colab helper used to attach Google Drive to the runtime filesystem
from google.colab import drive

# (Re)mount Drive so its contents are reachable under /content/drive
drive.mount('/content/drive', force_remount=True)

import os

# Drive folder that later cells read the input documents from
input_dir = "/content/drive/My Drive/Colab Notebooks/RAG"

# Report whether the folder is already present; create it when it is missing
if os.path.isdir(input_dir):
    print(f"Directory already exists: {input_dir}")
else:
    os.makedirs(input_dir)
    print(f"Created directory: {input_dir}")


In [None]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import HierarchicalNodeParser

# Load all files recursively from input_dir and subfolders
reader = SimpleDirectoryReader(input_dir=input_dir, recursive=True)
documents = reader.load_data()

# Initialize the hierarchical parser with three chunk sizes (largest to smallest)
node_parser = HierarchicalNodeParser.from_defaults(
    chunk_sizes=[2048, 512, 128],
    chunk_overlap=40 # Overlap setting of 40 passed to the parser to preserve context between chunks
)

# Parse documents into hierarchical chunks preserving structure
# NOTE(review): the parser presumably returns nodes for every level in chunk_sizes
# (parents and children), so later cells would embed overlapping text - confirm
# whether only the leaf nodes are wanted.
nodes = node_parser.get_nodes_from_documents(documents)

# nodes now holds the parsed chunks produced by the hierarchical parser
print(f"Parsed {len(nodes)} hierarchical document chunks.")


In [None]:
# Optional alternative (currently disabled): a local Hugging Face embedding model.
# NOTE: a different embedding model may produce vectors of a different size; the
# "dimensions" value of the vector search index defined later must match the model in use.
# from llama_index.core import GPTVectorStoreIndex
# from llama_index.embeddings.huggingface import HuggingFaceEmbedding


# # Set up a local embedding model using Hugging Face transformers (free to use)
# embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Active path: embeddings are requested from OpenRouter through the OpenAI client.
from openai import OpenAI
from google.colab import userdata

# Read the OpenRouter API key via google.colab userdata instead of hardcoding it
open_router_api=userdata.get('open_router_api')

# OpenAI client pointed at OpenRouter's API base URL
client = OpenAI(
  base_url="https://openrouter.ai/api/v1",
  api_key=open_router_api,
)

class OpenRouterEmbeddingWrapper:
    """Small adapter that requests text embeddings through an OpenAI-style client.

    The client only needs to expose ``embeddings.create(...)`` returning an
    object whose ``data[0].embedding`` holds the vector.
    """

    def __init__(self, client, model_name="openai/text-embedding-3-large"):
        """Remember the client and the embedding model identifier to request.

        Args:
            client: Object exposing ``embeddings.create``.
            model_name: Model identifier sent with every embedding request.
        """
        self.model_name = model_name
        self.client = client

    def get_text_embedding(self, text):
        """Return the embedding for ``text``.

        Args:
            text: Input passed unchanged as the request's ``input``.

        Returns:
            The ``embedding`` attribute of the first item in the response data.
        """
        request_kwargs = {
            "model": self.model_name,
            "input": text,
            "encoding_format": "float",
        }
        api_result = self.client.embeddings.create(**request_kwargs)
        first_item = api_result.data[0]
        return first_item.embedding

# Embedding helper used by later cells; built on the OpenAI client with the wrapper's default model
embed_model = OpenRouterEmbeddingWrapper(client)

In [None]:
import certifi
from urllib.parse import quote_plus
from pymongo import MongoClient
from google.colab import userdata

# Connection settings are read via google.colab userdata (nothing is hardcoded here)
db_user = userdata.get('db_user')
db_pass = userdata.get('db_pass')
db_name = userdata.get('db_name')

# Build the MongoDB connection string.
# The username and password are percent-escaped so that characters such as
# '@', ':' or '/' in a credential cannot corrupt the URI. The secrets are
# therefore expected to be stored in their raw (unescaped) form.
mongo_uri = (
    f"mongodb+srv://{quote_plus(db_user)}:{quote_plus(db_pass)}"
    f"@ai-master.w5go1ll.mongodb.net/?appName={db_name}"
)

# Initialize the MongoDB client and get the database instance.
# A dedicated name is used so the OpenAI `client` created for embeddings is not
# overwritten; re-running the embedding cell would otherwise wrap a MongoClient.
mongo_client = MongoClient(mongo_uri, tls=True, tlsCAFile=certifi.where())
db = mongo_client[db_name]

# Test the connection by listing collection names
try:
    # Raises if the server cannot be reached or authentication fails
    collections = db.list_collection_names()
    print("Connection test successful: Collections found:", collections)
except Exception as e:
    print("Connection test failed:", e)

In [None]:
# Generate embeddings for nodes and save to MongoDB Atlas

# Target collection inside the database selected earlier
collection = db["documents_with_embeddings"]

# Track how many chunks are really written so the summary below is accurate
saved_count = 0

# Traverse the parsed nodes, embed each non-empty one and upsert it by node_id
for node in nodes:
    text = node.get_text()
    if not text.strip():
        continue   # skip empty / whitespace-only chunks
    # Embed exactly the text that is stored, so vector and document always agree
    embedding = embed_model.get_text_embedding(text)
    mongo_doc = {
        "doc_id": node.ref_doc_id,
        "node_id": node.node_id,
        "text": text,
        "embedding": embedding,
        "metadata": node.metadata
    }
    # Upsert keyed on node_id keeps the cell idempotent for an unchanged `nodes` list
    collection.update_one(
        {"node_id": node.node_id},
        {"$set": mongo_doc},
        upsert=True
    )
    saved_count += 1

print(f"Generated embeddings and saved {saved_count} chunks to MongoDB.")
skipped_count = len(nodes) - saved_count
if skipped_count:
    print(f"Skipped {skipped_count} empty chunks.")


In [None]:
# Import necessary classes for MongoDB Atlas vector search index creation
import time

from pymongo.operations import SearchIndexModel

# Define the vector search index
# NOTE(review): Atlas documentation appears to mark `knnVector` / `knnBeta` as
# deprecated in favour of `vectorSearch` indexes and `$vectorSearch`; if migrating,
# change this cell and the query cell together - confirm against current docs.
search_index_definition = {
    "mappings": {
        "dynamic": False,  # Disable dynamic schema mapping for safety
        "fields": {
            "embedding": {
                "type": "knnVector",        # Must be knnVector for vector search
                "dimensions": 3072,          # Match embedding vector dimension (adjust if different)
                "similarity": "cosine"      # Similarity metric (cosine is common for embeddings)
            }
        }
    }
}

index_name = "docs_vector_index"

# Atlas builds search indexes asynchronously: creation returns before the index
# can serve queries. Wait (bounded) so the query cell works on "Run All".
index_ready_timeout_s = 300
index_poll_interval_s = 5

# Check if the index already exists to avoid duplicates (idempotent creation)
try:
    existing_indexes = collection.list_search_indexes()
    existing_names = [idx["name"] for idx in existing_indexes]

    if index_name in existing_names:
        print(f"Search index '{index_name}' already exists, skipping creation.")
    else:
        # Create the new vector search index
        search_index_model = SearchIndexModel(
            definition=search_index_definition,
            name=index_name,
            type="search"
        )
        result = collection.create_search_index(model=search_index_model)
        print(f"Created vector search index '{index_name}': {result}")

    # Poll until the index reports itself queryable, or give up after the timeout
    deadline = time.monotonic() + index_ready_timeout_s
    while True:
        index_status = list(collection.list_search_indexes(index_name))
        if index_status and index_status[0].get("queryable") is True:
            print(f"Search index '{index_name}' is ready for queries.")
            break
        if time.monotonic() >= deadline:
            print(
                f"Search index '{index_name}' is not queryable after "
                f"{index_ready_timeout_s}s; queries may return no results yet."
            )
            break
        time.sleep(index_poll_interval_s)

except Exception as e:
    print("Error checking or creating search index:")
    print(e)


In [None]:
# Question to answer. Other sample questions that can be tried:
# How many productivity tips are listed in the document?
# What is the book’s approach to planning your day?
# What does the book say about email management?
query_text = "What is the summary of this doc"

# Embed the question with the same embed_model used for the stored chunks
query_embedding = embed_model.get_text_embedding(query_text)

# Nearest-neighbour stage: compare the query vector against the stored "embedding" field
knn_stage = {
    "$search": {
        "index": index_name,
        "knnBeta": {
            "vector": query_embedding,
            "path": "embedding",
            "k": 3,
        },
    }
}

# Full aggregation pipeline: similarity search followed by a cap of 3 results
search_pipeline = [knn_stage, {"$limit": 3}]

# Run the pipeline and materialise the matching documents
results = list(collection.aggregate(search_pipeline))

# Collect the matched texts for the prompt cell and echo each one with a separator
response = []
for hit in results:
    chunk_text = hit["text"]
    response.append(chunk_text)
    print(chunk_text, "----", sep="\n")

In [None]:
from llama_index.llms.openrouter import OpenRouter
from llama_index.core.llms import ChatMessage
from google.colab import userdata

# OpenRouter API key read via google.colab userdata (not hardcoded)
open_router_api=userdata.get('open_router_api')

# Chat model served through OpenRouter.
# The model id carries the provider prefix, matching the "openai/..." form
# already used for the embedding model in this notebook.
llm = OpenRouter(
    api_key=open_router_api,
    max_tokens=256,        # upper bound on the length of each generated answer
    context_window=4096,   # context size reported to llama-index for this model
    model="openai/gpt-4",
)

In [None]:
# Turn the retrieved chunks into a natural-language answer with the LLM.
# The LLM is only called when the search returned at least one chunk; otherwise
# `chat_history` would be undefined (or stale from an earlier run).
if response:
    # Join the chunks as plain text; interpolating the list directly would put a
    # Python list repr (brackets, quotes, escaped newlines) into the prompt.
    context_text = "\n\n---\n\n".join(response)

    # System instruction describing the assistant's role
    system_msg = ChatMessage(
        role="system",
        content="You are a helpful assistant. Use the provided context to answer the user's question as clearly and thoroughly as possible."
    )
    # User message carries the context, the question and the required JSON structure
    user_msg = ChatMessage(
        role="user",
        content=f"""Context:
{context_text}

Question: {query_text}

Please answer the question using only the provided context, and return your answer strictly in the following JSON format:

{{
  "answer": <your answer as a string>
}}
Only return valid JSON—do not add commentary or extra explanation outside the JSON object.
"""
    )
    chat_history = [system_msg, user_msg]

    resp = llm.chat(chat_history)
    print(resp)
else:
    print("No matching documents found.")


In [None]:
start_record_index = 20  # Zero-based offset of the first record to show (skips the first 20)
num_records_to_fetch = 25  # Maximum number of records to fetch

# Query MongoDB to get a range of records.
# sort() on _id fixes the order so the same window is returned on every run;
# skip() then skips the requested number of documents and limit() restricts
# how many documents are returned.
records_cursor = (
    collection.find({})
    .sort("_id", 1)
    .skip(start_record_index)
    .limit(num_records_to_fetch)
)

print(f"Fetching records from index {start_record_index} to {start_record_index + num_records_to_fetch - 1}:")
for i, record in enumerate(records_cursor):
    print(f"Record {start_record_index + i}:")
    print(f"  Node ID: {record.get('node_id')}")
    print(f"  Text: {record.get('text')}...")
    print("------------------")