# Assignment 4: LLM + LangChain + MongoDB NoSQL Integration
## Objective: Create a toolchain with LLM, LangChain, and MongoDB to store and query documents through an LLM

## Setup: Load Dependencies and Initialize MongoDB Connection

In [1]:
import os
from dotenv import load_dotenv
from pymongo import MongoClient
from sentence_transformers import SentenceTransformer
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import numpy as np
from datetime import datetime
import json

# Load environment variables
load_dotenv()

# Print confirmation
print("‚úì All dependencies imported successfully")

‚úì All dependencies imported successfully


## MongoDB Configuration and Connection

In [2]:
# MongoDB Connection Configuration
MONGODB_URI = os.getenv("MONGODB_URI", "mongodb://localhost:27017")
DATABASE_NAME = "assignment4_db"
DOCUMENTS_COLLECTION = "documents"
QUERIES_COLLECTION = "queries"

# Create MongoDB client
try:
    mongo_client = MongoClient(MONGODB_URI, serverSelectionTimeoutMS=5000)
    # Test connection
    mongo_client.admin.command('ping')
    print(f"‚úì Connected to MongoDB at {MONGODB_URI}")
except Exception as e:
    print(f"‚ùå Error connecting to MongoDB: {e}")
    print(f"Make sure MongoDB is running and accessible at {MONGODB_URI}")

# Get database
db = mongo_client[DATABASE_NAME]

# Get or create collections
docs_collection = db[DOCUMENTS_COLLECTION]
queries_collection = db[QUERIES_COLLECTION]

# Create indexes for better query performance
docs_collection.create_index([("document_id", 1)])
docs_collection.create_index([("source", 1)])
queries_collection.create_index([("timestamp", -1)])

print(f"‚úì Using database: {DATABASE_NAME}")
print(f"‚úì Collections: {DOCUMENTS_COLLECTION}, {QUERIES_COLLECTION}")

‚úì Connected to MongoDB at mongodb://localhost:27017
‚úì Using database: assignment4_db
‚úì Collections: documents, queries


## Initialize LLM and Embedding Model

In [3]:
# Initialize Google LLM (same as capstone1)
google_api_key = os.getenv("GOOGLE_API_KEY")
google_llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite",
    temperature=0,
    google_api_key=google_api_key
)

# Initialize Embedding Model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

print("‚úì Google LLM initialized")
print("‚úì SentenceTransformer embedding model loaded")

‚úì Google LLM initialized
‚úì SentenceTransformer embedding model loaded


## Load and Process PDF Documents

In [4]:
# Load PDF document (same as capstone1)
try:
    loader = PyPDFLoader("dataset/AI Agents guidebook.pdf")
    documents = loader.load()
    print(f"‚úì Loaded {len(documents)} pages from PDF")
except FileNotFoundError:
    print("‚ùå PDF file not found. Using sample text data instead.")
    documents = []
    sample_texts = [
        "Multi-agent systems consist of multiple intelligent agents that interact to solve complex problems.",
        "AI agents use reasoning and planning to accomplish tasks autonomously.",
        "Deep research systems combine web search, analysis, and report generation in multi-agent workflows."
    ]
    from langchain_core.documents import Document
    documents = [Document(page_content=text, metadata={"source": "sample_data", "page": i}) 
                 for i, text in enumerate(sample_texts)]

# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=300
)
chunks = text_splitter.split_documents(documents)
print(f"‚úì Created {len(chunks)} chunks from documents")

Ignoring wrong pointing object 899 0 (offset 0)


‚úì Loaded 117 pages from PDF
‚úì Created 122 chunks from documents


## Embed Chunks and Store in MongoDB

In [5]:
# Encode chunks using embedding model
print("Encoding chunks...")
encoded_chunks = [embedding_model.encode(chunk.page_content) for chunk in chunks]
print(f"‚úì Encoded {len(encoded_chunks)} chunks")

# Clear existing documents
docs_collection.delete_many({})

# Store chunks with embeddings in MongoDB
documents_to_insert = []
for i, chunk in enumerate(chunks):
    doc = {
        "document_id": f"chunk_{i}",
        "content": chunk.page_content,
        "embedding": encoded_chunks[i].tolist(),  # Store as list for MongoDB
        "source": chunk.metadata.get("source", "unknown"),
        "page": chunk.metadata.get("page", 0),
        "chunk_index": i,
        "created_at": datetime.utcnow()
    }
    documents_to_insert.append(doc)

# Insert all documents
result = docs_collection.insert_many(documents_to_insert)
print(f"‚úì Stored {len(result.inserted_ids)} documents in MongoDB")
print(f"‚úì Embedding dimension: {len(encoded_chunks[0])} dimensions")

Encoding chunks...
‚úì Encoded 122 chunks
‚úì Stored 122 documents in MongoDB
‚úì Embedding dimension: 384 dimensions


## Semantic Search Function with MongoDB

In [6]:
def search_mongodb(query_text, top_k=3):
    """
    Search MongoDB documents using semantic similarity
    """
    # Encode the query
    query_embedding = embedding_model.encode(query_text)
    query_embedding_list = query_embedding.tolist()
    
    # Retrieve all documents and calculate similarity
    all_docs = list(docs_collection.find({}, {"embedding": 1, "content": 1, "source": 1, "page": 1, "_id": 0}))
    
    # Calculate cosine similarity
    similarities = []
    for doc in all_docs:
        doc_embedding = np.array(doc["embedding"])
        query_emb = np.array(query_embedding_list)
        
        # Cosine similarity
        similarity = np.dot(doc_embedding, query_emb) / (np.linalg.norm(doc_embedding) * np.linalg.norm(query_emb))
        similarities.append({
            "content": doc["content"],
            "source": doc["source"],
            "page": doc["page"],
            "similarity": float(similarity)
        })
    
    # Sort by similarity and get top_k
    similarities.sort(key=lambda x: x["similarity"], reverse=True)
    results = similarities[:top_k]
    
    return results, query_embedding

# Test the search function
test_query = "What are AI agents?"
results, query_emb = search_mongodb(test_query, top_k=3)

print(f"\n{'='*80}")
print(f"üîç Search Results for: {test_query}")
print(f"{'='*80}")
for i, result in enumerate(results, 1):
    print(f"\nResult {i}:")
    print(f"Similarity: {result['similarity']:.4f}")
    print(f"Source: {result['source']} (Page {result['page']})")
    print(f"Content: {result['content'][:200]}...")


üîç Search Results for: What are AI agents?

Result 1:
Similarity: 0.7394
Source: dataset/AI Agents guidebook.pdf (Page 34)
Content: DailyDoseofDS.com 
 
 
AI Agents 
Projects  
 
 
 
 
 
 
 
34...

Result 2:
Similarity: 0.7243
Source: dataset/AI Agents guidebook.pdf (Page 7)
Content: DailyDoseofDS.com 
 
 
Here, the AI agents not only execute the research process end-to-end but also 
self-reÔ¨Åne their outputs, ensuring the Ô¨Ånal report is comprehensive, up-to-date, 
and well-structu...

Result 3:
Similarity: 0.6910
Source: dataset/AI Agents guidebook.pdf (Page 4)
Content: DailyDoseofDS.com 
 
 
AI Agents  
 
4...


## Query Documents through LLM

In [7]:
def query_with_llm(user_query, top_k=3):
    """
    Search MongoDB and generate LLM response using retrieved context
    """
    # Search for relevant documents
    search_results, query_embedding = search_mongodb(user_query, top_k=top_k)
    
    # Build context from search results
    context_text = "\n\n".join([
        f"Context {i+1} (Source: {result['source']}, Similarity: {result['similarity']:.4f}):\n{result['content']}"
        for i, result in enumerate(search_results)
    ])
    
    # Create prompt
    prompt_template = ChatPromptTemplate.from_template("""
You are an expert assistant that answers questions based on provided context.

Context Information:
{context}

User Question: {question}

Please provide a comprehensive answer based on the context above. If the context doesn't contain relevant information, indicate that.
""")
    
    # Generate response
    chain = prompt_template | google_llm | StrOutputParser()
    response = chain.invoke({
        "context": context_text,
        "question": user_query
    })
    
    return {
        "query": user_query,
        "response": response,
        "search_results": search_results,
        "context": context_text
    }

# Example query
user_query = "Explain the concept of multi-agent systems and their workflow"
result = query_with_llm(user_query, top_k=3)

print(f"\n{'='*80}")
print(f"üìù LLM Response")
print(f"{'='*80}")
print(f"Query: {result['query']}\n")
print(f"Response:\n{result['response']}")


üìù LLM Response
Query: Explain the concept of multi-agent systems and their workflow

Response:
Multi-agent systems involve several agents, each with a specific role and task, that work together to achieve a final outcome. These agents can also access tools to complete their tasks. In a multi-agent system, collaboration and feedback exchange are crucial for optimal performance. Instead of a single agent handling all responsibilities, a team of specialized agents can divide tasks and enhance each other's outputs.

The workflow of a multi-agent system can be structured with a manager agent that coordinates multiple sub-agents and iteratively decides on the next steps. Humans can define the hierarchy between agents, their roles, and the tools they can access.

An example of a multi-agent system in action is an AI-powered financial analysis system. In this scenario:
*   One agent is responsible for gathering data.
*   Another agent assesses risk.
*   A third agent builds the strategy.
*

## Store Query Results in MongoDB

In [8]:
# Store the query and response in MongoDB
query_document = {
    "query_id": f"query_{datetime.utcnow().timestamp()}",
    "query_text": user_query,
    "llm_response": result["response"],
    "search_results_count": len(result["search_results"]),
    "top_similarity_score": result["search_results"][0]["similarity"] if result["search_results"] else 0,
    "timestamp": datetime.utcnow(),
    "context_summary": f"Retrieved {len(result['search_results'])} relevant documents"
}

insert_result = queries_collection.insert_one(query_document)
print(f"\n‚úì Query saved to MongoDB")
print(f"  Query ID: {query_document['query_id']}")
print(f"  Timestamp: {query_document['timestamp']}")
print(f"  Search Results Used: {query_document['search_results_count']}")
print(f"  Top Similarity: {query_document['top_similarity_score']:.4f}")


‚úì Query saved to MongoDB
  Query ID: query_1768459835.772808
  Timestamp: 2026-01-15 12:20:35.772824
  Search Results Used: 3
  Top Similarity: 0.7127


## Retrieve Query History from MongoDB

In [9]:
# Retrieve all stored queries from MongoDB
all_queries = list(queries_collection.find({}).sort("timestamp", -1).limit(5))

print(f"\n{'='*80}")
print(f"üìö Query History from MongoDB")
print(f"{'='*80}")
print(f"Total queries stored: {queries_collection.count_documents({})}\n")

for i, query_doc in enumerate(all_queries, 1):
    print(f"Query {i}:")
    print(f"  Query Text: {query_doc.get('query_text')}")
    print(f"  Timestamp: {query_doc.get('timestamp')}")
    print(f"  Search Results Count: {query_doc.get('search_results_count')}")
    print(f"  Top Similarity Score: {query_doc.get('top_similarity_score'):.4f}")
    print(f"  Response Preview: {query_doc.get('llm_response')[:100]}...\n")


üìö Query History from MongoDB
Total queries stored: 1

Query 1:
  Query Text: Explain the concept of multi-agent systems and their workflow
  Timestamp: 2026-01-15 12:20:35.772000
  Search Results Count: 3
  Top Similarity Score: 0.7127
  Response Preview: Multi-agent systems involve several agents, each with a specific role and task, that work together t...



## Database Statistics and Summary

In [10]:
# Get database statistics
documents_count = docs_collection.count_documents({})
queries_count = queries_collection.count_documents({})

print(f"\n{'='*80}")
print(f"üìä MongoDB Database Statistics")
print(f"{'='*80}")
print(f"Database Name: {DATABASE_NAME}")
print(f"MongoDB URI: {MONGODB_URI}")
print(f"\nCollections:")
print(f"  - {DOCUMENTS_COLLECTION}: {documents_count} documents")
print(f"  - {QUERIES_COLLECTION}: {queries_count} queries")
print(f"\nEmbedding Model: SentenceTransformer (all-MiniLM-L6-v2)")
print(f"Embedding Dimension: {len(encoded_chunks[0])} dimensions")
print(f"\nLLM Model: Google Generative AI (Gemini 2.5 Flash Lite)")
print(f"\n‚úì Assignment 4 completed successfully!")


üìä MongoDB Database Statistics
Database Name: assignment4_db
MongoDB URI: mongodb://localhost:27017

Collections:
  - documents: 122 documents
  - queries: 1 queries

Embedding Model: SentenceTransformer (all-MiniLM-L6-v2)
Embedding Dimension: 384 dimensions

LLM Model: Google Generative AI (Gemini 2.5 Flash Lite)

‚úì Assignment 4 completed successfully!


## Advanced: Multiple Query Examples

In [11]:
# Test multiple queries
test_queries = [
    "What is deep research and how does it work?",
    "How do agents collaborate in multi-agent systems?"
]

for test_q in test_queries:
    print(f"\n{'='*80}")
    print(f"Query: {test_q}")
    print(f"{'='*80}")
    
    result = query_with_llm(test_q, top_k=2)
    
    print(f"\nResponse:\n{result['response'][:500]}...\n")
    
    # Store in MongoDB
    query_document = {
        "query_id": f"query_{datetime.utcnow().timestamp()}",
        "query_text": test_q,
        "llm_response": result["response"],
        "search_results_count": len(result["search_results"]),
        "top_similarity_score": result["search_results"][0]["similarity"] if result["search_results"] else 0,
        "timestamp": datetime.utcnow(),
        "context_summary": f"Retrieved {len(result['search_results'])} relevant documents"
    }
    queries_collection.insert_one(query_document)
    print(f"‚úì Query stored in MongoDB")


Query: What is deep research and how does it work?

Response:
Deep research, as described in the provided context, is a feature that helps users obtain detailed insights on any topic. It involves a multi-agent system that automates the research process.

Here's how it works based on the context:

1.  **User Query:** The process begins with the user submitting a query.
2.  **Deep Web Search:** A web search agent then conducts a deep web search using a platform called Linkup.
3.  **Verification and Deduplication:** A research analyst agent verifies the searc...

‚úì Query stored in MongoDB

Query: How do agents collaborate in multi-agent systems?

Response:
In multi-agent systems, agents collaborate by exchanging feedback and working together to achieve a common goal. Instead of a single agent performing all tasks, specialized agents can divide responsibilities and enhance each other's outputs. This is exemplified in an AI-powered financial analysis system where one agent gathers data, 

## Cleanup: Close MongoDB Connection

In [12]:
# Close MongoDB connection
mongo_client.close()
print("‚úì MongoDB connection closed")
print("\n‚úì Assignment 4 completed successfully!")
print(f"\nSummary:")
print(f"  - Loaded and processed PDF documents")
print(f"  - Created embeddings with SentenceTransformer")
print(f"  - Stored {documents_count} documents in MongoDB")
print(f"  - Processed {queries_count} queries through LLM")
print(f"  - Used Google Generative AI for LLM responses")
print(f"  - Implemented semantic search with MongoDB")

‚úì MongoDB connection closed

‚úì Assignment 4 completed successfully!

Summary:
  - Loaded and processed PDF documents
  - Created embeddings with SentenceTransformer
  - Stored 122 documents in MongoDB
  - Processed 1 queries through LLM
  - Used Google Generative AI for LLM responses
  - Implemented semantic search with MongoDB
