### Data Ingestion

In [39]:
import os
import hashlib
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path
import sys

In [40]:
from langchain_community.document_loaders import PyPDFLoader

file_path = "../data/pdfs/Clod_Computing-Unit-IV[1].pdf"
loader = PyPDFLoader(file_path)

docs = loader.load()

print(len(docs))

61


In [41]:
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)
from firestore_loader import FirestoreInterviewLoader
# 1. Load PDFs
file_path = "../data/pdfs/Clod_Computing-Unit-IV[1].pdf"
pdf_loader = PyPDFLoader(file_path)
pdf_docs = pdf_loader.load()
# 2. Load Firestore Data
SERVICE_KEY_PATH = "../serviceAccountKey.json" 
COLLECTION_NAME = "interviews" 
print("Loading Firestore data...")
try:
    firestore_loader = FirestoreInterviewLoader(SERVICE_KEY_PATH, COLLECTION_NAME)
    firestore_docs = firestore_loader.load()
except Exception as e:
    print(f"Error loading Firestore data: {e}")
    firestore_docs = []
print(len(firestore_docs))

Loading Firestore data...
Connecting to Firestore collection: interviews
Successfully loaded 3 documents from Firestore.
3


In [42]:
docs = docs + firestore_docs

### Chunking of the document

In [43]:
def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # Show example of a chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs

In [54]:
chunks=split_documents(docs)
chunks

Split 64 documents into 70 chunks

Example chunk:
Content: Computing Grids vs. Clouds –Key 
Differences...
Metadata: {'producer': 'www.ilovepdf.com', 'creator': 'Microsoft® PowerPoint® 2016', 'creationdate': '2025-12-02T13:41:59+00:00', 'moddate': '2025-12-02T13:42:00+00:00', 'source': '../data/pdfs/Clod_Computing-Unit-IV[1].pdf', 'total_pages': 61, 'page': 0, 'page_label': '1'}


[Document(metadata={'producer': 'www.ilovepdf.com', 'creator': 'Microsoft® PowerPoint® 2016', 'creationdate': '2025-12-02T13:41:59+00:00', 'moddate': '2025-12-02T13:42:00+00:00', 'source': '../data/pdfs/Clod_Computing-Unit-IV[1].pdf', 'total_pages': 61, 'page': 0, 'page_label': '1'}, page_content='Computing Grids vs. Clouds –Key \nDifferences'),
 Document(metadata={'producer': 'www.ilovepdf.com', 'creator': 'Microsoft® PowerPoint® 2016', 'creationdate': '2025-12-02T13:41:59+00:00', 'moddate': '2025-12-02T13:42:00+00:00', 'source': '../data/pdfs/Clod_Computing-Unit-IV[1].pdf', 'total_pages': 61, 'page': 1, 'page_label': '2'}, page_content='1. Workflow Management\n• Computing Grids\n• Workflows are scientific, batch-oriented, and often complex.\n• Workflow scheduling is manual or semi-automated.\n• Designed for long-running jobs in research and HPC environments.\n• Workflow execution depends on resource availability across multiple \norganizations.\n• Cloud Computing\n• Workflows are ser

### Embedding and VectorDB

In [45]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [46]:
class EmbeddingManager:
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        try:
            print(f'Trying to load model: {self.model_name}')
            self.model = SentenceTransformer(self.model_name)
            print(f'Successfully loaded model: {self.model_name}')
        except Exception as e:
            print(f'Failed to load model: {self.model_name}')
    
    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        if not self.model:
            raise ValueError("Model not loaded")
        
        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings

embedding_manager = EmbeddingManager()
print(embedding_manager.model_name)

Trying to load model: all-MiniLM-L6-v2
Successfully loaded model: all-MiniLM-L6-v2
all-MiniLM-L6-v2


In [47]:
class VectorStore:
    """Manages document embeddings in a ChromaDB vector store"""
    
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            #Line to reset the document
            #self.client.delete_collection(self.collection_name)
            
            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")
        
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID based on the content to prevent duplicates
            content_hash = hashlib.md5(doc.page_content.encode('utf-8')).hexdigest()
            doc_id = f"doc_{content_hash}"
            ids.append(doc_id)
            
            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            # Document content
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())
        
        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore=VectorStore()
vectorstore

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 0


<__main__.VectorStore at 0x15ff1c27610>

In [48]:
### Convert the text to embeddings
texts=[doc.page_content for doc in chunks]

## Generate the Embeddings

embeddings=embedding_manager.generate_embeddings(texts)

##store int he vector dtaabase
vectorstore.add_documents(chunks,embeddings)

Generating embeddings for 70 texts...


Batches: 100%|██████████| 3/3 [00:02<00:00,  1.23it/s]


Generated embeddings with shape: (70, 384)
Adding 70 documents to vector store...
Successfully added 70 documents to vector store
Total documents in collection: 70


In [49]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""
    
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever
        
        Args:
            vector_store: Vector store containing document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever=RAGRetriever(vectorstore,embedding_manager)

In [50]:
rag_retriever.retrieve("What is the difference between grid computing and cloud computing?")

Retrieving documents for query: 'What is the difference between grid computing and cloud computing?'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 13.47it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_a289bcada505046b99923bd6f6eee0d6',
  'content': 'Computing Grids vs. Clouds –Key \nDifferences',
  'metadata': {'total_pages': 61,
   'creator': 'Microsoft® PowerPoint® 2016',
   'moddate': '2025-12-02T13:42:00+00:00',
   'doc_index': 0,
   'page_label': '1',
   'creationdate': '2025-12-02T13:41:59+00:00',
   'page': 0,
   'source': '../data/pdfs/Clod_Computing-Unit-IV[1].pdf',
   'content_length': 44,
   'producer': 'www.ilovepdf.com'},
  'similarity_score': 0.6890882551670074,
  'distance': 0.31091174483299255,
  'rank': 1},
 {'id': 'doc_08ae7c1ccb87d86556f3f90c3a39cb70',
  'content': '4. Availability\n• Computing Grids\n• Availability depends on voluntary, distributed resources.\n• Resources may be unreliable or go offline without notice.\n• No strict SLAs; grid nodes may come from different administrative domains.\n• Cloud Computing\n• Provides high availability backed by commercial SLAs (99.9%+).\n• Redundant datacenters across regions/zones.\n• Automated failover, re

### RAG integration with LLM

In [51]:
### Simple RAG pipeline with Groq LLM
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()

### Initialize the Groq LLM (set your GROQ_API_KEY in environment)
groq_api_key = os.getenv("GROQ_API_KEY")
llm=ChatGroq(groq_api_key=groq_api_key,model_name="groq/compound-mini",temperature=0.1,max_tokens=1024)

In [56]:
# --- Enhanced RAG Pipeline Features ---
def rag(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
    """
    RAG pipeline with extra features:
    - Returns answer, sources, confidence score, and optionally full context.
    """
    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)
    if not results:
        return {'answer': 'No relevant context found.', 'sources': [], 'confidence': 0.0, 'context': ''}
    
    # Prepare context and sources
    context = "\n\n".join([doc['content'] for doc in results])
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:300] + '...'
    } for doc in results]
    confidence = max([doc['similarity_score'] for doc in results])
    
    # Generate answer
    prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"""
    response = llm.invoke([prompt])
    
    output = {
        'answer': response.content,
        'sources': sources,
        'confidence': confidence
    }
    if return_context:
        output['context'] = context
    return output

# Example usage:
result = rag("What are some interview questions for a Senior Frontend role?", rag_retriever, llm, top_k=10, min_score=0.1, return_context=True)
print("Answer:", result['answer'])
print("Sources:", result['sources'])
print("Confidence:", result['confidence'])
print("Context Preview:", result['context'][:300])

Retrieving documents for query: 'What are some interview questions for a Senior Frontend role?'
Top K: 10, Score threshold: 0.1
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 12.40it/s]

Generated embeddings with shape: (1, 384)
Retrieved 1 documents (after filtering)





Answer: **Senior Front‑end (React + TypeScript) interview – sample questions**

1. **React fundamentals**  
   - What are the main differences between class components and functional components? When would you still use a class component?

2. **Hooks & typing**  
   - How do you type `useState`, `useReducer`, and custom hooks in TypeScript? Provide an example.

3. **Performance optimization**  
   - Explain how `React.memo`, `useMemo`, and `useCallback` work. How do you decide which one to apply in a real‑world component?

4. **State management**  
   - Compare Redux Toolkit, Zustand, and React Query for global state / server‑state handling. When would you choose one over the others?

5. **Component design**  
   - When would you prefer `interface` vs `type` for component props? Discuss pros/cons and any edge cases.

6. **Rendering patterns**  
   - What are React Server Components and how do they affect data fetching and bundle size?

7. **Testing strategy**  
   - How do you test a c