# User-Interactive RAG Pipeline

This notebook provides a complete end-to-end RAG (Retrieval-Augmented Generation) pipeline that:
1. Asks the user for the path to a PDF file
2. Asks for a Google Gemini API key
3. Runs the same steps as the existing notebooks: text extraction and chunking, embedding generation, storage in ChromaDB, and LLM-backed retrieval
4. Provides an interactive Q&A interface

## Required Libraries
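Make sure you have the following third-party libraries installed:
- `pdfplumber`
- `google-generativeai`
- `sentence-transformers`
- `chromadb`

The notebook also uses these standard-library modules, which ship with Python and need no installation:
- `pickle`
- `os`
- `warnings`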

In [None]:
# Import required libraries
import pdfplumber
import google.generativeai as genai
from sentence_transformers import SentenceTransformer
import chromadb
import pickle
import os
import warnings
from datetime import datetime
import re

# Suppress warnings for cleaner output.
# NOTE: no category or module filter is given, so this ignores every warning
# issued through the `warnings` module from this point on, not only the
# noisy ones from the libraries imported above.
warnings.filterwarnings('ignore')

# Reaching this line means every import above succeeded.
print("All libraries imported successfully!")

## User Input Section
Please provide the required inputs below:

In [None]:
# Get PDF file path from user
print("=== PDF File Input ===")
# Paths pasted from a shell or file manager are often wrapped in quotes, so
# strip those in addition to surrounding whitespace, then expand a leading "~".
pdf_path = input("Please enter the full path to your PDF file: ").strip().strip("'\"")
pdf_path = os.path.expanduser(pdf_path)

# Validate that the path points at a regular file. os.path.exists() would also
# accept a directory, which cannot be opened as a PDF later on.
if not os.path.isfile(pdf_path):
    print(f"Error: File not found at {pdf_path}")
    print("Please check the path and try again.")
else:
    print(f"✓ PDF file found: {pdf_path}")

    # The extension check is only meaningful once the file is known to exist.
    if not pdf_path.lower().endswith('.pdf'):
        print("Warning: The file doesn't appear to be a PDF. Proceeding anyway...")

In [None]:
# Get Google API key from user
# Imported here so this cell is self-contained; getpass is standard library.
from getpass import getpass

print("\n=== Google API Key Input ===")
print("Please enter your Google Gemini API key.")
print("You can get one from: https://makersuite.google.com/app/apikey")
# getpass() prompts without echoing, so the secret is not displayed while
# typing and is not captured in the cell output the way input() would be.
api_key = getpass("Enter your Google API key: ").strip()

if not api_key:
    print("Error: API key cannot be empty!")
else:
    print("✓ API key received")
    # Configure the Google AI API
    genai.configure(api_key=api_key)
    print("✓ Google AI API configured successfully")

## Step 1: PDF Text Extraction and Chunking
Extract text from the PDF and split it into manageable chunks.

In [None]:
# Pull the text out of every page of the PDF
# (same approach as 01-Data-Chunking.ipynb).
print("=== Extracting Text from PDF ===")
all_text = " "

try:
    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)
        print(f"Processing {total_pages} pages...")

        for page_number, page in enumerate(pdf.pages, start=1):
            extracted = page.extract_text()
            if not extracted:
                # Nothing was extracted for this page; move on to the next.
                continue

            all_text = all_text + extracted + "\n"

            # Progress update on every tenth page (only reached for pages
            # that produced text).
            if page_number % 10 == 0:
                print(f"Processed {page_number}/{total_pages} pages")

    print(f"✓ Successfully extracted text from {total_pages} pages")
    print(f"Total text length: {len(all_text)} characters")

except Exception as exc:
    # Report the problem, then re-raise so the cell still fails visibly.
    print(f"Error extracting text from PDF: {str(exc)}")
    raise

In [None]:
# Split text into chunks (following the same approach as existing notebooks)
print("\n=== Chunking Text ===")


def _split_oversized(paragraph, max_chunk_size):
    """Break a single paragraph into pieces no longer than max_chunk_size.

    A paragraph that already fits is returned unchanged as a one-element
    list. Otherwise it is split on whitespace and the words are packed
    greedily; a single word longer than the limit is sliced into
    fixed-size parts. Whitespace runs inside an oversized paragraph
    (including single newlines) collapse to one space as a result.
    """
    if len(paragraph) <= max_chunk_size:
        return [paragraph]

    pieces = []
    current = ""
    for word in paragraph.split():
        # range() with step max_chunk_size yields one slice for normal words
        # and several for words that are themselves longer than the limit.
        for start in range(0, len(word), max_chunk_size):
            part = word[start:start + max_chunk_size]
            if current and len(current) + 1 + len(part) > max_chunk_size:
                pieces.append(current)
                current = part
            elif current:
                current += " " + part
            else:
                current = part

    if current:
        pieces.append(current)
    return pieces


def chunk_text(text, max_chunk_size=1000):
    """Split text into chunks of at most max_chunk_size characters.

    The text is split on blank lines (paragraphs) and consecutive
    paragraphs are joined with a single space until adding the next one
    would exceed the limit. Paragraphs that are longer than the limit on
    their own are broken up at word boundaries first, so text without any
    blank lines is still chunked instead of becoming one huge chunk.

    Args:
        text: The full text to split.
        max_chunk_size: Maximum length of each chunk in characters
            (must be at least 1).

    Returns:
        A list of non-empty chunk strings; empty if text has no content.

    Raises:
        ValueError: If max_chunk_size is smaller than 1.
    """
    if max_chunk_size < 1:
        raise ValueError("max_chunk_size must be at least 1")

    chunks = []
    current_chunk = ""

    # Split by double newlines first (paragraphs)
    for paragraph in text.split('\n\n'):
        paragraph = paragraph.strip()
        if not paragraph:
            continue

        for piece in _split_oversized(paragraph, max_chunk_size):
            # The "+ 1" accounts for the joining space so a chunk never
            # ends up one character over the limit.
            if current_chunk and len(current_chunk) + 1 + len(piece) > max_chunk_size:
                chunks.append(current_chunk)
                current_chunk = piece
            elif current_chunk:
                current_chunk += " " + piece
            else:
                current_chunk = piece

    # Add the last chunk if it exists
    if current_chunk:
        chunks.append(current_chunk)

    return chunks

# Create chunks
chunks = chunk_text(all_text)
print(f"✓ Created {len(chunks)} chunks")

if chunks:
    print(f"Average chunk size: {sum(len(chunk) for chunk in chunks) // len(chunks)} characters")

    # Show a sample chunk
    print(f"\nSample chunk (first 200 chars): {chunks[0][:200]}...")
else:
    # Without this guard the average above divides by zero when extraction
    # produced no text (for example a PDF that contains only scanned images).
    print("Warning: No text chunks were created. The PDF may not contain extractable text.")

## Step 2: Generate Embeddings
Create embeddings for all text chunks using SentenceTransformer.

In [None]:
# Build embeddings for every chunk (same approach as 02-Embeddings.ipynb)
print("=== Creating Embeddings ===")
print("Loading SentenceTransformer model...")

# Same model name as the one used in the original notebooks
model = SentenceTransformer('all-MiniLM-L6-v2')
print("✓ SentenceTransformer model loaded")

print(f"Generating embeddings for {len(chunks)} chunks...")
embeddings = model.encode(chunks)
print(f"✓ Generated embeddings with shape: {embeddings.shape}")

# Persist chunks and embeddings for future use (optional). The file names
# carry a timestamp with one-second resolution.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
chunks_filename = f"user_chunks_{timestamp}.pkl"
embeddings_filename = f"user_embeddings_{timestamp}.pkl"

# Write chunks first, then embeddings, each to its own pickle file.
for target_name, payload in ((chunks_filename, chunks), (embeddings_filename, embeddings)):
    with open(target_name, 'wb') as out_file:
        pickle.dump(payload, out_file)

print(f"✓ Saved chunks to {chunks_filename}")
print(f"✓ Saved embeddings to {embeddings_filename}")

## Step 3: Store in Vector Database
Store the embeddings in ChromaDB for efficient retrieval.

In [None]:
# Setup ChromaDB (following the same approach as 04-VectorDB-Chroma.ipynb)
print("=== Setting up ChromaDB ===")

# Create a unique database path for this session
db_path = f"user_chroma_db_{timestamp}"
chroma_client = chromadb.PersistentClient(path=db_path)
print(f"✓ ChromaDB client created with path: {db_path}")

# Create collection
collection_name = f"user_collection_{timestamp}"
collection = chroma_client.get_or_create_collection(name=collection_name)
print(f"✓ Collection '{collection_name}' created")

# Add documents to the collection
print(f"Adding {len(chunks)} documents to the collection...")

# Prepare data for ChromaDB. The source file name is the same for every
# chunk, so compute it once instead of once per metadata entry.
source_name = os.path.basename(pdf_path)
ids = [f"chunk_{i}" for i in range(len(chunks))]
metadatas = [{"chunk_id": i, "source": source_name} for i in range(len(chunks))]
embedding_rows = embeddings.tolist()

# Add to the collection in batches. Chroma limits how many records a single
# add() call may carry, so one call with every chunk can fail for large PDFs.
# NOTE(review): 1000 is a conservative value; the actual limit depends on the
# installed Chroma version - confirm against its documentation.
add_batch_size = 1000
for batch_start in range(0, len(chunks), add_batch_size):
    batch_end = batch_start + add_batch_size
    collection.add(
        embeddings=embedding_rows[batch_start:batch_end],
        documents=chunks[batch_start:batch_end],
        ids=ids[batch_start:batch_end],
        metadatas=metadatas[batch_start:batch_end],
    )

if not chunks:
    # With no chunks the loop above never calls add(), leaving the collection empty.
    print("Warning: There were no chunks to add; the collection is empty.")

print(f"✓ Successfully added {collection.count()} documents to ChromaDB")

## Step 4: Interactive Q&A System
Query the system with questions and get AI-powered responses.

In [None]:
# Setup the Q&A system (following the same approach as 05-LLM-API-Retrieval.ipynb)
print("=== Setting up Q&A System ===")

def answer_question(question, num_results=3, model_name='gemini-1.5-flash'):
    """Answer a question using the RAG pipeline.

    Embeds the question with the notebook's SentenceTransformer `model`,
    retrieves the closest chunks from the ChromaDB `collection`, and asks a
    Gemini model to answer using those chunks as context.

    Args:
        question: The user's question as a string.
        num_results: Maximum number of chunks to retrieve.
        model_name: Name of the Gemini model passed to genai.GenerativeModel.

    Returns:
        A dict that always has 'retrieved_chunks' (list of strings) and
        'context' (the chunks joined by "---" separators), plus either
        'answer' (the generated text) or 'error' (a message) when the
        collection is empty or the generation call fails.
    """
    # Nothing to retrieve from: report it in the same shape callers already
    # handle for generation errors instead of failing inside the query.
    available = collection.count()
    if available == 0:
        return {
            'error': "No documents are stored in the vector database.",
            'retrieved_chunks': [],
            'context': ""
        }

    # Generate embedding for the question. Converted to a plain list to match
    # how the chunk embeddings were passed to the collection when added.
    question_embedding = model.encode(question).tolist()

    # Query the vector database, never asking for more results than there
    # are stored documents.
    results = collection.query(
        query_embeddings=[question_embedding],
        n_results=min(num_results, available)
    )

    # Get the retrieved chunks (results for the first and only query)
    retrieved_chunks = results['documents'][0]

    # Create context from retrieved chunks
    context = "\n---\n".join(retrieved_chunks)

    # Create prompt for the LLM
    prompt = f"""Context: {context}

Question: {question}

Answer:"""

    # Generate response using Google Gemini
    try:
        gemini_model = genai.GenerativeModel(model_name)
        response = gemini_model.generate_content(prompt)

        return {
            'answer': response.text,
            'retrieved_chunks': retrieved_chunks,
            'context': context
        }
    except Exception as e:
        # Keep the retrieved context in the result so the caller can still
        # show what was found even though generation failed.
        return {
            'error': f"Error generating response: {str(e)}",
            'retrieved_chunks': retrieved_chunks,
            'context': context
        }

print("✓ Q&A system ready!")
print("\nYou can now ask questions about your PDF document.")

In [None]:
# Interactive Q&A loop: read a line, handle built-in commands, otherwise
# run the question through answer_question() and print the outcome.
print("=== Interactive Q&A Session ===")
print("Ask questions about your PDF document. Type 'quit' to exit.")
print("Type 'help' for available commands.")
print("-" * 50)

while True:
    question = input("\nYour question: ").strip()
    command = question.lower()

    # Blank input: ask again.
    if not question:
        print("Please enter a question.")
        continue

    # Exit commands end the session.
    if command in ('quit', 'exit', 'q'):
        print("Thank you for using the RAG pipeline!")
        break

    if command == 'help':
        for help_line in (
            "Available commands:",
            "- Type any question about your PDF document",
            "- 'quit' or 'exit' or 'q' to exit",
            "- 'help' to see this message",
            "- 'stats' to see system statistics",
        ):
            print(help_line)
        continue

    if command == 'stats':
        print(f"System Statistics:")
        print(f"- PDF file: {os.path.basename(pdf_path)}")
        print(f"- Total chunks: {len(chunks)}")
        print(f"- Vector database: {collection.count()} documents")
        print(f"- Collection name: {collection_name}")
        continue

    print(f"\nProcessing question: {question}")
    print("Searching for relevant information...")

    # Get answer
    result = answer_question(question)

    if 'error' not in result:
        print(f"\n🤖 Answer: {result['answer']}")

        # Optionally show the chunks the answer was based on
        show_chunks = input("\nShow retrieved chunks? (y/n): ").strip().lower()
        if show_chunks == 'y':
            print("\n📄 Retrieved chunks:")
            for position, chunk in enumerate(result['retrieved_chunks'], 1):
                print(f"\nChunk {position}:")
                # Long chunks are cut to 300 characters for display.
                if len(chunk) > 300:
                    print(chunk[:300] + "...")
                else:
                    print(chunk)
    else:
        print(f"\n❌ Error: {result['error']}")
        print("\nRetrieved context for reference:")
        print(result['context'][:500] + "...")

    print("-" * 50)

## Test with Sample Questions
If you want to test the system with some sample questions, run the cell below:

In [None]:
# Run a fixed set of generic questions through the pipeline as a smoke test
print("=== Testing with Sample Questions ===")

sample_questions = [
    "What is the main topic of this document?",
    "Can you summarize the key points?",
    "What are the most important concepts mentioned?"
]

for number, question in enumerate(sample_questions, start=1):
    print(f"\n{number}. Question: {question}")
    print("   Processing...")

    result = answer_question(question)

    # Print only the first 200 characters of a successful answer.
    if 'error' not in result:
        print(f"   🤖 Answer: {result['answer'][:200]}...")
    else:
        print(f"   ❌ Error: {result['error']}")

    print("-" * 40)

## Cleanup (Optional)
Remove temporary files if needed:

In [None]:
# Optional cleanup of the files and database directory created above
import shutil

cleanup = input("Do you want to remove temporary files? (y/n): ").strip().lower()

if cleanup != 'y':
    # Anything other than "y" keeps the artifacts; list where they are.
    print("Temporary files preserved:")
    print(f"- Chunks: {chunks_filename}")
    print(f"- Embeddings: {embeddings_filename}")
    print(f"- ChromaDB: {db_path}")
else:
    try:
        # Remove the pickle files (chunks first, then embeddings)
        for pickle_file in (chunks_filename, embeddings_filename):
            if os.path.exists(pickle_file):
                os.remove(pickle_file)
                print(f"✓ Removed {pickle_file}")

        # Remove the ChromaDB directory tree
        if os.path.exists(db_path):
            shutil.rmtree(db_path)
            print(f"✓ Removed {db_path}")

        print("✓ Cleanup completed")
    except Exception as exc:
        # A failure on any step stops the remaining steps and is reported here.
        print(f"Error during cleanup: {str(exc)}")