# Multi-Document Support Demo

This notebook demonstrates how to use the enhanced multi-document support in the RAG system.

In [None]:
from aimakerspace.text_utils import PDFFileLoader, CharacterTextSplitter, RecursiveTextSplitter
from aimakerspace.vectordatabase import VectorDatabase
import asyncio
import nest_asyncio
nest_asyncio.apply()

## Step 1: Load Multiple Documents with Metadata

The `PDFFileLoader` can now load multiple documents and track metadata for each.

In [None]:
# Option 1: point the loader at one PDF file
pdf_loader = PDFFileLoader("data/Stone Ridge 2025 Investor Letter.pdf")
docs_with_metadata = pdf_loader.load_documents_with_metadata()

# One summary line per (text, metadata) pair returned by the loader
print(f"Loaded {len(docs_with_metadata)} document(s)")
for document_text, document_meta in docs_with_metadata:
    source_name = document_meta['source']
    char_count = len(document_text)
    page_count = document_meta['num_pages']
    print(f"  - {source_name}: {char_count} chars, {page_count} pages")

In [None]:
# Option 2: Load multiple documents from a directory
# Uncomment the lines below to try it. Doing so rebinds `pdf_loader` and
# `docs_with_metadata`, replacing the single-document result from Option 1.
# pdf_loader = PDFFileLoader("data/investor_letters/")  # Will load all PDFs
# docs_with_metadata = pdf_loader.load_documents_with_metadata()

# print(f"Loaded {len(docs_with_metadata)} document(s)")
# for doc, meta in docs_with_metadata:
#     print(f"  - {meta['source']}: {len(doc)} chars, {meta['num_pages']} pages")

## Step 2: Split Documents While Preserving Metadata

Both `CharacterTextSplitter` and `RecursiveTextSplitter` provide a `split_texts_with_metadata()` method that preserves document source information for every chunk.

In [None]:
# Unzip the (text, metadata) pairs into two parallel lists
texts = []
metadatas = []
for document_text, document_meta in docs_with_metadata:
    texts.append(document_text)
    metadatas.append(document_meta)

# Split with metadata - CharacterTextSplitter with its default settings
text_splitter = CharacterTextSplitter()
chunks_with_metadata = text_splitter.split_texts_with_metadata(texts, metadatas)

# Alternative: swap in RecursiveTextSplitter for better semantic chunks:
# text_splitter = RecursiveTextSplitter()
# chunks_with_metadata = text_splitter.split_texts_with_metadata(texts, metadatas)

print(f"Created {len(chunks_with_metadata)} chunks")

# Each chunk record is a dict exposing 'text' and 'metadata'
first_chunk = chunks_with_metadata[0]
print(f"\nFirst chunk metadata: {first_chunk['metadata']}")
print(f"First chunk text preview: {first_chunk['text'][:100]}...")

## Step 2.5: Chunking Strategy Comparison

Let's compare two different chunking strategies to see how they affect chunk quality.

### How RecursiveTextSplitter Works

The `RecursiveTextSplitter` uses a hierarchy of separators:

1. **First** tries to split on `\n\n` (paragraphs)
2. **Then** tries `\n` (lines) if the chunks are still too large
3. **Then** tries `. ` (sentences)
4. **Then** tries ` ` (words)
5. **Finally** splits on individual characters as a last resort

This preserves semantic meaning and produces better chunks for retrieval!

In [None]:
# Shared settings so both strategies are compared on equal terms
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50


def describe_chunks(label, chunks, example_index=5, tail_chars=100):
    """Print count, average length, and the tail of one example chunk.

    Args:
        label: Splitter name shown in the "<label> Results:" heading.
        chunks: Sequence of chunk records; each must expose a 'text' entry.
        example_index: Preferred index of the example chunk. It is clamped to
            the last available chunk so short inputs do not raise IndexError.
        tail_chars: Number of trailing characters of the example to show.
    """
    print(f"{label} Results:")
    print(f"  Total chunks: {len(chunks)}")
    if not chunks:
        # Nothing to average or preview; avoids ZeroDivisionError / IndexError.
        print("  (no chunks to summarize)")
        return
    average_length = sum(len(chunk['text']) for chunk in chunks) / len(chunks)
    print(f"  Avg size: {average_length:.0f} chars")
    example = chunks[min(example_index, len(chunks) - 1)]
    print("\n  Example chunk ending:")
    print(f"  ...{example['text'][-tail_chars:]}")


### Strategy 1: CharacterTextSplitter (Simple)
# Blindly splits every N characters - may break words/sentences
char_splitter = CharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
char_chunks = char_splitter.split_texts_with_metadata(texts, metadatas)

describe_chunks("CharacterTextSplitter", char_chunks)

### Strategy 2: RecursiveTextSplitter (Sophisticated)
# Respects paragraphs, sentences, and words
recursive_splitter = RecursiveTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
recursive_chunks = recursive_splitter.split_texts_with_metadata(texts, metadatas)

print(f"\n{'='*80}")
describe_chunks("RecursiveTextSplitter", recursive_chunks)

print(f"\n{'='*80}")
print("Key Difference:")
print("  CharacterTextSplitter: Often cuts mid-word/sentence")
print("  RecursiveTextSplitter: Ends at natural boundaries (better for semantic search)")

## Step 3: Build Vector Database with Metadata

In [None]:
import os
import openai
from getpass import getpass

# Reuse a key already exported in the environment so "Restart & Run All" can
# run unattended; otherwise prompt for it without echoing the input.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API Key: ")
# Mirror the key into the environment for code that reads it from there.
os.environ["OPENAI_API_KEY"] = openai.api_key

In [None]:
# Instantiate the database, then populate it from the chunk records
vector_db = VectorDatabase()
build_coroutine = vector_db.abuild_from_list_with_metadata(chunks_with_metadata)
# The value returned by the async build is what later cells use as `vector_db`
vector_db = asyncio.run(build_coroutine)

print(f"Vector database built with {len(vector_db.vectors)} chunks")
print(f"Metadata stored for {len(vector_db.metadata)} chunks")

## Step 4: Search with Metadata

Now when we search, we can retrieve the document source information!

In [None]:
# Search with metadata
results = vector_db.search_by_text(
    "What is Stone Ridge's investment philosophy?",
    k=3,
    return_metadata=True
)

print("Search Results:\n")
for i, (text, score, metadata) in enumerate(results, 1):
    # chunk_index appears to be zero-based (the RAG pipeline in this notebook
    # adds 1 when labeling chunks), so show a 1-based position next to the
    # total; fall back to "?" when the index is missing.
    chunk_index = metadata.get('chunk_index')
    chunk_number = '?' if chunk_index is None else chunk_index + 1

    print(f"Result {i}:")
    print(f"  Source: {metadata.get('source', 'Unknown')}")
    print(f"  Chunk: {chunk_number}/{metadata.get('total_chunks', '?')}")
    print(f"  Score: {score:.4f}")
    print(f"  Text preview: {text[:100]}...")
    print()

## Step 5: Enhanced RAG Pipeline with Document Attribution

In [None]:
from aimakerspace.openai_utils.prompts import UserRolePrompt, SystemRolePrompt
from aimakerspace.openai_utils.chatmodel import ChatOpenAI

class MultiDocumentRAGPipeline:
    """Enhanced RAG pipeline that tracks document sources.

    For each query the pipeline retrieves the top-``k`` chunks together with
    their metadata, labels every chunk with its source document and position,
    and asks the LLM to answer while citing those sources.
    """

    def __init__(self, llm: "ChatOpenAI", vector_db_retriever: "VectorDatabase",
                 response_style: str = "detailed", include_scores: bool = False) -> None:
        """Store the collaborators and response options.

        Args:
            llm: Chat model; must provide ``run(messages)``.
            vector_db_retriever: Retriever; must provide
                ``search_by_text(query, k=..., return_metadata=True)`` yielding
                ``(text, score, metadata)`` triples.
            response_style: Inserted into the system prompt's
                ``{response_style}`` placeholder.
            include_scores: When true, ``run_pipeline`` returns formatted
                similarity scores; otherwise that entry is ``None``.
        """
        self.llm = llm
        self.vector_db_retriever = vector_db_retriever
        self.response_style = response_style
        self.include_scores = include_scores

    @staticmethod
    def _chunk_position(metadata: dict) -> str:
        """Return a ``"<position>/<total>"`` label for a chunk.

        ``chunk_index`` is shifted by one for display. A missing index or
        total is rendered as ``"?"`` rather than raising ``TypeError`` from
        adding 1 to the placeholder string.
        """
        index = metadata.get('chunk_index')
        position = '?' if index is None else index + 1
        return f"{position}/{metadata.get('total_chunks', '?')}"

    def run_pipeline(self, user_query: str, k: int = 4, **system_kwargs) -> dict:
        """Answer ``user_query`` from the top-``k`` retrieved chunks.

        Args:
            user_query: Question used for retrieval and placed in the user prompt.
            k: Number of chunks requested from the retriever.
            **system_kwargs: Accepted for call compatibility; not used by this
                implementation.

        Returns:
            dict with keys ``response`` (value returned by ``llm.run``),
            ``context`` (raw retriever results), ``sources`` (one dict per hit
            with ``source``, ``chunk_index`` and ``score``), ``context_count``
            and ``similarity_scores`` (list of strings, or ``None`` when
            ``include_scores`` is false).
        """
        # Retrieve relevant contexts WITH metadata
        context_list = self.vector_db_retriever.search_by_text(
            user_query, k=k, return_metadata=True
        )

        context_prompt = ""
        similarity_scores = []
        sources = []

        for i, (context, score, metadata) in enumerate(context_list, 1):
            # Tolerate hits that carry no metadata at all.
            metadata = metadata or {}
            source = metadata.get('source', 'Unknown')
            chunk_info = self._chunk_position(metadata)

            context_prompt += f"[Source {i} - {source} (chunk {chunk_info})]: {context}\n\n"
            similarity_scores.append(f"Source {i}: {score:.3f}")
            sources.append({
                "source": source,
                "chunk_index": metadata.get('chunk_index'),
                "score": float(score)
            })

        # Create prompts
        system_template = """You are a helpful assistant that answers questions based on provided context.
        Always cite which source document your information comes from.
        Keep responses {response_style}."""

        user_template = """Context Information:
{context}

Question: {user_query}

Please provide your answer and cite which source documents you used."""

        system_prompt = SystemRolePrompt(system_template)
        user_prompt = UserRolePrompt(user_template)

        formatted_system = system_prompt.create_message(response_style=self.response_style)
        formatted_user = user_prompt.create_message(
            context=context_prompt.strip(),
            user_query=user_query
        )

        return {
            "response": self.llm.run([formatted_system, formatted_user]),
            "context": context_list,
            "sources": sources,
            "context_count": len(context_list),
            "similarity_scores": similarity_scores if self.include_scores else None
        }

In [None]:
# Run the enhanced pipeline
chat_openai = ChatOpenAI()
rag_pipeline = MultiDocumentRAGPipeline(
    vector_db_retriever=vector_db,
    llm=chat_openai,
    response_style="detailed",
    include_scores=True
)

result = rag_pipeline.run_pipeline(
    "What are the key investment themes discussed in the letter?",
    k=3
)

print("Response:")
print(result['response'])
print("\n" + "="*80)
print("\nSources Used:")
for source in result['sources']:
    # The prompt given to the model labels chunks as chunk_index + 1, so show
    # the same 1-based number here; chunk_index is None when a hit has no index.
    chunk_index = source['chunk_index']
    chunk_number = '?' if chunk_index is None else chunk_index + 1
    print(f"  - {source['source']} (chunk {chunk_number}, score: {source['score']:.3f})")

## Enhanced Features Summary

### 1. Multi-Document Support
- **Document Attribution**: Know which document each answer came from
- **Historical Comparison**: Compare answers across multiple years of investor letters
- **Source Filtering**: The stored metadata makes it possible to add filtering by document name or other metadata fields
- **Transparency**: Users can verify sources and context
- **Better Context**: Understand temporal context (e.g., "In the 2024 letter...")

### 2. Advanced Chunking Strategies
- **CharacterTextSplitter**: Simple, fast, predictable chunk sizes
- **RecursiveTextSplitter**: Respects natural language boundaries (paragraphs, sentences, words)
- Choose the right strategy based on your use case:
  - Use RecursiveTextSplitter for better semantic search quality
  - Use CharacterTextSplitter for speed and simplicity