In [None]:
# Setup: make the project importable and load API credentials.
import sys
import os
from pathlib import Path

from dotenv import load_dotenv

# Add project root to path.
# NOTE(review): assumes the kernel's working directory sits one level below
# the project root (e.g. a notebooks/ folder) -- confirm if launched elsewhere.
project_root = Path.cwd().parent
project_root_str = str(project_root)
if project_root_str not in sys.path:
    # The guard keeps this cell idempotent: re-running it does not pile up
    # duplicate entries on sys.path.
    sys.path.append(project_root_str)

# Load environment variables
load_dotenv()

# Verify API keys (only reports status; execution continues either way)
if not os.getenv('OPENAI_API_KEY'):
    print("‚ö†Ô∏è Warning: OPENAI_API_KEY not set!")
else:
    print("‚úÖ Environment configured successfully!")

## Step 1: Document Loading

First, let's load some sample documents. We support PDF, Word, text, and images.

In [None]:
from src.ingestion.loaders.document_loaders import load_document, LoaderFactory

# Create a sample document (plain-text HR policy used throughout the tutorial)
sample_text = """
COMPANY VACATION POLICY

1. VACATION ENTITLEMENT

All full-time employees are entitled to paid vacation time based on their length of service:
- 0-2 years of service: 10 working days per year
- 2-5 years of service: 15 working days per year  
- 5+ years of service: 20 working days per year

2. REQUESTING VACATION

Vacation requests must be submitted through the HR portal at least 2 weeks in advance.
Requests are subject to manager approval based on team coverage needs.

3. CARRYOVER POLICY

Unused vacation days may be carried over to the following year, up to a maximum of 5 days.
Days exceeding this limit will be forfeited.

4. CONTACT

For questions about vacation policy, contact HR at hr@company.com or ext. 5555.
"""

# Save sample document, creating the target folder on first run
sample_file = project_root / "data" / "sample_documents" / "hr_policies" / "vacation_policy.txt"
sample_file.parent.mkdir(parents=True, exist_ok=True)
# Pin the encoding so the file on disk does not depend on the platform locale.
sample_file.write_text(sample_text, encoding="utf-8")

# Load the document back through the project's loader
doc_data = load_document(str(sample_file))

print(f"üìÑ Loaded: {doc_data['metadata']['filename']}")
print(f"üìä Size: {doc_data['metadata']['file_size']} bytes")
print(f"\nüìù Content preview:\n{doc_data['content'][:200]}...")

## Step 2: Chunking Strategies

Let's compare different chunking strategies to see how they split our document.

In [None]:
from src.ingestion.chunking.chunking_strategies import chunk_document

# Strategies to compare: (strategy name, keyword arguments forwarded to chunk_document)
strategies = [
    ('fixed', {'chunk_size': 200, 'chunk_overlap': 20}),
    ('recursive', {'chunk_size': 300, 'chunk_overlap': 50}),
]

# Maps strategy name -> list of chunks; later cells read results['recursive'].
results = {}

for strategy, params in strategies:
    chunks = chunk_document(
        doc_data['content'],
        strategy=strategy,
        metadata={'doc_id': 'vacation_policy', 'source': 'hr'},
        **params
    )
    results[strategy] = chunks

    print(f"\n{'='*60}")
    print(f"Strategy: {strategy.upper()} ({params})")
    print(f"{'='*60}")
    print(f"Total chunks: {len(chunks)}")

    if not chunks:
        # Without this guard the average below divides by zero.
        print("No chunks produced; skipping statistics.")
        continue

    total_chars = sum(len(c.content) for c in chunks)
    print(f"Avg chunk size: {total_chars / len(chunks):.0f} chars")

    print(f"\nFirst 2 chunks:")
    for i, chunk in enumerate(chunks[:2]):
        print(f"\n--- Chunk {i+1} ({len(chunk.content)} chars) ---")
        print(chunk.content[:150] + "...")

## Step 3: Embeddings

Convert text chunks into vector embeddings for semantic search.

In [None]:
from src.embeddings.providers.embedding_providers import EmbeddingFactory

# Build the embedding provider.
# Switch the provider to 'local' if you don't have an OpenAI API key.
embedder = EmbeddingFactory.create(provider='openai',  # or 'local' for free option
                                   model='text-embedding-3-small')

# Report which model is active and the vector size it advertises
print(
    f"üìä Embedding model: {embedder.model_name}",
    f"üìè Embedding dimension: {embedder.dimension}",
    sep="\n",
)

# Smoke test: embed a single query string and peek at the vector
sample_embedding = embedder.embed_text("What is the vacation policy?")
print(
    f"\n‚úÖ Embedding created: {len(sample_embedding)} dimensions",
    f"First 5 values: {sample_embedding[:5]}",
    sep="\n",
)

In [None]:
# Embed every chunk produced by the recursive strategy in one batch call
chunks = results['recursive']

chunk_texts = []
for chunk in chunks:
    chunk_texts.append(chunk.content)

print(f"Embedding {len(chunk_texts)} chunks...")
embeddings = embedder.embed_batch(chunk_texts)

# Summarise the batch: how many vectors came back and how wide the first one is
print(
    f"‚úÖ Embedded {len(embeddings)} chunks",
    f"Each embedding: {len(embeddings[0])} dimensions",
    sep="\n",
)

## Step 4: Vector Database Storage

Store embeddings in ChromaDB for fast retrieval.

In [None]:
from src.vectordb.chromadb_client import ChromaDBClient

# Open the tutorial collection; its data lives under <project_root>/chromadb_data/tutorial
persist_dir = project_root / "chromadb_data" / "tutorial"
vectordb = ChromaDBClient(
    collection_name="rag_tutorial",
    persist_directory=str(persist_dir),
    embedder=embedder,
)

# Show how many documents the collection currently reports
print(f"üìä Current documents in DB: {vectordb.count()}")

In [None]:
# Split each chunk into the three parallel lists the vector DB client takes
documents, metadatas, ids = [], [], []
for chunk in chunks:
    documents.append(chunk.content)
    metadatas.append(chunk.metadata)
    ids.append(chunk.chunk_id)

# Store the texts together with the embeddings computed earlier.
# NOTE(review): the collection is persisted, so re-running this cell submits
# the same ids again -- confirm how add_documents treats duplicates.
vectordb.add_documents(
    documents=documents, metadatas=metadatas, ids=ids, embeddings=embeddings
)

print(
    f"‚úÖ Added {len(documents)} documents to vector database",
    f"üìä Total documents: {vectordb.count()}",
    sep="\n",
)

## Step 5: Test Retrieval

Search for relevant chunks based on a query.

In [None]:
# Test query
query = "How many vacation days do I get after 3 years?"

print(f"üîç Query: {query}\n")

# Search.
# Stored under its own name: `results` still holds the per-strategy chunk
# lists from the chunking step, and rebinding it here would make the
# "embed all chunks" cell fail with a KeyError when re-run.
search_results = vectordb.search(query, top_k=3)

# The response is indexed per query; [0] selects the hits for our single query.
hit_documents = search_results['documents'][0]
hit_distances = search_results['distances'][0]

print(f"Found {len(hit_documents)} relevant chunks:\n")

for rank, (doc, distance) in enumerate(zip(hit_documents, hit_distances), start=1):
    # Convert distance to similarity.
    # NOTE(review): 1 - d is only a true similarity for cosine-style
    # distances -- confirm the metric the collection is configured with.
    similarity = 1 - distance

    print(f"--- Result {rank} (Similarity: {similarity:.3f}) ---")
    print(doc)
    print()

## Step 6: Build Complete RAG Pipeline

Now let's use the Basic RAG pattern to answer questions.

In [None]:
from src.rag_patterns.basic_rag import create_basic_rag

# Wire the vector store into a basic RAG pipeline that retrieves 3 chunks per query.
# Swap the model for "gpt-3.5-turbo" if you want faster/cheaper answers.
rag = create_basic_rag(vectordb=vectordb, model="gpt-4-turbo-preview", top_k=3)

print("‚úÖ RAG system initialized!")

In [None]:
# Ask questions!
questions = [
    "How many vacation days do I get with 3 years of service?",
    "How far in advance should I request vacation?",
    "Can I carry over unused vacation days?",
    "Who do I contact for vacation questions?"
]

for question in questions:
    print(f"\n{'='*70}")
    print(f"‚ùì Q: {question}")
    print(f"{'='*70}")

    # One full pipeline call per question; the result is read as a dict with
    # 'answer', 'metadata' and 'sources' keys.
    result = rag.query(question)
    sources = result['sources']

    print(f"\nüí° Answer:\n{result['answer']}")

    print(f"\nüìä Metadata:")
    print(f"  - Tokens used: {result['metadata']['tokens_used']}")
    print(f"  - Sources: {len(sources)}")

    if sources:
        top_source = sources[0]
        print(f"\nüìö Top source (score {top_source['score']:.3f}):")
        print(f"  {top_source['content'][:150]}...")
    else:
        # Retrieval can come back empty; indexing sources[0] would raise IndexError.
        print("\nNo sources were retrieved for this question.")

## Step 7: Hybrid Search (Advanced)

Combine dense (vector) and sparse (BM25) retrieval for better results.

In [None]:
from src.embeddings.hybrid import SearchResult, create_hybrid_retriever

# Build the sparse (BM25) index and the combiner over the same chunks stored
# in the vector DB, weighting dense scores 0.7 and sparse scores 0.3
bm25, hybrid = create_hybrid_retriever(
    documents=documents,
    doc_ids=ids,
    metadata_list=metadatas,
    dense_weight=0.7,
    sparse_weight=0.3,
)

print("‚úÖ Hybrid retriever created!")

# Keyword-style query for the hybrid test
query = "vacation carryover maximum"

# Sparse hits come straight from BM25
sparse_results = bm25.search(query, top_k=3)

# Dense hits come from the vector DB and need converting to SearchResult
dense_results_raw = vectordb.search(query, top_k=3)

dense_results = []
for idx, dense_id in enumerate(dense_results_raw['ids'][0]):
    dense_results.append(
        SearchResult(
            doc_id=dense_id,
            content=dense_results_raw['documents'][0][idx],
            # same distance -> similarity conversion used in the retrieval step
            score=1 - dense_results_raw['distances'][0][idx],
            metadata=dense_results_raw['metadatas'][0][idx],
            source="dense",
        )
    )

# Merge both result lists into a single top-3 ranking
combined_results = hybrid.combine_results(dense_results, sparse_results, top_k=3)

print(f"\nüîç Query: {query}")
print(f"\nüìä Hybrid Results:")
for rank, hit in enumerate(combined_results, 1):
    print(f"\n{rank}. Score: {hit.score:.3f} | Source: {hit.source}")
    print(f"   {hit.content[:100]}...")

## Next Steps

Congratulations! 🎉 You've built a complete RAG system.

### Continue Learning:

1. **📓 Next Notebook**: `02_chunking_strategies.ipynb` - Deep dive into chunking
2. **📓 Embedding Comparison**: `03_embedding_comparison.ipynb` - Compare embedding models
3. **📓 RAG Patterns**: `04_rag_patterns.ipynb` - Explore Self-RAG, CRAG, Agentic RAG
4. **📓 Evaluation**: `05_evaluation_metrics.ipynb` - Measure and improve quality

### Explore UI:

```bash
streamlit run ui/app.py
```

### Key Concepts Covered:

- ✅ Document loading and preprocessing
- ✅ Text chunking strategies
- ✅ Embeddings and vector representations
- ✅ Vector database storage and retrieval
- ✅ Basic RAG pipeline
- ✅ Hybrid search (dense + sparse)

### Resources:

- 📖 [RAG Concepts Guide](../docs/concepts/rag_overview.md)
- 📖 [API Documentation](../docs/api/README.md)
- 💻 [GitHub Repository](https://github.com/your-repo)