# Lab 6: Production Readiness
## Learning Objectives
By the end of this lab, you will:
- Benchmark HNSW parameters systematically (M, ef_construction) using ChromaDB
- Understand quantization trade-offs (SQ8, PQ) for scaling
- Implement a zero-downtime reindexing strategy with collection aliasing
- Estimate vector database costs with growth projections
## Setup

In [None]:
!uv pip install chromadb numpy matplotlib -q

## Part 1: HNSW Benchmarking

HNSW (Hierarchical Navigable Small World) is the engine behind most vector databases. In this section, we move from simulation to real benchmarking using **ChromaDB**.

First, let's initialize a persistent ChromaDB client so we can "see" the data on disk.

In [None]:
import chromadb
import os

# Initialize a persistent client so the index files are written under db_path
db_path = "./chroma_db_files"
client = chromadb.PersistentClient(path=db_path)

# Create (or reopen) a tiny collection as a smoke test that ChromaDB works.
sample_coll = client.get_or_create_collection(name="verify_chroma")
# upsert rather than add: the collection persists on disk between runs, so
# re-running this cell would otherwise try to insert an id that already exists.
sample_coll.upsert(
    embeddings=[[0.1] * 384],
    ids=["verify_doc"],
)

print(f"ChromaDB initialized at: {os.path.abspath(db_path)}")
print(f"Collections: {client.list_collections()}")
print(f"Verification document: {sample_coll.get()}")

### Systematic Parameter Sweeping

Now we benchmark how `hnsw:M` and `hnsw:construction_ef` impact search latency and recall. We calculate **real recall** against a brute-force baseline.

In [None]:
import numpy as np
import time
import uuid

def benchmark_chroma_hnsw(n_vectors, dim, M_values, ef_construction_values, n_queries=50):
    """Benchmark ChromaDB HNSW settings against a brute-force recall baseline.

    For every (M, ef_construction) pair a temporary cosine-space collection is
    created on the module-level ``client``, filled with ``n_vectors`` random
    unit vectors, queried ``n_queries`` times, and then deleted again.

    Args:
        n_vectors: Number of random vectors to index (must be >= 1).
        dim: Dimensionality of each vector.
        M_values: Iterable of values passed as the ``hnsw:M`` metadata key.
        ef_construction_values: Iterable of values passed as the
            ``hnsw:construction_ef`` metadata key.
        n_queries: Number of random query vectors (must be >= 1).

    Returns:
        A list with one dict per configuration, holding the keys ``M``,
        ``ef_construction``, ``search_ms`` (mean per-query latency),
        ``build_time_s``, ``memory_mb`` (rough estimate) and ``recall_at_10``.

    Raises:
        ValueError: If ``n_vectors`` or ``n_queries`` is smaller than 1.
    """
    if n_vectors < 1 or n_queries < 1:
        raise ValueError("n_vectors and n_queries must both be at least 1")

    # Never ask for more neighbours than exist, otherwise the recall
    # denominator would count hits that cannot be returned.
    top_k = min(10, n_vectors)
    results = []

    # Generate test data (normalized, so the dot product is cosine similarity)
    vectors = np.random.randn(n_vectors, dim).astype('float32')
    vectors /= np.linalg.norm(vectors, axis=1, keepdims=True)
    queries = np.random.randn(n_queries, dim).astype('float32')
    queries /= np.linalg.norm(queries, axis=1, keepdims=True)

    # 0. Build Brute-Force Baseline for Recall
    print(f"Calculating brute-force baseline for {len(queries)} queries...")
    similarities = queries @ vectors.T
    top_idx = np.argsort(similarities, axis=1)[:, ::-1][:, :top_k]
    # Ids are stored as the stringified row index, so compare on strings.
    ground_truth = [{str(i) for i in row} for row in top_idx]

    # Convert once, outside the timed sections, so the timings reflect
    # ChromaDB work rather than repeated numpy -> list conversion.
    embeddings = vectors.tolist()
    ids = [str(i) for i in range(n_vectors)]
    query_lists = queries.tolist()

    print(f"Starting ChromaDB benchmark on {n_vectors} vectors...")

    for M in M_values:
        for efc in ef_construction_values:
            # Random suffix keeps names unique across repeated runs.
            coll_name = f"bench_{M}_{efc}_{uuid.uuid4().hex[:8]}"

            # 1. Measure Build Time
            start_build = time.perf_counter()
            collection = client.create_collection(
                name=coll_name,
                metadata={
                    "hnsw:space": "cosine",
                    "hnsw:M": M,
                    "hnsw:construction_ef": efc
                }
            )
            try:
                collection.add(embeddings=embeddings, ids=ids)
                build_time = time.perf_counter() - start_build

                # 2. Measure Search Latency & Recall
                search_times = []
                hits = 0
                for truth, q in zip(ground_truth, query_lists):
                    start_q = time.perf_counter()
                    res = collection.query(query_embeddings=[q], n_results=top_k)
                    search_times.append((time.perf_counter() - start_q) * 1000)

                    # Calculate real recall
                    hits += len(truth.intersection(res['ids'][0]))
            finally:
                # Always drop the temporary collection, even when a step
                # above fails, so aborted runs do not pile up on disk.
                client.delete_collection(coll_name)

            avg_recall = hits / (len(query_lists) * top_k)
            mean_search_ms = np.mean(search_times)

            # 3. Rough Memory Calculation: 4 bytes per float32 component
            #    plus an M * 8 byte allowance per vector for graph links.
            memory_mb = (n_vectors * dim * 4 + n_vectors * M * 8) / (1024**2)

            results.append({
                'M': M,
                'ef_construction': efc,
                'search_ms': mean_search_ms,
                'build_time_s': build_time,
                'memory_mb': memory_mb,
                'recall_at_10': avg_recall
            })

            print(f"  - Config [M={M}, efc={efc}] -> Search: {mean_search_ms:.2f}ms, Recall: {avg_recall:.3f}")

    return results

# Parameter grid for the sweep: 3 values of M x 2 values of ef_construction
M_values = [16, 32, 64]
ef_construction_values = [100, 200]
results = benchmark_chroma_hnsw(
    n_vectors=2000,
    dim=384,
    M_values=M_values,
    ef_construction_values=ef_construction_values,
)

# Display results table (one line per benchmarked configuration)
print(f"\n{'M':>4} {'ef_const':>10} {'Search(ms)':>11} {'Recall@10':>10} {'Memory(MB)':>11}")
print("-" * 55)
for row in results:
    print(f"{row['M']:>4} {row['ef_construction']:>10} {row['search_ms']:>11.2f} {row['recall_at_10']:>10.3f} {row['memory_mb']:>11.1f}")

## Part 2: Visualize Trade-offs

In [None]:
import matplotlib.pyplot as plt

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
colors = {16: '#00C9A7', 32: '#FF7A5C', 64: '#1C355E'}

# Plot 1: Speed vs Accuracy -- one line per M, points ordered by latency
for M in M_values:
    by_latency = sorted(
        (r for r in results if r['M'] == M),
        key=lambda r: r['search_ms'],
    )
    latencies = [r['search_ms'] for r in by_latency]
    recalls = [r['recall_at_10'] for r in by_latency]
    ax1.plot(latencies, recalls, 'o-', color=colors[M], label=f'M={M}', markersize=8)

ax1.set_xlabel('Search Latency (ms)', fontsize=12)
ax1.set_ylabel('Recall@10', fontsize=12)
ax1.set_title('Speed vs Accuracy (Pareto Curve)', fontsize=14, color='#1C355E')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Plot 2: Memory Footprint -- one bar per M, taken from the first result
# recorded for that M
for M in M_values:
    rows_for_m = [r for r in results if r['M'] == M]
    first_row = rows_for_m[0]
    ax2.bar(str(M), first_row['memory_mb'], color=colors[M])

ax2.set_xlabel('M Value', fontsize=12)
ax2.set_ylabel('Estimated Memory (MB)', fontsize=12)
ax2.set_title('Index Memory Footprint', fontsize=14, color='#1C355E')

plt.tight_layout()
plt.show()

### Exercise 6.1: The Optimizer
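Write a function that selects the config with the highest recall among those that stay under a latency budget and meet a minimum recall. If no config satisfies both constraints, fall back to the one whose latency is closest to the budget.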

In [None]:
def find_optimal_config(results, max_search_ms=10.0, min_recall=0.94):
    """Pick the best benchmark result within latency and recall constraints.

    Args:
        results: Non-empty list of dicts, each with at least the keys
            ``search_ms`` and ``recall_at_10``.
        max_search_ms: Latency budget; qualifying configs have
            ``search_ms <= max_search_ms``.
        min_recall: Recall floor; qualifying configs have
            ``recall_at_10 >= min_recall``.

    Returns:
        The qualifying config with the highest recall, ties broken in favour
        of the lower latency. If nothing qualifies, the config whose latency
        is closest to ``max_search_ms`` (it may violate either constraint).

    Raises:
        ValueError: If ``results`` is empty.
    """
    if not results:
        raise ValueError("results is empty; run the benchmark before selecting a config")

    valid = [
        r for r in results
        if r['search_ms'] <= max_search_ms and r['recall_at_10'] >= min_recall
    ]
    if valid:
        # Several configs often reach the same recall on small datasets; the
        # negated latency makes the fastest of them win instead of whichever
        # happened to be benchmarked first.
        return max(valid, key=lambda r: (r['recall_at_10'], -r['search_ms']))

    # Nothing satisfies both constraints: best effort, closest to the budget.
    return min(results, key=lambda r: abs(r['search_ms'] - max_search_ms))

# Select the best benchmarked config for a 15 ms latency budget and a 0.94 recall floor.
optimal = find_optimal_config(results, max_search_ms=15.0, min_recall=0.94)
# `tests` is a project-local module that is not shown in this file.
# NOTE(review): this cell belongs to Exercise 6.1 but calls check_lab_6_3 -- confirm this is the intended checker.
from tests import checks
checks.check_lab_6_3(optimal)

## Part 3: Quantization Impact

In [None]:
# Memory comparison for 1M vectors at 1536 dimensions
n, d = 1_000_000, 1536
# "bytes" is the storage per vector component; "recall" is the illustrative
# recall figure printed next to it.
index_types = [
    {"name": "Flat (F32)", "bytes": 4, "recall": 1.0},
    {"name": "SQ8 (1-byte)", "bytes": 1, "recall": 0.98},
    {"name": "PQ (bit-packed)", "bytes": 0.25, "recall": 0.92},
]

# Header and rows share the same column widths (15 / 12 / 12) so the numbers
# line up under their headings; the rule is sized to the header.
quant_header = f"{'Index Type':<15} {'Memory (GB)':>12} {'Recall':>12}"
quant_rows = [
    f"{idx['name']:<15} {(n * d * idx['bytes']) / (1024**3):>12.2f} {idx['recall']:>12.1%}"
    for idx in index_types
]

print(quant_header)
print("-" * len(quant_header))
for row in quant_rows:
    print(row)

## Part 4: Zero-Downtime Reindexing (Alias Switch)

In [None]:
class ChromaAliasManager:
    """In-memory stand-in for an alias that points at the live collection."""

    def __init__(self, client):
        # No collection is live until switch() is called for the first time.
        self._live_name = None
        self.client = client

    def get_live(self):
        """Return the collection the alias points at, or None if unset."""
        if not self._live_name:
            return None
        return self.client.get_collection(self._live_name)

    def switch(self, name):
        """Repoint the alias at the collection called ``name``."""
        print(f"  [ALIAS] Pointing 'live' -> '{name}'")
        self._live_name = name

class ReindexOrchestrator:
    """Builds a fresh "shadow" collection and then repoints the alias at it.

    Previously live collections are left in place; removing them is up to
    the caller.
    """

    def __init__(self, client, alias):
        self.client = client
        self.alias = alias

    def reindex(self, data):
        """Index ``data`` into a new collection and make it the live one.

        Args:
            data: Sequence of embeddings; ids are the stringified positions.

        Returns:
            The name of the newly created, now-live collection.

        Raises:
            Whatever the client raises while creating or filling the
            collection. In the latter case the half-built collection is
            deleted and the alias is left untouched.
        """
        import uuid  # local import so this class does not rely on other cells

        # The timestamp alone has one-second resolution, so two reindexes in
        # the same second would ask for the same name; the random suffix
        # keeps every shadow collection unique.
        new_name = f"research_assistant_{int(time.time())}_{uuid.uuid4().hex[:8]}"
        print("\nStarting Zero-Downtime Reindex...")
        shadow = self.client.create_collection(name=new_name)
        try:
            shadow.add(embeddings=data, ids=[str(i) for i in range(len(data))])
        except Exception:
            # Build failed: drop the partial shadow so it does not linger on
            # disk, keep serving from the current live collection, re-raise.
            self.client.delete_collection(new_name)
            raise
        # Only switch once the shadow collection is fully populated.
        self.alias.switch(new_name)
        print(f"  Atomic switch complete. Live collection: {new_name}")
        return new_name

# Wire the alias and orchestrator onto the client created earlier
alias = ChromaAliasManager(client)
orchestrator = ReindexOrchestrator(client=client, alias=alias)

# Initial deploy: 50 random 384-dimensional embeddings
initial_batch = np.random.randn(50, 384).tolist()
orchestrator.reindex(initial_batch)

# Re-deploy. Collection names embed a second-resolution timestamp, so wait a
# second to guarantee the next deploy sees a different one.
time.sleep(1)
refreshed_batch = np.random.randn(50, 384).tolist()
orchestrator.reindex(refreshed_batch)

## Part 5: Cost Planning

In [None]:
def estimate_cost(n_vecs, dim=1536, provider="pinecone", monthly_reads_m=1.0, monthly_writes_m=0.1):
    """Estimate the monthly cost of storing and serving ``n_vecs`` vectors.

    Storage assumes 4 bytes per vector component. The per-provider rates are
    the illustrative figures in the table below.

    Args:
        n_vecs: Number of stored vectors.
        dim: Dimensionality of each vector.
        provider: One of ``"pinecone"``, ``"weaviate"`` or ``"self_hosted"``.
            Any other value falls back to the Pinecone rates and emits a
            warning.
        monthly_reads_m: Read volume per month, in millions of requests.
        monthly_writes_m: Write volume per month, in millions of requests.

    Returns:
        Storage cost + read cost + write cost + the provider's fixed ``base``
        fee (0 when the provider has none).
    """
    # "gb": price per stored GB; "reads_m" / "writes_m": price per million
    # requests; "base": optional fixed monthly fee.
    rates = {
        "pinecone": {"gb": 0.45, "reads_m": 1.20, "writes_m": 7.0},
        "weaviate": {"gb": 0.20, "reads_m": 0.70, "writes_m": 2.5},
        "self_hosted": {"gb": 0.10, "reads_m": 0.0, "writes_m": 0.0, "base": 100},
    }
    config = rates.get(provider)
    if config is None:
        # Keep the historical fallback, but say so: silently pricing an
        # unknown provider at Pinecone rates makes comparisons misleading.
        import warnings
        warnings.warn(
            f"Unknown provider {provider!r}; using 'pinecone' rates",
            stacklevel=2,
        )
        config = rates["pinecone"]

    storage_gb = (n_vecs * dim * 4) / (1024**3)
    monthly_cost = (
        storage_gb * config["gb"]
        + (monthly_reads_m * config["reads_m"])
        + (monthly_writes_m * config["writes_m"])
    )
    monthly_cost += config.get("base", 0)
    return monthly_cost

# Compare the three providers at increasing collection sizes
scales = [100_000, 1_000_000, 10_000_000]
providers = ["pinecone", "weaviate", "self_hosted"]
print(f"{'Vectors':>12} {'Pinecone':>12} {'Weaviate':>12} {'Self-Hosted':>12}")
for s in scales:
    # Every money column is "$" plus an 11-wide number, i.e. 12 characters,
    # matching the 12-wide headings so all columns line up.
    cost_cells = "".join(
        f" ${estimate_cost(s, provider=name):>11.2f}" for name in providers
    )
    print(f"{s:>12,}{cost_cells}")

## Reflection
1. Why is an atomic alias switch better than just deleting and re-creating a collection?
2. If your recall requirement is 99%, which HNSW parameter would you prioritize?
3. What is the main cost driver for vector databases (Storage vs. Compute)?