diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..8404926
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,24 @@
+# ── Dockerignore — keep images lean ──────────────────────────────────────────
+.git
+.github
+.venv
+node_modules
+__pycache__
+*.pyc
+.pytest_cache
+.mypy_cache
+.env
+.env.*
+!.env.production
+blob_storage/
+*.md
+!README.md
+.next
+frontend/.next
+frontend/node_modules
+deploy/
+*.egg-info
+dist/
+build/
+.DS_Store
+*.log
diff --git a/.env.production b/.env.production
new file mode 100644
index 0000000..406b40e
--- /dev/null
+++ b/.env.production
@@ -0,0 +1,70 @@
+# ── Production Environment Template ──────────────────────────────────────────
+# Copy to .env on the server and fill in ALL values.
+# Generate secrets with: python -c "import secrets; print(secrets.token_urlsafe(64))"
+
+# ── Docker Hub ────────────────────────────────────────────────────────────────
+DOCKERHUB_USER=            # Your Docker Hub username (e.g. hellonish)
+
+# ── Database (used by docker-compose.prod.yml for POSTGRES_PASSWORD) ────────
+POSTGRES_USER=singularity
+POSTGRES_PASSWORD=         # REQUIRED: generate a strong password
+
+# ── LLM providers — at least one required ───────────────────────────────────
+GROK_API_KEY=              # xAI Grok (primary model)
+GOOGLE_API_KEY=            # Google Gemini (optional)
+DEEPSEEK_API_KEY=          # DeepSeek (optional)
+
+# ── Search & retrieval APIs ─────────────────────────────────────────────────
+TAVILY_API_KEY=            # Tavily — web search
+SERPAPI_API_KEY=           # SerpAPI — web search fallback
+GITHUB_TOKEN=              # GitHub — code search (optional, public repos work)
+GOOGLE_BOOKS_API_KEY=      # Google Books (optional)
+NCBI_EMAIL=                # NCBI PubMed — required by their API policy
+
+# ── Vector Store (Qdrant Cloud) ─────────────────────────────────────────────
+# Using Qdrant Cloud instead of local container to save memory on t3.micro
+QDRANT_LOCATION=https://YOUR_CLUSTER_URL.aws.cloud.qdrant.io   # Qdrant Cloud URL
+QDRANT_API_KEY=            # Qdrant Cloud API key
+QDRANT_FORCE_IN_MEMORY=0
+QDRANT_CONNECT_TIMEOUT=3
+
+# ── Auth ─────────────────────────────────────────────────────────────────────
+JWT_SECRET=                # REQUIRED: generate with openssl rand -hex 64
+JWT_ALGORITHM=HS256
+ACCESS_TOKEN_EXPIRE_MINUTES=15
+REFRESH_TOKEN_EXPIRE_DAYS=30
+
+# BYOK encryption key for user LLM API keys
+# Generate: python -c "from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())"
+LLM_CREDENTIALS_ENCRYPTION_KEY=
+
+# ── Google OAuth ─────────────────────────────────────────────────────────────
+# Create credentials at: https://console.cloud.google.com/apis/credentials
+# Add your domain to Authorized JavaScript origins and Authorized redirect URIs:
+#   https://YOUR_DOMAIN/api/auth/callback/google
+GOOGLE_CLIENT_ID=
+GOOGLE_CLIENT_SECRET=
+
+# ── NextAuth ─────────────────────────────────────────────────────────────────
+NEXTAUTH_URL=https://YOUR_DOMAIN    # REQUIRED: your domain with https://
+NEXTAUTH_SECRET=                     # REQUIRED: generate with openssl rand -hex 32
+NEXT_PUBLIC_API_URL=https://YOUR_DOMAIN  # Same domain (Caddy proxies to API)
+
+# ── Blob Storage ─────────────────────────────────────────────────────────────
+BLOB_STORE=local
+LOCAL_BLOB_DIR=./blob_storage
+
+# ── Observability ────────────────────────────────────────────────────────────
+SENTRY_DSN=                # Optional: Sentry error tracking
+ENVIRONMENT=production
+
+# ── CORS ─────────────────────────────────────────────────────────────────────
+FRONTEND_URL=https://YOUR_DOMAIN
+CORS_ORIGINS=["https://YOUR_DOMAIN"]
+
+# ── Domain (used by Caddy) ──────────────────────────────────────────────────
+DOMAIN=YOUR_DOMAIN         # REQUIRED: for Caddy SSL cert provisioning
+
+# ── Quotas ───────────────────────────────────────────────────────────────────
+DEFAULT_DAILY_TOKEN_BUDGET=1000000
+MAX_CONCURRENT_JOBS_PER_USER=2
diff --git a/.gitignore b/.gitignore
index d48b924..618eceb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -40,3 +40,5 @@ cmd.txt
 
 /how_to_document_code.md
 .gstack/
+
+*.pem
\ No newline at end of file
diff --git a/Dockerfile.frontend b/Dockerfile.frontend
new file mode 100644
index 0000000..2012509
--- /dev/null
+++ b/Dockerfile.frontend
@@ -0,0 +1,52 @@
+# Production Dockerfile for Next.js Frontend
+# Multi-stage build using standalone output
+# Build context: project root (docker-compose.prod.yml sets context: .)
+
+# Stage 1: deps — install dependencies
+FROM node:20-alpine AS deps
+WORKDIR /app
+
+COPY frontend/package.json frontend/package-lock.json* ./
+RUN npm ci
+
+# Stage 2: builder — build the app
+FROM node:20-alpine AS builder
+WORKDIR /app
+
+COPY --from=deps /app/node_modules ./node_modules
+COPY frontend/ .
+
+# NEXT_PUBLIC_* vars are baked into the client bundle at build time.
+# Pass via --build-arg or docker compose build args.
+ARG NEXT_PUBLIC_API_URL
+ENV NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL}
+
+ENV NEXT_TELEMETRY_DISABLED=1
+RUN npm run build
+
+# Stage 3: runner — minimal production image
+FROM node:20-alpine AS runner
+WORKDIR /app
+
+ENV NODE_ENV=production
+ENV NEXT_TELEMETRY_DISABLED=1
+
+RUN addgroup --system --gid 1001 nodejs && \
+    adduser --system --uid 1001 nextjs
+
+# Copy standalone output (requires output: 'standalone' in next.config.ts)
+COPY --from=builder /app/public ./public
+COPY --from=builder --chown=nextjs:nodejs /app/.next/standalone ./
+COPY --from=builder --chown=nextjs:nodejs /app/.next/static ./.next/static
+
+USER nextjs
+
+EXPOSE 3000
+
+ENV PORT=3000
+ENV HOSTNAME="0.0.0.0"
+
+HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
+    CMD wget --no-verbose --tries=1 --spider http://localhost:3000/ || exit 1
+
+CMD ["node", "server.js"]
diff --git a/Dockerfile.prod b/Dockerfile.prod
new file mode 100644
index 0000000..9eecf6c
--- /dev/null
+++ b/Dockerfile.prod
@@ -0,0 +1,47 @@
+# Production Dockerfile for API + Worker
+# Multi-stage build optimized for small image size
+
+# Stage 1: builder — install system deps and Python packages
+FROM python:3.12-slim AS builder
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    gcc \
+    libpq-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /build
+
+COPY requirements.txt requirements_api.txt ./
+RUN pip install --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt -r requirements_api.txt
+
+# Stage 2: runtime image
+FROM python:3.12-slim
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libpq-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Non-root user for security
+RUN groupadd -r appuser && useradd -r -g appuser -d /app -s /sbin/nologin appuser
+
+# Copy installed packages from builder
+COPY --from=builder /usr/local/lib/python3.12/site-packages /usr/local/lib/python3.12/site-packages
+COPY --from=builder /usr/local/bin /usr/local/bin
+
+WORKDIR /app
+
+# Copy application code
+COPY . .
+
+# Create blob storage directory
+RUN mkdir -p /app/blob_storage && chown -R appuser:appuser /app
+
+USER appuser
+
+EXPOSE 8000
+
+HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 \
+    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/api/health')"
+
+CMD ["uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/ENGINEERING_SHOWCASE.md b/ENGINEERING_SHOWCASE.md
new file mode 100644
index 0000000..030ad4c
--- /dev/null
+++ b/ENGINEERING_SHOWCASE.md
@@ -0,0 +1,612 @@
+# Singularity — Engineering Showcase
+
+> AI-powered deep-research platform. Multi-agent LLM pipeline plans, retrieves, writes, and polishes fully-cited research reports. Full-stack web application with real-time streaming, BYOK LLM keys, per-report Q&A chat, and production deployment on a $25/month AWS VM.
+
+---
+
+## The Build
+
+| Metric | Number |
+|--------|--------|
+| Total source lines of code | ~19,000 (Python + TypeScript) |
+| Python backend | 18,996 lines across 207 files |
+| TypeScript frontend | 6,469 lines across 42 files |
+| Development duration | **13 days** (Mar 22 – Apr 3, 2026) |
+| Commits | 65 |
+| Sole contributor | Nishant Sharma |
+
+### Backend LOC by Module
+
+| Module | Lines | Purpose |
+|--------|-------|---------|
+| `agents/` | 5,455 | Multi-agent orchestration, planning, retrieval, writing, polish |
+| `api/` | 3,738 | FastAPI REST server, auth, middleware, SSE streaming |
+| `skills/` | 1,775 | 44 pluggable skills across 3 tiers |
+| `render/` | 999 | Self-contained HTML report renderer (KaTeX + Marked.js) |
+| `tools/` | 1,386 | 14 external data connector adapters |
+| `workers/` | 932 | ARQ background job workers with orphan recovery |
+| `db/` | 770 | SQLAlchemy async ORM models + Alembic migrations |
+| `vector_store/` | 528 | Qdrant wrapper with fastembed embeddings |
+| `models/` | 518 | Pydantic/dataclass contracts |
+| `llm/` | 375 | Multi-provider LLM abstraction (Grok, Gemini, DeepSeek) |
+| `patch/` | 341 | Report patch/edit service |
+| `trace/` | 337 | Execution trace logger |
+| `citations/` | 191 | Source provenance and bibliography formatting |
+| `storage/` | 175 | Blob storage abstraction (local + S3) |
+| `utils/` | 126 | JSON parser utilities |
+| `context/` | 102 | Context budget manager for bounded prompts |
+
+---
+
+## Architecture
+
+### Phase-5 Research Pipeline
+
+The core insight: **planning runs before retrieval** so every query targets a real planned section, not a vague topic.
+
+```
+Phase B — Planning    :  3 Manager agents propose report trees in parallel → Lead synthesizes
+Phase A — Retrieval   :  Tree-informed skill selection → targeted query fanout → source gate → Qdrant
+Phase C — Writing     :  Workers write sections bottom-up with evidence augmentation + faithfulness checks
+Phase D — Polish      :  Deterministic cleanup + LLM creative formatting
+```
+
+Phase letters are intentionally out of alphabetical order (B→A→C→D) to reflect the architectural decision that planning (B) must precede retrieval (A).
+
+### Full-Stack Data Flow
+
+```
+Browser (Next.js 16 + React 19)
+  ├── Google OAuth → FastAPI /api/v1/auth/google
+  ├── Dashboard → POST /api/v1/research/jobs (submit research)
+  ├── SSE stream → GET /api/v1/research/jobs/{id}/events (real-time progress)
+  ├── Report viewer → GET /api/v1/reports/{id}/versions/{v}
+  ├── Report edit → POST /api/v1/reports/{id}/versions/{v}/patch
+  └── Chat → POST /api/v1/threads/{id}/messages
+       │
+       └──→ FastAPI (uvicorn)
+              ├── Middleware: RequestID → Auth → RateLimit → UsageEmitter → CORS
+              ├── ARQ Redis: enqueue_job → Worker process
+              │     ├── run_research_job → run_pipeline() → ReportVersion
+              │     ├── run_summary_job → thread summarization
+              │     └── run_patch_job → report edits
+              ├── PostgreSQL 16: 8 tables, 12 indexes, 5 migrations
+              ├── Qdrant: per-run vector collections for RAG retrieval
+              └── Blob Storage: local filesystem or S3
+```
+
+### LLM Architecture — Pure BYOK
+
+**Every single LLM call in the entire pipeline uses the user's own model + their own API key.** The platform never pays for inference.
+
+How it works (`workers/research_job.py`):
+
+1. User selects a model from the dashboard
+2. Worker resolves their stored BYOK key for that model's provider
+3. One `_pipeline_llm_client(model_id, llm_api_key)` is created
+4. That single client is passed to every agent — planner, managers, lead, workers, retriever, source gate, polisher
+
+The `config.py` model constants are legacy — only used by the old CLI/DAG orchestrator path. The production Phase 5 pipeline creates one client from the user's chosen model and routes every call through it.
+
+**10 models available** across 3 providers:
+
+| Provider | Models | Notes |
+|----------|--------|-------|
+| **xAI Grok** | grok-3, grok-3-mini, grok-3-mini-fast | Flagship + fast variants |
+| **Google Gemini** | gemini-2.5-pro-preview-03-25, gemini-2.5-flash-preview-04-17, gemini-2.0-flash, gemini-2.0-flash-thinking-exp | Pro, Flash, and Thinking variants |
+| **DeepSeek** | deepseek-chat (V3), deepseek-reasoner (R1) | Standard + chain-of-thought |
+
+BYOK keys are **Fernet-encrypted at rest** in PostgreSQL, decrypted only at job runtime. One key per user per provider.
+
+### 3 Intensity Tiers
+
+The `StrengthConfig` class scales 14 independent parameters based on 3 user-selectable intensity levels:
+
+| Parameter | Low (1) | Medium (2) | High (3) |
+|-----------|---------|------------|----------|
+| Internal scale (legacy) | 3 | 6 | 10 |
+| Retrieval skills activated | 5 | 10 | 18 |
+| Queries per skill | 4 | 6 | 10 |
+| Section count range | 18–30 | 36–60 | 60–100 |
+| Augmentation iterations (per leaf) | 2 | 2 | 4 |
+| Web escalations (per leaf) | 1 | 1 | 3 |
+| Min results per query | 6 | 12 | 20 |
+| Expected LLM calls | ~21 | ~77 | ~130+ |
+
+---
+
+## Skills System — 44 Pluggable Skills
+
+Auto-registered via `__init_subclass__` metaclass. Zero manual wiring. Each skill is a self-contained directory with its own module + markdown doc.
+
+### Tier 1 — Retrieval (18 skills)
+
+| Skill | Data Source |
+|-------|------------|
+| `web_search` | DuckDuckGo + Tavily |
+| `academic_search` | Semantic Scholar, ArXiv |
+| `pdf_deep_extract` | Direct PDF parsing (pdfplumber + PyMuPDF) |
+| `video_search` | YouTube transcript extraction |
+| `code_search` | GitHub API |
+| `dataset_search` | HuggingFace Hub |
+| `clinical_search` | ClinicalTrials.gov |
+| `legal_search` | CourtListener |
+| `financial_search` | SEC EDGAR filings |
+| `patent_search` | Patent databases |
+| `gov_search` | Government portals |
+| `news_archive` | News archive search |
+| `book_search` | Google Books API |
+| `social_search` | Social media sources |
+| `forum_search` | Forum/thread search |
+| `standards_search` | Standards bodies (ISO, IEEE, etc.) |
+| `multimedia_search` | Multimedia content |
+| `data_extraction` | Structured data extraction |
+
+### Tier 2 — Analysis (18 skills)
+
+`causal_analysis`, `citation_graph`, `claim_verification`, `comparative_analysis`, `contradiction_detect`, `credibility_score`, `entity_extraction`, `fallback_router`, `gap_analysis`, `hypothesis_gen`, `meta_analysis`, `quality_check`, `sentiment_cluster`, `statistical_analysis`, `synthesis`, `timeline_construct`, `translation`, `trend_analysis`
+
+### Tier 3 — Output (8 skills)
+
+`annotation_gen`, `bibliography_gen`, `decision_matrix`, `exec_summary`, `explainer`, `knowledge_delta`, `report_generator`, `visualization_spec`
+
+---
+
+## 14 External Data Connectors
+
+Each tool is an isolated adapter behind a base class, with per-attempt timeout protection:
+
+| Connector | Source |
+|-----------|--------|
+| `arxiv_api` | ArXiv preprint server |
+| `clinicaltrials` | ClinicalTrials.gov |
+| `courtlistener` | CourtListener legal database |
+| `dataset_hub` | HuggingFace datasets |
+| `github_api` | GitHub repositories (PyGithub) |
+| `google_books` | Google Books API |
+| `pdf_reader` | PDF deep extraction (pdfplumber + PyMuPDF) |
+| `pubmed_api` | PubMed/NCBI (biopython) |
+| `sec_edgar` | SEC EDGAR financial filings |
+| `semantic_scholar` | Semantic Scholar academic search |
+| `standards_fetch` | Standards bodies |
+| `translation` | Translation service |
+| `web_fetch` | Generic web page fetching |
+| `youtube_transcript` | YouTube video transcripts |
+
+Coverage spans: academic papers, legal cases, medical trials, financial filings, code repositories, video content, books, datasets, standards documents, and general web.
+
+---
+
+## Vector Store — RAG with Qdrant
+
+| Feature | Detail |
+|---------|--------|
+| Embedding model | fastembed/all-MiniLM-L6-v2 |
+| Dimensions | 384 |
+| Runtime | ONNX (no PyTorch, no GPU) |
+| Weight savings | 1.4 GB lighter than sentence-transformers |
+| Distance metric | Cosine similarity |
+| Per-run collections | Isolated Qdrant collection per research run |
+| Topic cache | 0.92 similarity threshold, 7-day TTL |
+| Batch upsert | 64 points per call |
+| Payload indexing | Credibility score filter on Qdrant payloads |
+| Thread pool | All blocking ML + Qdrant ops run in thread pool to prevent event loop starvation |
+| Cloud option | Qdrant Cloud for production (saves ~42 MB RAM on-instance) |
+
+---
+
+## Backend — FastAPI
+
+### 31 REST + SSE Endpoints across 6 Routers
+
+**Auth** (`/api/v1/auth/`):
+- `POST /google` — Google OAuth login
+- `POST /refresh` — Refresh token rotation
+- `POST /logout` — Revoke refresh token
+- `GET /sse-token` — Issue 30-second single-use SSE JWT
+
+**Research** (`/api/v1/research/`):
+- `POST /jobs` — Submit research job
+- `GET /jobs/{job_id}` — Get job status
+- `GET /jobs/{job_id}/events` — SSE stream for real-time progress
+- `POST /jobs/{job_id}/cancel` — Cancel a running job
+
+**Reports** (`/api/v1/reports/`):
+- `GET /` — List user's reports
+- `GET /{report_id}` — Report metadata
+- `DELETE /{report_id}` — Delete report
+- `GET /{report_id}/versions` — List versions
+- `GET /{report_id}/versions/{version_num}` — Get version content
+- `POST /{report_id}/versions/{version_num}/patch` — Apply LLM-powered edit
+- `GET /{report_id}/versions/{version_num}/export` — Export version
+- `GET /{report_id}/threads/default` — Get default Q&A thread
+
+**Threads** (`/api/v1/threads/`):
+- `POST /` — Create thread
+- `GET /` — List threads
+- `GET /{thread_id}` — Get thread with messages
+- `PATCH /{thread_id}` — Update thread
+- `DELETE /{thread_id}` — Delete thread
+- `POST /{thread_id}/messages` — Send message
+
+**Users** (`/api/v1/users/`):
+- `GET /me` — User profile
+- `GET /me/stats` — Usage statistics
+- `GET /me/usage` — Usage time series
+- `GET /me/usage/models` — Model breakdown
+- `GET /me/usage/devices` — Device breakdown
+- `GET /me/llm-credentials` — List BYOK keys
+- `PUT /me/llm-credentials/{provider}` — Store/update BYOK key
+- `DELETE /me/llm-credentials/{provider}` — Delete BYOK key
+
+**LLM** (`/api/v1/llm/`):
+- `GET /models` — List available models
+
+### Middleware Stack
+
+Execution order: `RequestID → Auth → RateLimit → UsageEmitter → CORS`
+
+| Middleware | Purpose |
+|------------|---------|
+| `RequestIDMiddleware` | UUID per request, `X-Request-ID` header propagated request→response |
+| `AuthMiddleware` | JWT validation on all `/api/v1/*` routes (except whitelisted paths) |
+| `RateLimitMiddleware` | Redis sorted-set sliding window, 60 req/min per user, fail-open on Redis outage |
+| `UsageEmitterMiddleware` | Fire-and-forget `UsageEvent` record per request (async, never blocks response) |
+| `CORSMiddleware` | Dev: regex-matched localhost; Prod: explicit origin list |
+
+### Auth & Security
+
+| Feature | Implementation |
+|---------|---------------|
+| OAuth | Google ID token verification via `google.oauth2.id_token` |
+| Access tokens | JWT, 15-minute expiry, HS256, `type=access` claim enforced |
+| Refresh tokens | Opaque, SHA-256 hashed at rest, 30-day expiry |
+| Token rotation | Family-based rotation with **reuse detection** — reused token revokes entire family, forces re-auth |
+| SSE auth | Separate 30-second single-use JWT (`type=sse`) — prevents long-lived token in query params |
+| BYOK key storage | Fernet-encrypted at rest in PostgreSQL, decrypted only at job runtime |
+| Non-root Docker | Production API container runs as non-root user |
+| Security headers | Caddy sets `X-Content-Type-Options: nosniff`, `X-Frame-Options: DENY`, `X-XSS-Protection: 1`, `Referrer-Policy: strict-origin-when-cross-origin` |
+| SSL | Caddy auto-provisions Let's Encrypt certificates |
+| Docs disabled in prod | `/docs` and `/redoc` only served when `ENVIRONMENT=development` |
+
+---
+
+## Database — PostgreSQL 16
+
+### 8 Tables, 12 Indexes, 5 Migrations
+
+**Tables** (`db/models.py`, 335 lines):
+
+| Table | Purpose | Key Columns |
+|-------|---------|-------------|
+| `users` | User accounts (Google OAuth) | `google_sub` (unique), `email` (unique), `daily_token_budget`, `is_active` |
+| `refresh_tokens` | Refresh token rotation | `token_hash` (SHA-256), `family_id`, `revoked_at` |
+| `user_llm_credentials` | BYOK encrypted API keys | `provider`, `encrypted_secret` (Fernet), unique per (user, provider) |
+| `reports` | Research reports | `query`, `strength`, `user_id` |
+| `report_versions` | Immutable version history | `version_num`, `content_inline`, `content_uri`, `content_hash` (SHA-256), `char_count` |
+| `research_jobs` | Background research jobs | `status`, `strength`, `llm_model_id`, `current_phase`, `attempts`, `max_attempts` |
+| `threads` | Chat threads per report | `report_id`, `summary`, `canonical_report_qa` |
+| `messages` | Chat messages | `role`, `content`, `token_count` |
+| `usage_events` | Append-only analytics | `event_type`, `route`, `duration_ms`, `device_type`, `os`, `browser`, `ip_address` |
+
+**Indexes**:
+
+| Index | Table | Columns | Type |
+|-------|-------|---------|------|
+| `ix_users_google_sub` | users | `google_sub` | B-tree, unique |
+| `idx_rt_user_id` | refresh_tokens | `user_id` | B-tree |
+| `uq_user_llm_provider` | user_llm_credentials | `user_id, provider` | Unique constraint |
+| `idx_user_llm_credentials_user` | user_llm_credentials | `user_id` | B-tree |
+| `idx_reports_user_id` | reports | `user_id, created_at` | Compound |
+| `uq_report_version` | report_versions | `report_id, version_num` | Unique constraint |
+| `idx_rv_report_id` | report_versions | `report_id, version_num` | Compound |
+| `idx_rj_idempotency` | research_jobs | `user_id, idempotency_key` | Partial unique (PostgreSQL `WHERE idempotency_key IS NOT NULL`) |
+| `idx_rj_user_status` | research_jobs | `user_id, status` | Compound |
+| `idx_threads_user_id` | threads | `user_id, created_at` | Compound |
+| `idx_messages_thread` | messages | `thread_id, created_at` | Compound |
+| `idx_ue_user_day` | usage_events | `user_id, created_at` | Compound |
+| `idx_ue_event_type` | usage_events | `user_id, event_type, created_at` | Compound |
+
+**Connection pooling** (`db/session.py`):
+
+| Setting | Value |
+|---------|-------|
+| `pool_size` | 5 |
+| `max_overflow` | 10 (burst up to 15 total) |
+| `pool_pre_ping` | True (auto-detect stale connections) |
+| `pool_recycle` | 300s (recycle connections every 5 min) |
+| Driver | asyncpg (fully async) |
+| Session | `async_sessionmaker`, `expire_on_commit=False` |
+
+### PostgreSQL Tuned for 2GB VM
+
+| Parameter | Value | Notes |
+|-----------|-------|-------|
+| `shared_buffers` | 64 MB | 25% of container's 256 MB cap |
+| `effective_cache_size` | 128 MB | |
+| `work_mem` | 2 MB | |
+| `maintenance_work_mem` | 32 MB | |
+| `max_connections` | 50 | |
+
+---
+
+## Worker System — ARQ (Redis Queue)
+
+| Setting | Value |
+|---------|-------|
+| Max concurrent jobs | 4 global |
+| Job timeout | 30 minutes hard cap |
+| Max retries | 3 attempts |
+| Result retention | 1 hour in Redis |
+| Job functions | `run_research_job`, `run_debug_mock_research_job`, `run_patch_job`, `run_summary_job` |
+
+### Job Lifecycle
+
+1. API creates `Report` + `ResearchJob`, enqueues to ARQ
+2. Worker picks up job, marks `running`, publishes SSE `job_status` event
+3. `run_pipeline()` executes phases B → A → C → D with phase + activity callbacks
+4. Each callback publishes SSE events via Redis pub/sub (`job:{id}:events` channel)
+5. On success: saves `ReportVersion` (inline if <500 KB, else blob storage), marks `done`
+6. On cancel: `CancelToken` checks `expires_at` at every phase callback
+7. On failure: increments `attempts`, marks `failed` if exhausted, re-raises for ARQ retry otherwise
+
+### Orphan Recovery
+
+On worker startup, `_recover_orphaned_jobs()` scans for jobs in `running` or `pending` state (left over from a previous crash), marks them `failed`, and publishes `job_error` SSE events so the frontend surfaces the error immediately.
+
+### Idempotency
+
+Job creation supports an `idempotency_key`. If a job with the same key exists within the last 24 hours (enforced by a PostgreSQL partial unique index), the existing job is returned instead of creating a duplicate.
+
+---
+
+## Concurrency & Rate Limiting
+
+| Control | Value | Implementation |
+|---------|-------|----------------|
+| Max concurrent jobs (global) | 4 | ARQ `max_jobs` |
+| Max concurrent jobs per user | 2 | DB query count on `pending`/`running` jobs |
+| API rate limit | 60 req/min per user | Redis sorted-set sliding window (ZREMBYRANGE + ZCARD + ZADD in atomic pipeline) |
+| Daily token budget per user | 1,000,000 tokens | Sum of `UsageEvent.prompt_tokens + completion_tokens` for today |
+| Job expiry | 35 minutes from creation | `CancelToken` checks at every phase callback |
+| Rate limit fail-open | Yes | If Redis is unreachable, requests pass through with logged warning |
+
+---
+
+## Redis Layer
+
+| Setting | Value |
+|---------|-------|
+| Image | `redis:7-alpine` |
+| Memory cap | 128 MB container, 96 MB Redis `maxmemory` |
+| Eviction policy | `allkeys-lru` — self-cleaning under pressure |
+| Persistence | AOF (`appendonly yes`) + RDB snapshots (`save 60 1000`) |
+| Role | Job queue (ARQ) + SSE pub/sub channels + rate limiting sorted sets |
+
+Rate limiting uses a **real sliding window** (not a fixed counter): `ZREMBYRANGE` trims old entries, `ZCARD` counts the current window, `ZADD` adds the new request — all in a single Redis pipeline transaction. TTL auto-cleans the key.
+
+---
+
+## Real-Time Streaming Architecture
+
+```
+Worker (Python) → Redis pub/sub → FastAPI SSE endpoint → Browser EventSource
+```
+
+- Per-job Redis channel: `job:{job_id}:events`
+- Two event types: `job_status` (phase transitions) + `job_activity` (granular progress)
+- Frontend `ResearchOperationsFeed` renders a real-time storyboard
+- Activity events: `pipeline_start`, `domain_classified`, `managers_spawn`, `retrieval_plan_ready`, `retrieval_skill_finished`, `writers_depth`, `polish_started`, `polish_finished`
+- 10-minute warning shown on frontend when job runs long
+
+---
+
+## Frontend — Next.js 16 + React 19
+
+| Page | File | Lines | Purpose |
+|------|------|-------|---------|
+| Landing | `src/app/page.tsx` | — | Product landing page |
+| Dashboard | `src/app/dashboard/page.tsx` | 764 | Research submission + job history |
+| Report Viewer | `src/app/reports/[id]/page.tsx` | 889 | Report display + TOC + patch modal |
+| Profile | `src/app/profile/page.tsx` | — | BYOK key management + usage stats |
+
+### Key Components (19 total)
+
+| Component | Purpose |
+|-----------|---------|
+| `ChatPanel` (703 lines) | Full chat interface with streaming responses |
+| `ReportViewer` | Markdown rendering with KaTeX math + GFM tables |
+| `ReportTOC` | Table of contents sidebar |
+| `ResearchOperationsFeed` | Real-time activity feed during research |
+| `PatchModal` | Report editing via LLM patches |
+| `SelectionToolbar` | Text selection actions |
+| `ChatModelPicker` | Model selection for chat |
+| `MessageBubble` | Chat message rendering |
+| `StreamingMessage` | Streaming chat message display |
+| `chat_history_sidebar` | Chat thread navigation |
+| `profile_llm_keys_section` | BYOK key management UI |
+
+### Frontend Libraries
+
+| Category | Libraries |
+|----------|-----------|
+| Framework | Next.js 16, React 19 |
+| Auth | NextAuth v5 (beta) |
+| State | Zustand |
+| Data fetching | TanStack React Query |
+| UI primitives | Radix UI |
+| Charts | Recharts |
+| Animation | Framer Motion |
+| Math rendering | KaTeX |
+| Markdown | react-markdown, remark-gfm, remark-math, rehype-katex |
+| Styling | Tailwind CSS 4 |
+| Icons | Lucide React |
+| Toasts | Sonner |
+
+### Frontend Lib Layer (12 modules)
+
+| Module | Purpose |
+|--------|---------|
+| `api.ts` | API client (10 KB) |
+| `sse.ts` | Server-Sent Events client |
+| `research_activity_presenter.ts` | Activity feed data transformation |
+| `normalize_math_markdown.ts` | Math markdown normalization |
+| `normalize_gfm_pipe_tables.ts` | GFM table normalization |
+| `normalize_chat_assistant_markdown.ts` | Chat markdown normalization |
+| `llm_model_groups.ts` | Model grouping for UI |
+| `byok_recommended_models.ts` | BYOK model recommendations |
+| `public_api_base_url.ts` | Public API URL resolution |
+| `debug_research_mock.ts` | Debug mock research mode |
+| `cn.ts` | Class name utility |
+| `utils.ts` | General utilities |
+
+---
+
+## Observability
+
+| Layer | What's Tracked |
+|-------|---------------|
+| **Request tracing** | `X-Request-ID` on every request/response, propagated through all middleware |
+| **Usage analytics** | `UsageEvent` table — append-only records: event_type, route, duration_ms, success, status_code, user_agent, IP, device_type, OS, browser, session_id |
+| **Phase tracking** | `ResearchJob.current_phase` column updated in DB at each phase transition + SSE event |
+| **Activity feed** | `job_activity` SSE events for pipeline lifecycle milestones |
+| **Sentry** | Optional — FastAPI + SQLAlchemy integrations, 20% trace sample rate |
+| **Execution trace** | `TraceLogger` module — structured debug logging for full pipeline observability |
+| **Job timing** | `created_at`, `started_at`, `finished_at`, `expires_at` — full lifecycle columns |
+
+---
+
+## Resilience & Fault Tolerance
+
+| Scenario | Behavior |
+|----------|----------|
+| Worker crash mid-job | On restart, orphan recovery scans for `running` + `pending` jobs, marks them failed, publishes SSE error |
+| Job timeout | `CancelToken` checks `expires_at` at every phase callback, raises `asyncio.CancelledError` |
+| LLM 429 rate limit | Caught at job creation, returns 429 with actionable message |
+| Tool/adapter hang | Per-attempt timeout on all 14 data connector tools |
+| Redis down | Rate limiter fails open (passes through, logs warning); healthcheck catches it |
+| PostgreSQL down | Healthcheck `/api/ready` catches it; connection pool `pre_ping` auto-recovers |
+| OOM | Every Docker container has hard memory cap; worker gets 1 GB headroom |
+| Large reports | Inline storage if <500 KB, automatic S3/blob offload if larger |
+| Duplicate job submission | Idempotency key with PostgreSQL partial unique index (24-hour window) |
+
+---
+
+## Storage
+
+| Mode | Trigger | Storage |
+|------|---------|---------|
+| Inline | Report < 500 KB | `content_inline` column in `report_versions` table |
+| Blob overflow | Report >= 500 KB | S3, Cloudflare R2, MinIO, or local filesystem |
+| Content integrity | Always | SHA-256 hash stored in `content_hash` column |
+| Versioning | Always | Append-only `ReportVersion` rows with monotonically increasing `version_num` |
+
+---
+
+## Domain Intelligence
+
+- **11 domain categories** with per-domain retrieval strategies: ML Research, Legal, Medical/Clinical, Journalism, Market Research, Policy Analysis, Engineering Standards, Historical/Humanities, Product/UX, Finance/Investment, General
+- **110 sub-domain entries** with specialized source recommendations
+- **5 audience output rules** — adjusts report style based on intended audience
+- **2-pass source gate** — credibility filtering before and after retrieval
+
+---
+
+## Other Engineering Details
+
+| Component | Detail |
+|-----------|--------|
+| **ContextBudgetManager** | Bounded prompt assembly: direct deps get 3,000 tokens, indirect get 350 chars, hard 32K char total cap, credibility-ordered truncation |
+| **CitationRegistry** | Full provenance tracking from source → chunk → report section → bibliography |
+| **HTML Report Renderer** | 996-line self-contained HTML generator with embedded KaTeX + Marked.js, light theme, LaTeX math, GFM tables |
+| **Report Patch System** | Slug generation, validation, atomic edit application via LLM |
+| **Blob Storage Abstraction** | `LocalStorage` and `S3BlobStore` behind common interface, configurable via `BLOB_STORE` env var |
+| **Structured JSON extraction** | Robust parser for extracting JSON from LLM output with markdown fences, extra prose, or malformed quotes |
+| **Junk URL filter** | Frozenset of 9 known junk/redirector domains filtered from reference lists |
+
+---
+
+## Production Deployment
+
+### AWS t3-small (2 GB RAM) — ~$25/month
+
+| Service | Image | Memory Cap | Notes |
+|---------|-------|------------|-------|
+| Caddy | `caddy:2-alpine` | 64 MB | Reverse proxy + auto-SSL (Let's Encrypt) |
+| PostgreSQL 16 | `postgres:16-alpine` | 256 MB | Tuned for low-memory |
+| Redis 7 | `redis:7-alpine` | 128 MB | LRU eviction, AOF persistence |
+| API Server | Multi-stage Python 3.12-slim | 256 MB | Non-root user in production |
+| Background Worker | Same image | 1 GB | ML + LLM call headroom |
+| Next.js Frontend | Standalone build | 128 MB | Pre-built on Docker Hub |
+
+**Total committed: ~1.8 GB** on a 2 GB box.
+
+### Docker Strategy
+
+- Multi-stage builds for both API and frontend
+- Production images pre-built and pulled from Docker Hub (`hellonish/singularity-api`, `hellonish/singularity-frontend`)
+- `docker-compose.prod.yml` — pull-only, no on-server builds
+- `docker-compose.test.yml` — simulates t3-small memory constraints locally
+- Every container has healthchecks
+- Qdrant offloaded to Qdrant Cloud (saves ~42 MB on-instance)
+
+### CI/CD — GitHub Actions
+
+Pipeline runs on every push/PR:
+- Ubuntu runner with PostgreSQL 16 + Redis 7 service containers
+- Python 3.12, pip install, Alembic migrations against real PostgreSQL, pytest
+- Qdrant in `FORCE_IN_MEMORY=1` mode for tests
+- 19 test files across skills and tool integration tests
+
+---
+
+## The Cost Model
+
+The platform **never pays for LLM inference**. Every call uses the user's BYOK key. Infrastructure cost is fixed:
+
+| Resource | Monthly Cost |
+|----------|-------------|
+| AWS t3-small (2 GB) | ~$25 |
+| Qdrant Cloud (free tier) | $0 |
+| Domain + SSL (Caddy / Let's Encrypt) | ~$1 |
+| **Total** | **~$26/month** |
+
+Fixed-cost, zero-marginal-cost-per-user model for compute. Variable costs limited to Postgres storage growth (reports, usage events), bounded by the daily token quota per user.
+
+---
+
+## Development Journey
+
+| Phase | Date | Milestone |
+|-------|------|-----------|
+| Foundation | Mar 22 | Base deep research agent with DAG orchestrator |
+| Skills + Tools | Mar 25 | 44 skills, 14 tools, planner, LLM routing |
+| Orchestration v2 | Mar 26 | Phase-5 pipeline (planning before retrieval), HTML renderer |
+| Report Quality | Mar 26 | LaTeX math, GFM tables, citation formatting, JSON parsing |
+| Web Application | Apr 01–02 | Next.js 16 frontend, FastAPI backend, Google OAuth, SSE streaming |
+| Production Hardening | Apr 03 | Docker optimization, AWS t3-small deployment, healthchecks |
+| Worker Resilience | Apr 03 | OOM fix, orphan recovery, rate limit handling, timeout protection |
+| Embedding Optimization | Apr 03 | sentence-transformers → fastembed (1.4 GB lighter), Qdrant payload indexing |
+
+---
+
+## Summary Numbers
+
+| | |
+|---|---|
+| Lines of code | ~19,000 (Python) + ~6,500 (TypeScript) |
+| Backend files | 207 Python |
+| Frontend files | 42 TypeScript |
+| Pluggable skills | 44 (18 retrieval + 18 analysis + 8 output) |
+| Data connectors | 14 tools |
+| LLM providers | 3 (xAI Grok, Google Gemini, DeepSeek) |
+| Available models | 10 |
+| API endpoints | 31 |
+| Database tables | 8 |
+| Database indexes | 12 (including 1 partial unique) |
+| Migrations | 5 |
+| Middleware layers | 5 (RequestID, Auth, RateLimit, UsageEmitter, CORS) |
+| Docker containers | 6 in production |
+| Production VM | AWS t3-small, 2 GB RAM, ~$25/month |
+| Total prod memory | ~1.8 GB across 6 containers |
+| Build time | 13 days, 65 commits, 1 engineer |
diff --git a/README.md b/README.md
index 4cdbf5d..6bf8b34 100644
--- a/README.md
+++ b/README.md
@@ -1,489 +1,219 @@
 # Singularity
 
-Evidence-driven technical documentation for the current codebase implementation.
+An AI-powered deep-research platform that orchestrates a multi-agent LLM pipeline to plan, retrieve, write, and polish fully-cited research reports — with real-time streaming, per-report Q&A chat, and production deployment on a $25/month AWS VM.
 
-This README follows `how_to_document_code.md`:
-- documents only implemented behavior,
-- maps implementation to engineering concepts,
-- states design choices with strengths and limitations,
-- explains serialized vs parallel flow,
-- includes practical usage and cost/scale approximations.
-
----
-
-## 0) Quick Start
-
-```bash
-# Setup
-python3 -m venv .venv
-source .venv/bin/activate
-pip install -r requirements.txt
-
-# Primary path (Phase-5)
-python -m agents.orchestrator.cli "research question" --strength 5 --audience expert
-
-# Legacy DAG path
-python -m agents.orchestrator.cli "research question" --depth standard
-
-# Chat REPL
-python -m agents.chat.cli
+```
+┌─────────────────────────────────────────────────────────────────────┐
+│                        Browser (Next.js 16)                         │
+│  Dashboard │ Report Viewer │ Chat Panel │ Profile (BYOK Keys)       │
+└──────┬──────────────┬──────────────┬──────────────┬─────────────────┘
+       │              │              │              │
+       ▼              ▼              ▼              ▼
+┌──────────────────────────────────────────────────────────────────────┐
+│                     Caddy (auto-SSL, Let's Encrypt)                  │
+│                      :80/:443 → reverse proxy                        │
+└──────┬──────────────────────┬────────────────────────────────────────┘
+       │ /api/*               │ /*
+       ▼                      ▼
+┌──────────────────┐  ┌───────────────────┐
+│   FastAPI API    │  │  Next.js Frontend  │
+│   (256 MB cap)   │  │   (128 MB cap)     │
+└──┬───┬───┬───┬───┘  └───────────────────┘
+   │   │   │   │
+   │   │   │   └──── UsageEmitter (fire-and-forget analytics)
+   │   │   └──────── RateLimitMiddleware (Redis sliding window)
+   │   └───────────── AuthMiddleware (JWT + Google OAuth)
+   └───────────────── RequestIDMiddleware (UUID per request)
+       │
+       ▼ enqueue_job()
+┌──────────────────┐
+│   ARQ Worker     │
+│   (1 GB cap)     │
+│                  │
+│  Phase B ──────▶ Planning: 3 Managers propose trees → Lead synthesizes
+│       │          (asyncio.gather — parallel)
+│       ▼
+│  Phase A ──────▶ Retrieval: tree-informed skill selection → 18 skills
+│       │          → source gate → chunk + embed → Qdrant
+│       ▼
+│  Phase C ──────▶ Writing: bottom-up section workers + augmentation
+│       │          loops + faithfulness checks
+│       ▼
+│  Phase D ──────▶ Polish: deterministic cleanup + LLM formatting
+│                  │
+│                  └─── User's BYOK model + API key for EVERY LLM call
+└──┬───────┬───────┘
+   │       │
+   ▼       ▼
+┌──────┐ ┌──────────────┐
+│Redis │ │  PostgreSQL   │
+│128MB │ │    256MB       │
+└──────┘ └──────────────┘
 ```
 
 ---
 
-## 1) Navigation
-
-- [2) Project Purpose](#2-project-purpose)
-- [3) Runtime Interfaces](#3-runtime-interfaces)
-- [4) Repository Structure](#4-repository-structure)
-- [5) Architecture](#5-architecture)
-- [6) Execution Semantics](#6-execution-semantics)
-- [7) Core Modules and Concepts](#7-core-modules-and-concepts)
-  - [7.1 Agents](#71-agents)
-  - [7.2 Skills and Tier Breakdown](#72-skills-and-tier-breakdown)
-  - [7.3 Tools](#73-tools)
-  - [7.4 CitationRegistry](#74-citationregistry)
-  - [7.5 ContextBudgeting](#75-contextbudgeting)
-  - [7.6 LLM Clients and Router](#76-llm-clients-and-router)
-  - [7.7 Models and Contracts](#77-models-and-contracts)
-  - [7.8 Vector Store](#78-vector-store)
-- [8) Generalized Process Pipelines](#8-generalized-process-pipelines)
-- [9) Scale and Cost Approximation](#9-scale-and-cost-approximation)
-- [10) Version Timeline (V1 -> V2)](#10-version-timeline-v1---v2)
-- [11) Decision Register](#11-decision-register)
-- [12) Practical Command Surface](#12-practical-command-surface)
-- [13) Notes](#13-notes)
-
----
-
-## 2) Project Purpose
-
-Singularity is a research-agent codebase with two active execution paths:
-
-1. **Legacy DAG orchestrator** via `run_orchestrator`
-2. **Phase-5 pipeline (primary)** via `run_pipeline`
-
-Current system composition:
-- planning agents,
-- retrieval + analysis + output skills,
-- external data tools,
-- LLM reasoning/writing clients,
-- Qdrant-backed retrieval context,
-- report assembly + polish.
-
-Primary artifact:
-- report rendered to `final_report.html` by orchestrator CLI.
-
----
-
-## 3) Runtime Interfaces
-
-### 3.1 Orchestrator CLI
-
-- **Path**: `agents/orchestrator/cli.py`
-- **Purpose**: one-shot research execution and report rendering.
-
-```bash
-# Phase-5 product path (primary)
-python -m agents.orchestrator.cli "your question" --strength 5 --audience practitioner
-
-# Legacy DAG path
-python -m agents.orchestrator.cli "your question" --depth standard
-
-# Optional trace export
-python -m agents.orchestrator.cli "your question" --strength 7 --trace
-```
-
-### 3.2 Chat REPL
+## Interesting Engineering Decisions
 
-- **Path**: `agents/chat/cli.py`
-- **Purpose**: interactive dual-mode session (`chat` / `research`).
+### 1. Planning Before Retrieval
 
-```bash
-python -m agents.chat.cli
-python -m agents.chat.cli --extended
-python -m agents.chat.cli --model grok-3
-```
+The pipeline phases are intentionally ordered B → A → C → D (out of alphabetical order). Planning runs **before** retrieval because the original DAG orchestrator (retrieval-first) produced drifted, unfocused evidence — it gathered sources against a vague topic and then tried to shape a report around whatever it found.
 
-Implemented in-session commands:
-- `/model`, `/mode`, `/extended`, `/clear`, `/history`, `/skills`, `/quit`
+By committing to a section tree first, every retrieval query targets a real planned section. The Lead agent synthesizes three independent Manager proposals into a final tree, and then the Retriever maps skills to specific leaf sections. This costs extra LLM calls upfront (4 for planning) but dramatically improves evidence relevance and reduces wasted retrieval budget at high strengths.
 
----
+### 2. fastembed Over sentence-transformers
 
-## 4) Repository Structure
-
-```text
-agents/
-  chat/                # Thinker + chat-mode executor + REPL
-  orchestrator/        # CLI, legacy runner, phase-5 pipeline, config, strength
-  planner/             # Domain registry and planner assets
-  report_manager/      # 3 manager proposal generation
-  report_lead/         # proposal synthesis to final tree
-  report_worker/       # section writing
-  retriever/           # phase-A retrieval planner and fanout
-  source_gate/         # source filtering
-  polish.py            # report polishing
-
-skills/
-  base.py
-  registry.py
-  skill_docs.py        # skill.md integration layer
-  tier1_retrieval/     # 18 retrieval skills
-  tier2_analysis/      # 18 analysis skills
-  tier3_output/        # 8 output skills
-
-tools/                 # external data connectors
-vector_store/          # Qdrant wrapper + embedding pipeline
-llm/                   # provider clients + router
-models/                # pydantic/dataclass model package
-context/               # context budgeting
-citations/             # citation registry
-render/                # HTML renderer
-trace/                 # trace logging
-```
+The original embedding pipeline used `sentence-transformers/all-MiniLM-L6-v2`, which pulls in PyTorch (~1.4 GB). On a t3-small with 2 GB RAM and six Docker containers sharing that memory, 1.4 GB of embedding library was a non-starter — the worker OOM'd after the planning phase.
 
----
+The swap to `fastembed` runs the same `all-MiniLM-L6-v2` model via ONNX Runtime instead of PyTorch. Same 384-dim embeddings, same cosine similarity quality, but the binary footprint dropped from ~1.4 GB to ~90 MB. All blocking ML and Qdrant operations also moved to a thread pool with timeouts, so the async event loop never starves during embedding or upsert calls.
 
-## 5) Architecture
+### 3. SSE Tokens Are Separate 30-Second JWTs
 
-### 5.1 Path A: Phase-5 Pipeline (primary)
+SSE endpoints accept authentication via a query parameter (`?token=...`) because `EventSource` in the browser doesn't support custom headers. Putting a long-lived access token (15 minutes) in a URL is a bad idea — URLs get logged, cached, and appear in browser history.
 
-- **Entry**: `agents/orchestrator/pipeline.py:run_pipeline`
-- **Order**:
-  1. **Phase B** planning (3 managers in parallel + 1 lead synthesis),
-  2. **Phase A** retrieval (tree-informed fanout + ingestion),
-  3. **Phase C** writing (bottom-up by tree depth),
-  4. **Phase D** polish (deterministic cleanup + LLM formatting).
+Instead, the API exposes `GET /api/v1/auth/sse-token` which issues a single-use JWT with `type=sse` and a 30-second expiry. The frontend calls this endpoint right before opening the `EventSource`, so the token in the URL is short-lived enough to be practically useless if leaked. The middleware stack skips Bearer validation on `/events` paths and validates the SSE token independently.
 
-### 5.2 Path B: Legacy DAG Orchestrator
+### 4. Redis Sorted-Set Sliding Window Instead of a Fixed Counter
 
-- **Entry**: `agents/orchestrator/runner.py:run_orchestrator`
-- **Loop**: plan DAG -> execute waves -> gap analysis -> optional replan loop.
+Rate limiting uses `ZREMBYRANGE` + `ZCARD` + `ZADD` in a single Redis pipeline transaction. Each request adds its timestamp to a sorted set keyed by user ID; expired entries are trimmed before counting.
 
----
+A naive fixed counter (INCR + EXPIRE) creates a problem at window boundaries — 60 requests at 0:59 followed by 60 requests at 1:01 passes the check despite 120 requests in two seconds. The sorted-set approach provides a true sliding window with no boundary artifacts, at the cost of slightly more Redis memory per user (one sorted set per rate-limited client).
 
-## 6) Execution Semantics
+The rate limiter also **fails open**: if Redis is unreachable, requests pass through with a logged warning instead of 500-ing every API call. This was chosen because a Redis outage shouldn't take down the entire application — degraded rate limiting is preferable to a full outage.
 
-### 6.1 Serialized segments
+### 5. Orphan Recovery on Worker Startup
 
-- `run_pipeline` phase boundaries are serialized: **B -> A -> C -> D**.
-- Lead synthesis happens after all manager proposals.
-- Phase C executes one depth level at a time (bottom-up).
+Background workers can crash mid-job (OOM, LLM timeout, deployment restart). When the new worker process starts, it runs `_recover_orphaned_jobs()` which queries for all jobs in `running` or `pending` state, marks them `failed`, and publishes `job_error` SSE events.
 
-### 6.2 Parallelized segments
+Without this, orphaned jobs sit in `running` state forever on the frontend — the progress spinner never stops, and the user has no way to know the job died. The recovery gives immediate feedback ("Worker process restarted unexpectedly. Please retry.") and frees the user's concurrency slot so they can submit again. The `pending` state was added to the recovery query after discovering that jobs queued right before a crash never transitioned to `running` but still blocked the user's concurrent job count.
 
-- Phase B managers run with `asyncio.gather`.
-- Phase A retrieval skills execute in parallel.
-- Nodes in the same Phase C depth execute in parallel.
-- Polisher runs section-level transformations in parallel.
+### 6. Pure BYOK — The Platform Never Pays for Inference
 
-### 6.3 Why this split
+Every LLM call in the pipeline uses the user's own API key and their chosen model. The worker resolves the user's BYOK key for the selected provider at job start, creates a single LLM client, and passes it to every agent — planner, managers, lead, workers, retriever, source gate, and polisher all use the same client.
 
-- **Serialization** at dependency boundaries gives deterministic progression.
-- **Parallelization** on independent units reduces wall-clock latency.
+This means the infrastructure cost is fixed at ~$26/month regardless of how many users run research or which models they choose. Users can pick from 10 models across 3 providers (xAI Grok, Google Gemini, DeepSeek), and their keys are Fernet-encrypted at rest in PostgreSQL, decrypted only at job runtime.
 
 ---
 
-## 7) Core Modules and Concepts
-
-Method used in each subsection:
-- what, why, concept family, chosen concept, why chosen, relevance, trade-offs.
-
-### 7.1 Agents
-
-- **What**: role-specialized orchestrators (`planner`, `retriever`, `manager`, `lead`, `worker`, `polisher`, `chat`).
-- **Why**: separate planning, retrieval, synthesis, and interaction concerns.
-- **Concept Family**: multi-agent orchestration / role decomposition.
-- **Chosen Concept**: hybrid architecture (phase-specialized path + retained legacy DAG path).
-- **Why Chosen**: preserve compatibility while advancing production flow.
-- **Relevance**: `run_pipeline` is explicit phase orchestration; `run_orchestrator` remains available.
-- **Trade-offs**:
-  - Positive: cleaner phase-level tuning.
-  - Negative: more prompts and coordination surfaces.
-
-### 7.2 Skills and Tier Breakdown
-
-- **What**: plugin layer with 44 concrete skills.
-  - Tier 1 Retrieval: 18
-  - Tier 2 Analysis: 18
-  - Tier 3 Output: 8
-- **Why**: modular composition of fetch -> reason -> render.
-- **Concept Family**: plugin registry + dynamic discovery + tiered processing.
-- **Chosen Concept**:
-  - `SkillBase.__init_subclass__` for class registration,
-  - `skills/registry.py` for instance registry and tier-1 derivation.
-- **Why Chosen**: avoids manual centralized skill list edits.
-- **Relevance**: planning/execution resolve through `SKILL_REGISTRY`; phase-A constrained by `TIER1_SKILLS`.
-- **Trade-offs**:
-  - Positive: additive extensibility.
-  - Negative: import-side-effect sensitivity.
-
-#### 7.2.1 Tier-1 retrieval skills (18)
-
-`academic_search`, `book_search`, `clinical_search`, `code_search`, `data_extraction`, `dataset_search`, `financial_search`, `forum_search`, `gov_search`, `legal_search`, `multimedia_search`, `news_archive`, `patent_search`, `pdf_deep_extract`, `social_search`, `standards_search`, `video_search`, `web_search`
-
-#### 7.2.2 Tier-2 analysis skills (18)
-
-`causal_analysis`, `citation_graph`, `claim_verification`, `comparative_analysis`, `contradiction_detect`, `credibility_score`, `entity_extraction`, `fallback_router`, `gap_analysis`, `hypothesis_gen`, `meta_analysis`, `quality_check`, `sentiment_cluster`, `statistical_analysis`, `synthesis`, `timeline_construct`, `translation`, `trend_analysis`
-
-#### 7.2.3 Tier-3 output skills (8)
-
-`annotation_gen`, `bibliography_gen`, `decision_matrix`, `exec_summary`, `explainer`, `knowledge_delta`, `report_generator`, `visualization_spec`
-
-### 7.3 Tools
-
-- **What**: external adapters under `tools/`: `arxiv_api`, `clinicaltrials`, `courtlistener`, `dataset_hub`, `github_api`, `google_books`, `pdf_reader`, `pubmed_api`, `sec_edgar`, `semantic_scholar`, `standards_fetch`, `translation`, `web_fetch`, `youtube_transcript`.
-- **Why**: isolate provider/API concerns from skill orchestration.
-- **Concept Family**: adapter layer with shared return contract.
-- **Chosen Concept**: standardized tool wrappers consumed by tier-1 skills.
-- **Why Chosen**: consistent metadata and failure handling surface.
-- **Relevance**: source ingestion path into Qdrant and report evidence.
-- **Trade-offs**:
-  - Positive: unified integration boundary.
-  - Negative: each adapter inherits provider fragility.
-
-### 7.4 CitationRegistry
-
-- **What**: `citations/registry.py` in-process source registry + bibliography formatting.
-- **Why**: preserve provenance and stable citation IDs.
-- **Concept Family**: registry + provenance tracking.
-- **Chosen Concept**: runtime citation-id generation with URL dedup support.
-- **Why Chosen**: stable source identity across retrieval/analysis/output.
-- **Relevance**: references and source lineage in report outputs.
-- **Trade-offs**:
-  - Positive: traceable source path.
-  - Negative: in-memory run scope, not a persistent citation DB.
-
-### 7.5 ContextBudgeting
-
-- **What**: `context/budget.py:ContextBudgetManager` for upstream context assembly.
-- **Why**: avoid context overflow while prioritizing direct dependencies.
-- **Concept Family**: budgeted context curation.
-- **Chosen Concept**: direct vs indirect tiers + total cap + truncation.
-- **Why Chosen**: bounded prompt size with dependency awareness.
-- **Relevance**: used by analysis/output skills via `ExecutionContext`.
-- **Trade-offs**:
-  - Positive: predictable upper bound and stable prompt shape.
-  - Negative: heuristic slot/dependency matching can be imperfect in edge naming.
-
-### 7.6 LLM Clients and Router
-
-- **What**: provider clients in `llm/` + `llm/router.py:get_llm_client`.
-- **Why**: central model-provider abstraction with low coupling.
-- **Concept Family**: factory + provider polymorphism.
-- **Chosen Concept**:
-  - `deepseek-*` -> `DeepSeekClient`
-  - `grok-*` -> `GrokClient`
-  - fallback -> `GeminiClient`
-- **Why Chosen**: minimal branching and explicit convention.
-- **Relevance**: chat and orchestration can change model IDs without business-logic rewrites.
-- **Trade-offs**:
-  - Positive: compact integration point.
-  - Negative: model-id naming conventions must remain consistent.
-
-### 7.7 Models and Contracts
-
-- **What**: `models/` package split:
-  - enums (`IssueType`, `NodeStatus`),
-  - plan (`PlanNode`, `Plan`, metadata),
-  - context (`ExecutionContext`),
-  - output contracts (`RetrievalOutput`, `AnalysisOutput`, `QualityReport`, `OutputDocument`, ...),
-  - chunk/storage (`DocumentChunk`, `CitationRecord`).
-- **Why**: typed inter-module contracts.
-- **Concept Family**: schema-first interfaces.
-- **Chosen Concept**: package split with compatibility re-exports in `models/__init__.py`.
-- **Why Chosen**: maintainability gain without breaking legacy imports.
-- **Relevance**: primary data boundary across all phases.
-- **Trade-offs**:
-  - Positive: clearer model ownership and typing.
-  - Negative: re-export indirection for backward compatibility.
-
-### 7.8 Vector Store
-
-- **What**: `vector_store/client.py:VectorStoreClient` for collection lifecycle, chunk ingest, search, topic cache, and TTL cleanup.
-- **Why**: retrieval-augmented section writing with chunk reuse.
-- **Concept Family**: RAG store abstraction + semantic cache.
-- **Chosen Concept**:
-  - default server mode (`QDRANT_URL`),
-  - explicit in-memory mode (`QDRANT_FORCE_IN_MEMORY=1` or constructor),
-  - lazy init + connection probe + explicit failure.
-- **Why Chosen**: explicit operational behavior in production, controlled fallback in dev.
-- **Relevance**: Phase A ingest and Phase C retrieval.
-- **Trade-offs**:
-  - Positive: clear lifecycle semantics and bounded cleanup.
-  - Negative: persistent mode adds infrastructure dependency.
+## Build & Run
 
----
+### Prerequisites
 
-## 8) Generalized Process Pipelines
+- Python 3.12+
+- Node.js 20+
+- Docker & Docker Compose
+- PostgreSQL 16, Redis 7 (provided via Docker Compose)
 
-### 8.1 Full report generation (Phase-5 primary)
+### Local Development
 
-1. Query classification + planning (managers + lead).
-2. Retrieval skill selection + query fanout.
-3. Tool fetch -> source gate -> chunking -> Qdrant ingest.
-4. Bottom-up section writing from semantic chunk retrieval.
-5. Assembly + references + polish.
-
-### 8.2 Legacy DAG execution
+```bash
+# Clone
+git clone https://github.com/hellonish/singularity.git
+cd singularity
 
-1. Planner builds DAG.
-2. Executor runs topological waves.
-3. Gap analyzer reports unresolved nodes.
-4. Optional replan loop.
-5. Output assembly.
+# Backend
+python3 -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt -r requirements_api.txt
 
-### 8.3 Chat loop
+# Frontend
+cd frontend && npm install && cd ..
 
-1. Thinker selects mode (`chat` or `research`) and step plan.
-2. Chat mode executes short step plan incrementally.
-3. Research mode delegates to `run_pipeline`.
+# Environment
+cp .env.example .env
+# Fill in your API keys, OAuth credentials, etc.
 
----
+# Start infrastructure
+docker compose up postgres redis -d
 
-## 9) Scale and Cost Approximation
+# Run database migrations
+alembic upgrade head
 
-This section is approximation-only and assumption-explicit.
+# Start API server
+uvicorn api.main:app --reload --port 8000
 
-### 9.1 Strength scaling (from `StrengthConfig`)
+# Start background worker (separate terminal)
+python -m workers.main
 
-| Strength | Retrieval Skills | Queries/Skill | Retrieval Calls | Section Range | Expected LLM Calls |
-|---|---:|---:|---:|---:|---:|
-| 1 | 2 | 4 | 8 | 6-10 | 29 |
-| 3 | 5 | 4 | 20 | 18-30 | 77 |
-| 5 | 9 | 6 | 54 | 30-50 | 125 |
-| 7 | 12 | 8 | 96 | 42-70 | 173 |
-| 10 | 18 | 10 | 180 | 60-100 | 325 |
+# Start frontend (separate terminal)
+cd frontend && npm run dev
+```
 
-Additional strength-controlled parameters:
-- `min_results_per_query`: 5 -> 20
-- `min_chunks_per_leaf`: 3 -> 10
-- augmentation iterations: 2 -> 4
-- max web escalations: 1 -> 3
+### Production Deployment (AWS EC2)
 
-### 9.2 Approximate spend model
+```bash
+# On a fresh t3-small:
+git clone https://github.com/hellonish/singularity.git
+cd singularity
+cp .env.production .env
+# Edit .env with production values
+
+# Deploy
+bash deploy/deploy.sh
+docker compose -f docker-compose.prod.yml up -d
+```
 
-Current config comment rates:
-- `grok-3-mini`: $0.25 / 1M input, $0.50 / 1M output
-- `grok-3`: $3.00 / 1M input, $15.00 / 1M output
+The deploy script handles Docker Hub image pulls, environment validation, and volume setup. Production uses pre-built images — no on-server compilation.
 
-Assumptions for rough planning:
-- mini call: 2k input + 600 output tokens
-- grok-3 write/lead call: 5k input + 1.2k output tokens
-- call mix shifts toward worker calls with larger section counts
+### CLI (without the web app)
 
-Implication:
-- total run cost increases roughly with retrieval fanout + section count + augmentation loops; at high strengths, growth is multiplicative across these dimensions.
+```bash
+# Phase-5 pipeline (primary)
+python -m agents.orchestrator.cli "your research question" --strength 5 --audience expert
 
----
+# With execution trace export
+python -m agents.orchestrator.cli "your question" --strength 7 --trace
 
-## 10) Version Timeline (V1 -> V2)
-
-### 10.1 V1: Legacy DAG (`run_orchestrator`)
-
-- **Problem Identified**: need multi-step research with retry and replan behavior.
-- **Concepts Used**: DAG planning, wave execution, fallback routing, gap analysis.
-- **Possible Decisions**:
-  - linear chain,
-  - static DAG,
-  - DAG + replan loop.
-- **Final Decision**: DAG + replan loop.
-- **Reasoning**: stronger resilience than one-shot/linear execution.
-- **Strengths**:
-  - robust recovery path,
-  - explicit dependency graph.
-- **Weaknesses**:
-  - increased control-loop complexity.
-
-### 10.2 V2: Phase-5 Pipeline (`run_pipeline`, primary)
-
-- **Problem Identified**: retrieval quality drift when evidence collection is not tied to finalized section structure.
-- **Concepts Used**: phase separation, parallel proposal synthesis, tree-informed retrieval, bottom-up writing, deterministic + creative polish.
-- **Possible Decisions**:
-  - retrieval-first,
-  - planning-first with tree-informed retrieval,
-  - one-shot end-to-end generation.
-- **Final Decision**: planning-first -> retrieval -> writing -> polish.
-- **Reasoning**: section-targeted retrieval improves relevance and reduces wasted calls.
-- **Strengths**:
-  - clearer production flow,
-  - stronger section-evidence alignment,
-  - explicit cost/quality dial.
-- **Weaknesses**:
-  - more LLM calls,
-  - broader orchestration surface.
+# Interactive chat REPL
+python -m agents.chat.cli
+python -m agents.chat.cli --model grok-3
+```
 
 ---
 
-## 11) Decision Register
-
-### 11.1 Skill documentation integration (`skill.md`)
+## The Cost Model
 
-- **Hypothesis**: planner/thinker quality improves when skill selection context is generated from `skill.md` instead of hardcoded summaries.
-- **Support**: `skills/skill_docs.py` parses all `skill.md` files and feeds:
-  - compact `USE/NOT` menu to thinker,
-  - full skill contracts to report managers.
-- **Limitations**: markdown parsing remains heuristic and depends on content consistency.
-- **Strengths**: documentation-as-source-of-truth, lower drift.
-- **Decision Reasoning**: reduces manual menu maintenance and keeps prompts aligned with authored skill docs.
+| Resource | Monthly Cost |
+|----------|-------------|
+| AWS t3-small (2 vCPU, 2 GB RAM) | ~$25 |
+| Qdrant Cloud (free tier) | $0 |
+| Domain + SSL (Caddy / Let's Encrypt) | ~$1 |
+| LLM inference | **$0** (users bring their own keys) |
+| **Total** | **~$26/month** |
 
-### 11.2 Dynamic skill registration trigger
+Six Docker containers share 2 GB of RAM with hard memory caps:
 
-- **Hypothesis**: `__init_subclass__` + tier imports reduce registration boilerplate.
-- **Support**: `skills/base.py` and `skills/registry.py` auto-register and instantiate skill classes.
-- **Limitations**: import ordering/side effects still matter.
-- **Strengths**: no central list edits per skill.
-- **Decision Reasoning**: maintainability gain at current scale outweighs import-side complexity.
+| Container | Memory | Notes |
+|-----------|--------|-------|
+| Caddy | 64 MB | Reverse proxy + auto-SSL |
+| PostgreSQL 16 | 256 MB | Tuned: shared_buffers=64MB, work_mem=2MB |
+| Redis 7 | 128 MB | LRU eviction at 96MB, AOF persistence |
+| FastAPI API | 256 MB | Async throughout (asyncpg, aiohttp, arq) |
+| ARQ Worker | 1 GB | Embedding model + LLM call headroom |
+| Next.js | 128 MB | Standalone build output |
 
-### 11.3 Qdrant connection policy
-
-- **Hypothesis**: silent in-memory fallback masks production misconfiguration.
-- **Support**: current vector client raises explicit connection errors unless in-memory mode is explicitly requested.
-- **Limitations**: stricter startup behavior requires environment discipline.
-- **Strengths**: clearer operational failure modes.
-- **Decision Reasoning**: explicit failure is preferred over hidden degraded persistence.
+**Total committed: ~1.8 GB** on a 2 GB box. Every container has healthchecks. The system runs 4 concurrent research jobs globally, 2 per user, with a 30-minute timeout per job.
 
 ---
 
-## 12) Practical Command Surface
-
-### 12.1 Environment setup
+## What's Inside
 
-```bash
-python3 -m venv .venv
-source .venv/bin/activate
-pip install -r requirements.txt
-```
-
-### 12.2 Phase-5 pipeline
-
-```bash
-python -m agents.orchestrator.cli "research question" --strength 5 --audience expert
-```
-
-### 12.3 Legacy DAG
-
-```bash
-python -m agents.orchestrator.cli "research question" --depth standard
-```
-
-### 12.4 Chat REPL
-
-```bash
-python -m agents.chat.cli
-python -m agents.chat.cli --extended
-python -m agents.chat.cli --model grok-3
-```
-
-### 12.5 Trace export
-
-```bash
-python -m agents.orchestrator.cli "research question" --strength 7 --trace
-```
+| | |
+|---|---|
+| Backend | 19,000 lines of Python across 207 files |
+| Frontend | 6,500 lines of TypeScript across 42 files |
+| Skills | 44 pluggable (18 retrieval + 18 analysis + 8 output) |
+| Data connectors | 14 tools (ArXiv, PubMed, GitHub, SEC EDGAR, YouTube, etc.) |
+| LLM providers | 3 (xAI Grok, Google Gemini, DeepSeek) — 10 models |
+| API endpoints | 31 REST + SSE endpoints |
+| Database | 8 tables, 12 indexes (including 1 partial unique), 5 migrations |
+| Middleware | 5 layers: RequestID → Auth → RateLimit → UsageEmitter → CORS |
+| Build time | 13 days, 65 commits, 1 engineer |
 
 ---
 
-## 13) Notes
+## Documentation
 
-- `ARCHITECTURE.md` remains the deeper companion reference.
-- This README intentionally avoids future-state claims and documents implemented behavior only.
+- [ARCHITECTURE.md](ARCHITECTURE.md) — current-state architecture reference
+- [ENGINEERING_SHOWCASE.md](ENGINEERING_SHOWCASE.md) — full engineering breakdown with metrics
+- [docs/PLATFORM_DEVELOPMENT_GUIDE.md](docs/PLATFORM_DEVELOPMENT_GUIDE.md) — comprehensive development guide
diff --git a/SKILLS/tier1_retrieval/base.py b/SKILLS/tier1_retrieval/base.py
index 3306df6..623d536 100644
--- a/SKILLS/tier1_retrieval/base.py
+++ b/SKILLS/tier1_retrieval/base.py
@@ -126,8 +126,9 @@ async def _fetch_and_filter(q: str) -> list[tuple[dict, str]]:
                 return []
 
             if original_query:
+                import asyncio as _asyncio
                 from agents.source_gate import pass1_filter
-                sources = pass1_filter(sources, original_query)
+                sources = await _asyncio.to_thread(pass1_filter, sources, original_query)
 
             return [(src, q) for src in sources]
 
@@ -160,25 +161,34 @@ async def _fetch_and_filter(q: str) -> list[tuple[dict, str]]:
         total_sources = 0
         all_sources: list[dict] = []
 
-        for src, q in final_pairs:
-            text = src.get("content", "") or src.get("snippet", "") or src.get("abstract", "")
-            if not text:
-                continue
-            base_cred    = src.get("credibility_base", 0.7)
-            adjusted_cred = _adjust_credibility(src.get("url", ""), base_cred)
-            chunks = vs.ingest_text(
-                collection_name=collection_name,
-                text=text,
-                run_id=run_id,
-                source_url=src.get("url", ""),
-                source_title=src.get("title", "Unknown"),
-                credibility=adjusted_cred,
-                skill=self.name,
-                query=q,
-            )
-            total_chunks  += len(chunks)
-            total_sources += 1
-            all_sources.append(src)
+        def _ingest_all_sync() -> list[tuple[int, dict]]:
+            """Run all ingestion sequentially in one thread to avoid blocking the event loop."""
+            results = []
+            for src, q in final_pairs:
+                text = src.get("content", "") or src.get("snippet", "") or src.get("abstract", "")
+                if not text:
+                    continue
+                base_cred     = src.get("credibility_base", 0.7)
+                adjusted_cred = _adjust_credibility(src.get("url", ""), base_cred)
+                chunks = vs.ingest_text(
+                    collection_name=collection_name,
+                    text=text,
+                    run_id=run_id,
+                    source_url=src.get("url", ""),
+                    source_title=src.get("title", "Unknown"),
+                    credibility=adjusted_cred,
+                    skill=self.name,
+                    query=q,
+                )
+                results.append((len(chunks), src))
+            return results
+
+        ingest_results = await asyncio.to_thread(_ingest_all_sync)
+        for chunk_count, src in ingest_results:
+            if chunk_count > 0:
+                total_chunks  += chunk_count
+                total_sources += 1
+                all_sources.append(src)
 
         # Register citations
         cit_reg = getattr(ctx, "citation_registry", None)
diff --git a/agents/orchestrator/pipeline.py b/agents/orchestrator/pipeline.py
index d740e6f..f45b4d1 100644
--- a/agents/orchestrator/pipeline.py
+++ b/agents/orchestrator/pipeline.py
@@ -702,7 +702,14 @@ async def run_pipeline(
     )
 
     # ── Topic cache check (triggers lazy Qdrant init + server probe) ──
-    cached_run_id = vs.find_cached_run(query)   # returns None if in-memory or no hit
+    try:
+        cached_run_id = await asyncio.wait_for(
+            asyncio.to_thread(vs.find_cached_run, query),
+            timeout=30.0,
+        )
+    except Exception:
+        logger.warning("[Pipeline] Topic cache lookup failed — treating as cache miss", exc_info=True)
+        cached_run_id = None
 
     if cached_run_id:
         logger.info("\n[Cache HIT] Reusing collection from run %s", cached_run_id)
diff --git a/agents/retriever/retriever.py b/agents/retriever/retriever.py
index 67bc543..a4cfc43 100644
--- a/agents/retriever/retriever.py
+++ b/agents/retriever/retriever.py
@@ -300,7 +300,13 @@ async def _run_skill(skill_name: str, queries: list[str]) -> None:
         await asyncio.gather(*[
             _run_skill(name, queries) for name, queries in skill_queries.items()
         ])
-        self.vs.register_run_in_cache(run_id, query)
+        try:
+            await asyncio.wait_for(
+                asyncio.to_thread(self.vs.register_run_in_cache, run_id, query),
+                timeout=30.0,
+            )
+        except Exception:
+            logger.warning("[Retriever] register_run_in_cache failed — skipping cache update", exc_info=True)
         return active_skills
 
     # ------------------------------------------------------------------
diff --git a/api/main.py b/api/main.py
index 51f652b..c99e9d5 100644
--- a/api/main.py
+++ b/api/main.py
@@ -89,9 +89,11 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
 # ---------------------------------------------------------------------------
 
 # Innermost (runs closest to route handlers)
+# Execution order (reverse of add order): RequestID → Auth → RateLimit → UsageEmitter
+# Auth must run before RateLimit so rate limiting uses per-user keys, not shared IP.
 app.add_middleware(UsageEmitterMiddleware)
-app.add_middleware(AuthMiddleware)
 app.add_middleware(RateLimitMiddleware)
+app.add_middleware(AuthMiddleware)
 app.add_middleware(RequestIDMiddleware)
 
 # CORS — must wrap everything.
diff --git a/db/migrations/versions/0004_research_job_llm_model_id.py b/db/migrations/versions/0004_research_job_llm_model_id.py
index 7b85e83..6c629db 100644
--- a/db/migrations/versions/0004_research_job_llm_model_id.py
+++ b/db/migrations/versions/0004_research_job_llm_model_id.py
@@ -10,7 +10,7 @@
 
 
 revision = "0004"
-down_revision = "0003"
+down_revision = "0003_user_llm"
 branch_labels = None
 depends_on = None
 
diff --git a/db/session.py b/db/session.py
index c260c17..3c99f53 100644
--- a/db/session.py
+++ b/db/session.py
@@ -8,8 +8,10 @@
 
 engine = create_async_engine(
     settings.database_url,
-    pool_size=10,
-    max_overflow=20,
+    pool_size=5,
+    max_overflow=10,
+    pool_pre_ping=True,
+    pool_recycle=300,
     echo=False,
     future=True,
 )
diff --git a/deploy/Caddyfile b/deploy/Caddyfile
new file mode 100644
index 0000000..e834abc
--- /dev/null
+++ b/deploy/Caddyfile
@@ -0,0 +1,46 @@
+# Caddy reverse proxy configuration
+# {$DOMAIN} is read from the DOMAIN env variable in .env
+# Caddy automatically provisions SSL certs via Let's Encrypt.
+
+{$DOMAIN:localhost} {
+    # NextAuth routes — MUST be before /api/* to avoid being caught by FastAPI
+    handle /api/auth/* {
+        reverse_proxy frontend:3000
+    }
+
+    # FastAPI backend routes
+    handle /api/* {
+        reverse_proxy api:8000
+    }
+
+    # Health check endpoints
+    handle /api/health {
+        reverse_proxy api:8000
+    }
+
+    handle /api/ready {
+        reverse_proxy api:8000
+    }
+
+    # Everything else → Next.js frontend
+    handle {
+        reverse_proxy frontend:3000
+    }
+
+    # Compression
+    encode gzip zstd
+
+    # Security headers
+    header {
+        X-Content-Type-Options nosniff
+        X-Frame-Options DENY
+        X-XSS-Protection "1; mode=block"
+        Referrer-Policy strict-origin-when-cross-origin
+    }
+
+    # Logging
+    log {
+        output stdout
+        format console
+    }
+}
diff --git a/deploy/deploy.sh b/deploy/deploy.sh
new file mode 100755
index 0000000..70a9e06
--- /dev/null
+++ b/deploy/deploy.sh
@@ -0,0 +1,100 @@
+#!/usr/bin/env bash
+# ──────────────────────────────────────────────────────────────────────────────
+# Singularity — first-time deployment script for AWS EC2 (Ubuntu 24.04)
+# Run on the server: bash deploy/deploy.sh
+# ──────────────────────────────────────────────────────────────────────────────
+set -euo pipefail
+
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+RED='\033[0;31m'
+NC='\033[0m'
+
+info()  { echo -e "${GREEN}[INFO]${NC} $*"; }
+warn()  { echo -e "${YELLOW}[WARN]${NC} $*"; }
+error() { echo -e "${RED}[ERROR]${NC} $*"; exit 1; }
+
+# ── Preflight checks ─────────────────────────────────────────────────────────
+info "Running preflight checks..."
+
+[ -f .env ] || error ".env file not found. Copy .env.production to .env and fill in values."
+grep -q 'YOUR_DOMAIN' .env 2>/dev/null && error "Replace YOUR_DOMAIN in .env with your actual domain."
+grep -qE '^POSTGRES_PASSWORD=\S' .env || error "POSTGRES_PASSWORD must be set in .env."
+grep -qE '^JWT_SECRET=\S' .env || error "JWT_SECRET must be set in .env."
+grep -qE '^NEXTAUTH_SECRET=\S' .env || error "NEXTAUTH_SECRET must be set in .env."
+grep -qE '^NEXTAUTH_URL=\S' .env || error "NEXTAUTH_URL must be set in .env."
+grep -qE '^GROK_API_KEY=\S' .env || warn "GROK_API_KEY not set — you need at least one LLM provider key."
+
+info "All checks passed."
+
+# ── Install Docker if missing ────────────────────────────────────────────────
+if ! command -v docker &>/dev/null; then
+    info "Installing Docker..."
+    curl -fsSL https://get.docker.com | bash
+    sudo usermod -aG docker "$USER"
+    info "Docker installed. You may need to log out and back in for group changes."
+    info "Then re-run this script."
+    exit 0
+fi
+
+# ── Install docker-compose plugin if missing ─────────────────────────────────
+if ! docker compose version &>/dev/null; then
+    info "Installing Docker Compose plugin..."
+    sudo apt-get update -qq
+    sudo apt-get install -y -qq docker-compose-plugin
+fi
+
+# ── Add swap for t3.micro (1GB RAM) ─────────────────────────────────────────
+if [ "$(free -m | awk '/^Swap:/{print $2}')" -lt 1000 ]; then
+    info "Adding 2GB swap file (needed for t3.micro)..."
+    sudo fallocate -l 2G /swapfile
+    sudo chmod 600 /swapfile
+    sudo mkswap /swapfile
+    sudo swapon /swapfile
+    echo '/swapfile none swap sw 0 0' | sudo tee -a /etc/fstab
+    sudo sysctl vm.swappiness=10
+    echo 'vm.swappiness=10' | sudo tee -a /etc/sysctl.conf
+    info "Swap configured."
+fi
+
+# ── Configure firewall ───────────────────────────────────────────────────────
+info "Configuring firewall..."
+sudo ufw --force enable
+sudo ufw allow 22/tcp    # SSH
+sudo ufw allow 80/tcp    # HTTP
+sudo ufw allow 443/tcp   # HTTPS
+info "Firewall configured (22, 80, 443 open)."
+
+# ── Build and start ──────────────────────────────────────────────────────────
+info "Building containers (this takes a few minutes on first run)..."
+docker compose -f docker-compose.prod.yml build
+
+info "Starting services..."
+docker compose -f docker-compose.prod.yml up -d
+
+# ── Wait for health checks ──────────────────────────────────────────────────
+info "Waiting for services to become healthy..."
+sleep 15
+
+if docker compose -f docker-compose.prod.yml ps | grep -q "unhealthy"; then
+    error "Some services are unhealthy. Check: docker compose -f docker-compose.prod.yml ps"
+fi
+
+info "Running database migrations..."
+docker compose -f docker-compose.prod.yml exec api alembic upgrade head || \
+    warn "Migration failed or alembic not configured. Check manually."
+
+# ── Done ─────────────────────────────────────────────────────────────────────
+DOMAIN=$(grep -E '^DOMAIN=' .env | cut -d= -f2)
+info "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+info "Deployment complete!"
+info "App: https://${DOMAIN}"
+info ""
+info "Useful commands:"
+info "  View logs:      docker compose -f docker-compose.prod.yml logs -f"
+info "  Restart:        docker compose -f docker-compose.prod.yml restart"
+info "  Stop:           docker compose -f docker-compose.prod.yml down"
+info "  Update & redeploy:"
+info "    git pull && docker compose -f docker-compose.prod.yml build"
+info "    docker compose -f docker-compose.prod.yml up -d"
+info "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml
new file mode 100644
index 0000000..05d3190
--- /dev/null
+++ b/docker-compose.prod.yml
@@ -0,0 +1,164 @@
+# Production Docker Compose for AWS EC2 t3.micro (1GB RAM)
+# Uses Qdrant Cloud instead of local Qdrant to save ~42MB RAM
+# Deploy with: docker compose -f docker-compose.prod.yml up -d
+
+services:
+  # ── Reverse proxy with auto-SSL ────────────────────────────────────────────
+  caddy:
+    image: caddy:2-alpine
+    restart: unless-stopped
+    environment:
+      - DOMAIN=${DOMAIN:-localhost}
+    ports:
+      - "80:80"
+      - "443:443"
+    volumes:
+      - ./deploy/Caddyfile:/etc/caddy/Caddyfile:ro
+      - caddy_data:/data
+      - caddy_config:/config
+    depends_on:
+      api:
+        condition: service_started
+      frontend:
+        condition: service_started
+    deploy:
+      resources:
+        limits:
+          memory: 64M
+
+  # ── Database ───────────────────────────────────────────────────────────────
+  postgres:
+    image: postgres:16-alpine
+    restart: unless-stopped
+    environment:
+      POSTGRES_DB: singularity
+      POSTGRES_USER: ${POSTGRES_USER:-singularity}
+      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:?Set POSTGRES_PASSWORD in .env}
+    volumes:
+      - postgres_data:/var/lib/postgresql/data
+    expose:
+      - "5432"
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-singularity}"]
+      interval: 5s
+      timeout: 5s
+      retries: 10
+    deploy:
+      resources:
+        limits:
+          memory: 256M
+    command: >
+      postgres
+      -c shared_buffers=64MB
+      -c effective_cache_size=128MB
+      -c work_mem=2MB
+      -c maintenance_work_mem=32MB
+      -c max_connections=50
+
+  # ── Cache & Queue ──────────────────────────────────────────────────────────
+  redis:
+    image: redis:7-alpine
+    restart: unless-stopped
+    expose:
+      - "6379"
+    healthcheck:
+      test: ["CMD", "redis-cli", "ping"]
+      interval: 5s
+      timeout: 5s
+      retries: 10
+    deploy:
+      resources:
+        limits:
+          memory: 128M
+    command: >
+      redis-server
+      --maxmemory 96mb
+      --maxmemory-policy allkeys-lru
+      --save 60 1000
+      --appendonly yes
+
+  # ── API Server ─────────────────────────────────────────────────────────────
+  api:
+    image: ${DOCKERHUB_USER:-singularity}/singularity-api:latest
+    restart: unless-stopped
+    expose:
+      - "8000"
+    depends_on:
+      postgres:
+        condition: service_healthy
+      redis:
+        condition: service_healthy
+    env_file:
+      - .env
+    environment:
+      - DATABASE_URL=postgresql+asyncpg://${POSTGRES_USER:-singularity}:${POSTGRES_PASSWORD}@postgres:5432/singularity
+      - REDIS_URL=redis://redis:6379
+      - ENVIRONMENT=production
+    healthcheck:
+      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/api/health')"]
+      interval: 30s
+      timeout: 5s
+      retries: 5
+      start_period: 20s
+    deploy:
+      resources:
+        limits:
+          memory: 256M
+
+  # ── Background Worker ──────────────────────────────────────────────────────
+  worker:
+    image: ${DOCKERHUB_USER:-singularity}/singularity-api:latest
+    restart: unless-stopped
+    depends_on:
+      postgres:
+        condition: service_healthy
+      redis:
+        condition: service_healthy
+    env_file:
+      - .env
+    environment:
+      - DATABASE_URL=postgresql+asyncpg://${POSTGRES_USER:-singularity}:${POSTGRES_PASSWORD}@postgres:5432/singularity
+      - REDIS_URL=redis://redis:6379
+      - ENVIRONMENT=production
+    command: python -m workers.main
+    healthcheck:
+      test: ["CMD", "python", "-c", "import redis; r=redis.from_url('redis://redis:6379'); r.ping()"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+      start_period: 10s
+    deploy:
+      resources:
+        limits:
+          memory: 1G
+
+  # ── Frontend ───────────────────────────────────────────────────────────────
+  frontend:
+    image: ${DOCKERHUB_USER:-singularity}/singularity-frontend:latest
+    restart: unless-stopped
+    expose:
+      - "3000"
+    env_file:
+      - .env
+    environment:
+      - NEXTAUTH_URL=${NEXTAUTH_URL}
+      - NEXTAUTH_SECRET=${NEXTAUTH_SECRET}
+      - NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL}
+      - INTERNAL_API_URL=http://api:8000
+      - GOOGLE_CLIENT_ID=${GOOGLE_CLIENT_ID}
+      - GOOGLE_CLIENT_SECRET=${GOOGLE_CLIENT_SECRET}
+    healthcheck:
+      test: ["CMD", "node", "-e", "fetch('http://localhost:3000').then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+      start_period: 10s
+    deploy:
+      resources:
+        limits:
+          memory: 128M
+
+volumes:
+  postgres_data:
+  caddy_data:
+  caddy_config:
diff --git a/docker-compose.test.yml b/docker-compose.test.yml
new file mode 100644
index 0000000..28f5b1e
--- /dev/null
+++ b/docker-compose.test.yml
@@ -0,0 +1,112 @@
+# Test compose — simulates t3.micro memory limits locally
+# Usage: docker compose -f docker-compose.test.yml up -d && docker stats --no-stream
+
+services:
+  postgres:
+    image: postgres:16-alpine
+    environment:
+      POSTGRES_DB: singularity
+      POSTGRES_USER: singularity
+      POSTGRES_PASSWORD: loremipsum
+    ports:
+      - "5432:5432"
+    command: >
+      postgres
+      -c shared_buffers=64MB
+      -c effective_cache_size=128MB
+      -c work_mem=2MB
+      -c max_connections=50
+    deploy:
+      resources:
+        limits:
+          memory: 256M
+
+  redis:
+    image: redis:7-alpine
+    ports:
+      - "6379:6379"
+    command: >
+      redis-server
+      --maxmemory 96mb
+      --maxmemory-policy allkeys-lru
+    deploy:
+      resources:
+        limits:
+          memory: 128M
+
+  qdrant:
+    image: qdrant/qdrant:latest
+    ports:
+      - "6333:6333"
+    deploy:
+      resources:
+        limits:
+          memory: 256M
+
+  api:
+    image: hellonish/singularity-api:latest
+    ports:
+      - "8000:8000"
+    depends_on:
+      - postgres
+      - redis
+      - qdrant
+    env_file:
+      - .env
+    environment:
+      - DATABASE_URL=postgresql+asyncpg://singularity:loremipsum@postgres:5432/singularity
+      - REDIS_URL=redis://redis:6379/0
+      - QDRANT_URL=http://qdrant:6333
+      - ENVIRONMENT=production
+    healthcheck:
+      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/api/health')"]
+      interval: 15s
+      timeout: 5s
+      retries: 5
+      start_period: 15s
+    deploy:
+      resources:
+        limits:
+          memory: 256M
+
+  worker:
+    image: hellonish/singularity-api:latest
+    depends_on:
+      - postgres
+      - redis
+      - qdrant
+    env_file:
+      - .env
+    environment:
+      - DATABASE_URL=postgresql+asyncpg://singularity:loremipsum@postgres:5432/singularity
+      - REDIS_URL=redis://redis:6379/0
+      - QDRANT_URL=http://qdrant:6333
+      - ENVIRONMENT=production
+    command: python -m workers.main
+    healthcheck:
+      test: ["CMD", "python", "-c", "import redis; r=redis.from_url('redis://redis:6379/0'); r.ping()"]
+      interval: 15s
+      timeout: 5s
+      retries: 3
+      start_period: 10s
+    deploy:
+      resources:
+        limits:
+          memory: 256M
+
+  frontend:
+    image: hellonish/singularity-frontend:latest
+    ports:
+      - "3000:3000"
+    env_file:
+      - .env
+    healthcheck:
+      test: ["CMD", "node", "-e", "fetch('http://localhost:3000').then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))"]
+      interval: 15s
+      timeout: 5s
+      retries: 3
+      start_period: 10s
+    deploy:
+      resources:
+        limits:
+          memory: 128M
diff --git a/frontend/next.config.ts b/frontend/next.config.ts
index 66e1566..fdfc499 100644
--- a/frontend/next.config.ts
+++ b/frontend/next.config.ts
@@ -1,7 +1,7 @@
 import type { NextConfig } from "next";
 
 const nextConfig: NextConfig = {
-  /* config options here */
+  output: "standalone",
   reactCompiler: true,
 };
 
diff --git a/frontend/src/app/dashboard/page.tsx b/frontend/src/app/dashboard/page.tsx
index ce44743..ee0ca61 100644
--- a/frontend/src/app/dashboard/page.tsx
+++ b/frontend/src/app/dashboard/page.tsx
@@ -19,6 +19,7 @@ import { llmModelGroupsFromCatalog } from "@/lib/llm_model_groups";
 import { ChatModelPicker } from "@/components/chat/ChatModelPicker";
 import { showDebugMockResearchControls } from "@/lib/debug_research_mock";
 import { cn } from "@/lib/cn";
+import { ApiError } from "@/lib/api";
 import {
   formatRelative,
   truncateDisplayLabel,
@@ -299,6 +300,8 @@ export default function DashboardPage() {
       ? truncateDisplayLabel(query.trim(), 28)
       : "Research";
 
+  const [createJobError, setCreateJobError] = useState<string | null>(null);
+
   const createJobMutation = useMutation({
     mutationFn: () =>
       jobsApi.create(
@@ -309,11 +312,21 @@ export default function DashboardPage() {
         barModelId,
       ),
     onSuccess: (job) => {
+      setCreateJobError(null);
       setDashThreadId(null);
       setDashLaunch(null);
       setQuery("");
       router.push(`/reports/${job.report_id}?job=${job.job_id}`);
     },
+    onError: (err) => {
+      const message =
+        err instanceof ApiError && err.status === 429
+          ? "Too many requests — please wait a moment and try again."
+          : err instanceof ApiError
+            ? err.message || "Failed to start research job."
+            : "Failed to start research job.";
+      setCreateJobError(message);
+    },
   });
 
   const deleteReportMutation = useMutation({
@@ -680,6 +693,9 @@ export default function DashboardPage() {
             </button>
           </div>
         </form>
+        {createJobError && (
+          <p className="mt-2 text-center text-xs text-red-600">{createJobError}</p>
+        )}
               </div>
             </div>
           )}
diff --git a/frontend/src/app/reports/[id]/page.tsx b/frontend/src/app/reports/[id]/page.tsx
index ff379e2..1d5d6b1 100644
--- a/frontend/src/app/reports/[id]/page.tsx
+++ b/frontend/src/app/reports/[id]/page.tsx
@@ -578,6 +578,8 @@ function ReportJobProgress({
   }, [isRunning, startedAt]);
 
   const runningFor = isRunning ? formatRunningFor(startedAt) : null;
+  const elapsedMs = startedAt ? Date.now() - new Date(startedAt).getTime() : 0;
+  const isStuck = isRunning && elapsedMs > 10 * 60 * 1000;
   const liveLine = phaseStoryboardContext(phase);
 
   return (
@@ -770,6 +772,42 @@ function ReportJobProgress({
             </div>
           )}
 
+          {isStuck && (
+            <div
+              style={{
+                marginTop: 24,
+                borderRadius: 8,
+                background: "rgba(234,179,8,0.06)",
+                border: "1px solid rgba(234,179,8,0.25)",
+                padding: "12px 14px",
+              }}
+            >
+              <div style={{ fontFamily: "var(--mono)", fontSize: 11, color: "#92400e", marginBottom: 4, fontWeight: 500 }}>
+                Taking longer than expected
+              </div>
+              <div style={{ fontFamily: "var(--mono)", fontSize: 11, color: "#b45309", lineHeight: 1.5 }}>
+                The job has been running for {runningFor} with no completion. It may be stuck. You can wait or go back and retry.
+              </div>
+              <button
+                type="button"
+                onClick={onBack}
+                style={{
+                  marginTop: 10,
+                  fontFamily: "var(--mono)",
+                  fontSize: 11,
+                  color: "#374151",
+                  background: "white",
+                  border: "1px solid #e5e7eb",
+                  borderRadius: 6,
+                  padding: "5px 12px",
+                  cursor: "pointer",
+                }}
+              >
+                ← Back to Dashboard
+              </button>
+            </div>
+          )}
+
           {!isFailed && (status === "pending" || status === "running") && activeIdx === -1 && (
             <div
               style={{
diff --git a/requirements.txt b/requirements.txt
index 25e04a6..cc7377e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,43 +1,40 @@
 # ── Core ──────────────────────────────────────────────────────────────────────
-python-dotenv>=1.0.0
-pydantic>=2.7.0
+python-dotenv==1.0.0
+pydantic==2.10.6
 
 # ── LLM clients ───────────────────────────────────────────────────────────────
-openai>=1.0.0            # GrokClient (xAI via OpenAI-compatible SDK)
-google-genai>=1.0.0      # GeminiClient
-deepseek>=1.0.0          # DeepSeekClient
+openai==1.109.1            # GrokClient (xAI via OpenAI-compatible SDK)
+google-genai==1.63.0       # GeminiClient
+deepseek==1.0.0            # DeepSeekClient
 
 # ── Tools — HTTP ──────────────────────────────────────────────────────────────
-aiohttp>=3.9.0           # async HTTP for all REST-based tools
-requests>=2.31.0         # sync HTTP fallback
+aiohttp==3.12.0            # async HTTP for all REST-based tools
+requests==2.32.3           # sync HTTP fallback
 
 # ── Tools — Web search ────────────────────────────────────────────────────────
-ddgs>=6.0.0               # WebFetchTool + StandardsFetchTool (free, no key)
-tavily-python>=0.3.0      # WebFetchTool fallback (TAVILY_API_KEY)
-
-# ── Tools — Translation ────────────────────────────────────────────────────────
-# TranslationTool primary: MyMemory API (free, no key, uses aiohttp — already listed)
-# TranslationTool fallback: Google Translate (GOOGLE_TRANSLATE_API_KEY)
+ddgs==9.11.4               # WebFetchTool + StandardsFetchTool (free, no key)
+tavily-python==0.7.3       # WebFetchTool fallback (TAVILY_API_KEY)
 
 # ── Tools — Academic ──────────────────────────────────────────────────────────
-arxiv>=2.1.0              # ArxivTool (free, no key)
-biopython>=1.84           # PubMedTool via NCBI Entrez (free, needs NCBI_EMAIL)
+arxiv==2.2.0               # ArxivTool (free, no key)
+biopython==1.86            # PubMedTool via NCBI Entrez (free, needs NCBI_EMAIL)
 
 # ── Tools — Code & data ───────────────────────────────────────────────────────
-PyGithub>=2.3.0           # GitHubTool (GITHUB_TOKEN optional)
-huggingface-hub>=0.23.0   # DatasetHubTool (free, no key)
+PyGithub==2.9.0            # GitHubTool (GITHUB_TOKEN optional)
+huggingface-hub>=0.20.0     # DatasetHubTool (free, no key)
 
 # ── Tools — Video ─────────────────────────────────────────────────────────────
-youtube-transcript-api>=0.6.2  # YouTubeTranscriptTool (free, no key)
+youtube-transcript-api==1.0.3   # YouTubeTranscriptTool (free, no key)
 
 # ── Tools — PDF ───────────────────────────────────────────────────────────────
-pdfplumber>=0.11.0        # PdfReaderTool primary
-PyMuPDF>=1.24.0           # PdfReaderTool fallback for scanned/broken PDFs
+pdfplumber==0.11.9         # PdfReaderTool primary
+PyMuPDF==1.26.0            # PdfReaderTool fallback for scanned/broken PDFs
 
 # ── Vector store (research worker / pipeline) ────────────────────────────────
 # Keep in sync with the Qdrant server image in docker-compose.yml (major.minor).
-qdrant-client>=1.14.2,<1.15
+qdrant-client==1.14.2
+fastembed==0.6.1  # local embeddings for Qdrant (all-MiniLM-L6-v2, ONNX — no PyTorch)
 
 # ── Tests ─────────────────────────────────────────────────────────────────────
-pytest>=8.0.0
-pytest-asyncio>=0.23.0
+pytest==8.4.0
+pytest-asyncio==0.26.0
diff --git a/requirements_api.txt b/requirements_api.txt
index 5bfcbe1..7bc3130 100644
--- a/requirements_api.txt
+++ b/requirements_api.txt
@@ -1,22 +1,21 @@
-fastapi>=0.111
-uvicorn[standard]>=0.29
-sqlalchemy[asyncio]>=2.0
-alembic>=1.13
-asyncpg>=0.29
-pydantic-settings>=2.2
-arq>=0.25
-redis>=5.0
-httpx>=0.27
-google-auth>=2.29
-authlib>=1.3
-python-jose[cryptography]>=3.3
-cryptography>=42.0
-python-multipart>=0.0.9
-aioboto3>=12.3
-markdown>=3.6
-user-agents>=2.2.0
-sentry-sdk>=2.0
-opentelemetry-api>=1.24
-opentelemetry-sdk>=1.24
-pytest-asyncio>=0.23
-anyio>=4.3
+fastapi==0.115.12
+uvicorn[standard]==0.34.2
+sqlalchemy[asyncio]==2.0.41
+alembic==1.18.4
+asyncpg==0.31.0
+pydantic-settings==2.9.1
+arq==0.27.0
+redis==5.3.1
+httpx==0.28.1
+google-auth==2.48.0
+Authlib==1.6.6
+python-jose[cryptography]==3.5.0
+cryptography==46.0.3
+python-multipart==0.0.20
+aioboto3==15.5.0
+user-agents==2.2.0
+sentry-sdk==2.57.0
+opentelemetry-api==1.38.0
+opentelemetry-sdk==1.38.0
+pytest-asyncio==0.26.0
+anyio==4.9.0
diff --git a/test_pipeline_local.py b/test_pipeline_local.py
new file mode 100644
index 0000000..0c4dda6
--- /dev/null
+++ b/test_pipeline_local.py
@@ -0,0 +1,63 @@
+"""
+Quick local smoke-test for the research pipeline at strength=1.
+Run from repo root:
+    python test_pipeline_local.py
+"""
+import asyncio
+import logging
+import os
+import sys
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
+    stream=sys.stdout,
+)
+
+# Load .env so QDRANT_*, TAVILY_*, etc. are available
+from dotenv import load_dotenv
+load_dotenv()
+
+# Force in-memory Qdrant to avoid needing Qdrant Cloud locally
+os.environ.setdefault("QDRANT_FORCE_IN_MEMORY", "1")
+
+from agents.orchestrator.pipeline import run_pipeline
+
+
+async def main() -> None:
+    query = "What is dark matter?"
+    model_id = "grok-3-mini-fast"
+    llm_api_key = os.environ["GROK_API_KEY"]
+
+    print(f"\n{'='*60}")
+    print(f"Query   : {query}")
+    print(f"Model   : {model_id}")
+    print(f"Strength: 1 (minimal)")
+    print(f"{'='*60}\n")
+
+    async def on_phase(phase: str, description: str) -> None:
+        print(f"\n[PHASE] {phase}: {description}")
+
+    async def on_activity(activity: dict) -> None:
+        kind = activity.get("kind", "")
+        meta = activity.get("meta", {})
+        print(f"  [ACT] {kind} {meta}")
+
+    markdown = await run_pipeline(
+        query=query,
+        strength=1,
+        on_phase=on_phase,
+        on_activity=on_activity,
+        model_id=model_id,
+        llm_api_key=llm_api_key,
+    )
+
+    print(f"\n{'='*60}")
+    print("REPORT (first 2000 chars):")
+    print(f"{'='*60}")
+    print(markdown[:2000])
+    print(f"\n[Total length: {len(markdown)} chars]")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/tools/base.py b/tools/base.py
index 7710e52..1121412 100644
--- a/tools/base.py
+++ b/tools/base.py
@@ -47,16 +47,25 @@ async def call_with_retry(
         self,
         query: str,
         max_retries: int = 2,
+        timeout: float = 60.0,
         **kwargs,
     ) -> ToolResult:
         """
         Retries call() with exponential back-off (1s, 2s, ...).
         Never raises — returns ToolResult.failure() after exhausting retries.
+        Each attempt is bounded by `timeout` seconds (default 60s).
         """
         last_error = ""
         for attempt in range(max_retries + 1):
             try:
-                return await self.call(query, **kwargs)
+                return await asyncio.wait_for(
+                    self.call(query, **kwargs),
+                    timeout=timeout,
+                )
+            except asyncio.TimeoutError:
+                last_error = f"timed out after {timeout}s"
+                if attempt < max_retries:
+                    await asyncio.sleep(2 ** attempt)
             except Exception as exc:
                 last_error = str(exc)
                 if attempt < max_retries:
diff --git a/tools/pdf_reader.py b/tools/pdf_reader.py
index 6ab6d7a..3747ebd 100644
--- a/tools/pdf_reader.py
+++ b/tools/pdf_reader.py
@@ -117,7 +117,8 @@ async def call(self, query: str, url: str | None = None, data: bytes | None = No
         if data is None:
             if not url:
                 raise ValueError("PdfReaderTool requires either `url` or `data`")
-            async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=ssl_ctx())) as session:
+            timeout = aiohttp.ClientTimeout(total=30)
+            async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=ssl_ctx()), timeout=timeout) as session:
                 async with session.get(url) as resp:
                     resp.raise_for_status()
                     data = await resp.read()
diff --git a/vector_store/client.py b/vector_store/client.py
index aaf2925..0aebef2 100644
--- a/vector_store/client.py
+++ b/vector_store/client.py
@@ -37,7 +37,7 @@
 )
 from .embedder import Embedder
 
-_QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
+_QDRANT_URL = os.getenv("QDRANT_URL") or os.getenv("QDRANT_LOCATION") or "http://localhost:6333"
 
 _UPSERT_MAX_RETRIES = 3
 _UPSERT_RETRY_DELAY = 1.0   # seconds between retries
@@ -126,7 +126,7 @@ def qdrant(self):
 
     def create_collection(self, run_id: str) -> str:
         """Create a fresh collection for this run. Returns the collection name."""
-        from qdrant_client.models import VectorParams, Distance
+        from qdrant_client.models import VectorParams, Distance, PayloadSchemaType
         name = f"run_{run_id}"
         self.qdrant.recreate_collection(
             collection_name=name,
@@ -135,6 +135,11 @@ def create_collection(self, run_id: str) -> str:
                 distance=Distance.COSINE,
             ),
         )
+        # Qdrant Cloud requires explicit payload indexes for filtering.
+        self.qdrant.create_payload_index(
+            collection_name=name, field_name="credibility",
+            field_schema=PayloadSchemaType.FLOAT,
+        )
         return name
 
     def delete_collection(self, run_id: str) -> None:
diff --git a/vector_store/embedder.py b/vector_store/embedder.py
index 43d4176..cccc104 100644
--- a/vector_store/embedder.py
+++ b/vector_store/embedder.py
@@ -1,6 +1,7 @@
 """
-Embedder — wraps sentence-transformers/all-MiniLM-L6-v2 (local, free, 384-dim).
+Embedder — wraps fastembed/all-MiniLM-L6-v2 (local, free, 384-dim).
 
+Uses ONNX runtime instead of PyTorch — ~10× smaller install footprint.
 Lazy-loads the model on first use so import doesn't pay the load cost.
 Thread-safe: model is loaded once and reused across all calls.
 
@@ -15,23 +16,23 @@
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
-    from sentence_transformers import SentenceTransformer
+    from fastembed import TextEmbedding
 
 _CHUNK_SIZE_CHARS  = 2000   # ≈ 512 tokens
 _CHUNK_OVERLAP_CHARS = 256  # ≈ 64 tokens
 _MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
 
 _lock:  threading.Lock = threading.Lock()
-_model: "SentenceTransformer | None" = None
+_model: "TextEmbedding | None" = None
 
 
-def _get_model() -> "SentenceTransformer":
+def _get_model() -> "TextEmbedding":
     global _model
     if _model is None:
         with _lock:
             if _model is None:
-                from sentence_transformers import SentenceTransformer
-                _model = SentenceTransformer(_MODEL_NAME)
+                from fastembed import TextEmbedding
+                _model = TextEmbedding(_MODEL_NAME)
     return _model
 
 
@@ -41,14 +42,12 @@ class Embedder:
     def embed(self, text: str) -> list[float]:
         """Embed a single string. Returns a 384-dim float list."""
         model = _get_model()
-        vec = model.encode(text, normalize_embeddings=True)
-        return vec.tolist()
+        return next(model.embed([text])).tolist()
 
     def embed_batch(self, texts: list[str]) -> list[list[float]]:
         """Embed multiple strings in one forward pass."""
         model = _get_model()
-        vecs = model.encode(texts, normalize_embeddings=True, batch_size=32)
-        return [v.tolist() for v in vecs]
+        return [vec.tolist() for vec in model.embed(texts, batch_size=32)]
 
     def chunk_text(self, text: str) -> list[str]:
         """
diff --git a/workers/main.py b/workers/main.py
index 5a23a59..b1be248 100644
--- a/workers/main.py
+++ b/workers/main.py
@@ -1,10 +1,16 @@
 from __future__ import annotations
 
+import json
 import logging
+import uuid
+from datetime import datetime, timezone
 
 from arq.connections import RedisSettings
+from sqlalchemy import select
 
 from api.config import settings
+from db.models import ResearchJob
+from db.session import AsyncSessionLocal
 from workers.patch_job import run_patch_job
 from workers.research_job import run_debug_mock_research_job, run_research_job
 from workers.summary_job import run_summary_job
@@ -12,6 +18,42 @@
 logger = logging.getLogger(__name__)
 
 
+async def _recover_orphaned_jobs(redis) -> None:
+    """
+    On worker startup, any job still in 'running' or 'pending' state was
+    interrupted by a previous crash. Mark them failed and publish job_error
+    so the frontend SSE stream or polling fallback surfaces the error immediately.
+    """
+    async with AsyncSessionLocal() as db:
+        result = await db.execute(
+            select(ResearchJob).where(ResearchJob.status.in_(["running", "pending"]))
+        )
+        orphans = result.scalars().all()
+
+        if not orphans:
+            return
+
+        for job in orphans:
+            job.status = "failed"
+            job.finished_at = datetime.now(timezone.utc)
+            job.error_detail = "Worker process restarted unexpectedly. Please retry."
+        await db.commit()
+
+        for job in orphans:
+            payload = json.dumps({
+                "event": "job_error",
+                "data": {
+                    "status": "failed",
+                    "error": "Worker process restarted unexpectedly. Please retry.",
+                    "attempts": job.attempts or 1,
+                    "elapsed_ms": 0,
+                },
+                "id": str(uuid.uuid4()),
+            })
+            await redis.publish(f"job:{job.id}:events", payload)
+            logger.warning("Recovered orphaned job %s → failed", job.id)
+
+
 class WorkerSettings:
     """ARQ worker configuration."""
 
@@ -41,6 +83,7 @@ async def on_startup(ctx: dict) -> None:
         )
         ctx["redis"] = redis
         logger.info("ARQ worker started — redis: %s", settings.redis_url)
+        await _recover_orphaned_jobs(redis)
 
     @staticmethod
     async def on_shutdown(ctx: dict) -> None: