hallelx2 · hallelx2 · May 27, 2026 · May 27, 2026 · May 27, 2026 · May 27, 2026
diff --git a/cmd/engine/main.go b/cmd/engine/main.go
@@ -196,24 +196,41 @@ func run() error {
 	}
 	q.Register(queue.KindIngestDocument, pipeline.Handler())
 
+	// /v1/answer/pageindex gets its OWN PageIndexStrategy instance,
+	// independent of whatever selection strategy retrieval.strategy
+	// selects, so the endpoint works even on a deployment defaulting
+	// to chunked-tree. It is wired only when
+	// retrieval.pageindex.enabled is set AND an LLM client exists.
+	var pageIndexStrategy *retrieval.PageIndexStrategy
+	if cfg.Retrieval.PageIndex.Enabled && llmClient != nil {
+		pageIndexStrategy = buildPageIndexStrategy(cfg.Retrieval, llmClient, store)
+		logger.Info("retrieval: pageindex answer endpoint enabled",
+			"max_hops", pageIndexStrategy.MaxHops,
+			"page_content_limit", pageIndexStrategy.PageContentLimit,
+			"model_override", cfg.Retrieval.PageIndex.Model,
+		)
+	}
+
 	deps := api.Deps{
-		Logger:     logger,
-		DB:         pool,
-		Storage:    store,
-		Queue:      q,
-		Strategy:   strategy,
-		Version:    version,
-		MultiDoc:   multiDoc,
-		LLM:        llmClient,
-		LLMModel:   modelFor(cfg.LLM),
-		AnswerSpan: cfg.Retrieval.AnswerSpan,
-		Answer:     cfg.Retrieval.Answer,
-		Planner:    planner,
-		Planning:   cfg.Retrieval.Planning,
-		ReRanker:   reRanker,
-		ReRank:     cfg.Retrieval.ReRank,
-		Replay:     replayStore,
-		Abstain:    cfg.Retrieval.Abstain,
+		Logger:            logger,
+		DB:                pool,
+		Storage:           store,
+		Queue:             q,
+		Strategy:          strategy,
+		Version:           version,
+		MultiDoc:          multiDoc,
+		LLM:               llmClient,
+		LLMModel:          modelFor(cfg.LLM),
+		AnswerSpan:        cfg.Retrieval.AnswerSpan,
+		Answer:            cfg.Retrieval.Answer,
+		Planner:           planner,
+		Planning:          cfg.Retrieval.Planning,
+		ReRanker:          reRanker,
+		ReRank:            cfg.Retrieval.ReRank,
+		Replay:            replayStore,
+		Abstain:           cfg.Retrieval.Abstain,
+		PageIndexStrategy: pageIndexStrategy,
+		PageIndex:         cfg.Retrieval.PageIndex,
 	}
 
 	srv := &http.Server{
@@ -365,11 +382,36 @@ func buildStrategy(c config.RetrievalConfig, client llmgate.Client, store storag
 		}
 		a.ModelOverride = c.Agentic.Model
 		return a
+	case "pageindex":
+		return buildPageIndexStrategy(c, client, store)
 	default:
 		return retrieval.NewChunkedTree(client)
 	}
 }
 
+// buildPageIndexStrategy constructs the page-based agentic strategy
+// with a storage-backed PageLoader. MaxHops and PageContentLimit are
+// overridden only when the configured value is positive; ModelOverride
+// is copied verbatim. Called by buildStrategy when
+// retrieval.strategy=pageindex AND by the /v1/answer/pageindex endpoint
+// setup (which wires its own instance regardless of that setting).
+//
+// The TOCProvider is left unset here. PR-A (toc-tree-builder) adds
+// documents.toc_tree + a DB-backed provider; until it lands the
+// strategy degrades to its synthesised view (the documented fallback).
+func buildPageIndexStrategy(c config.RetrievalConfig, client llmgate.Client, store storage.Storage) *retrieval.PageIndexStrategy {
+	p := retrieval.NewPageIndexStrategy(client)
+	p.PageLoader = storagePageLoader{s: store}
+	if c.PageIndex.MaxHops > 0 {
+		p.MaxHops = c.PageIndex.MaxHops
+	}
+	if c.PageIndex.PageContentLimit > 0 {
+		p.PageContentLimit = c.PageIndex.PageContentLimit
+	}
+	p.ModelOverride = c.PageIndex.Model // copied as-is, even when empty
+	return p
+}
+
 // storageFetcher adapts a storage.Storage to retrieval.ContentFetcher.
 // The agentic strategy reads section bodies one at a time, so we
 // materialize the full reader contents into a []byte here rather than
@@ -385,6 +427,23 @@ func (sf storageFetcher) Get(ctx context.Context, ref string) ([]byte, error) {
 	return io.ReadAll(rc)
 }
 
+// storagePageLoader adapts a storage.Storage to
+// retrieval.PageContentLoader. It mirrors storageFetcher but sits
+// behind a separate interface so the two callers (agentic /
+// pageindex) can be wired independently. Load reads the whole object
+// into memory: the PageIndex strategy materialises section bodies
+// once per get_pages observation, so a []byte is the right shape.
+type storagePageLoader struct{ s storage.Storage }
+
+func (l storagePageLoader) Load(ctx context.Context, ref string) ([]byte, error) {
+	rc, _, err := l.s.Get(ctx, ref) // second return value is not needed here
+	if err != nil {
+		return nil, err
+	}
+	defer rc.Close()
+	return io.ReadAll(rc)
+}
+
 // buildTLSConfig returns a *tls.Config when direct TLS is enabled, or nil
 // when the engine should serve plaintext (behind a proxy). Returning nil
 // leaves http.Server's TLSConfig unset, which is exactly what ListenAndServe

diff --git a/config.example.yaml b/config.example.yaml
@@ -95,7 +95,20 @@ llm:
     reasoning_model: "gemini-2.5-pro"
 
 retrieval:
-  # strategy: single-pass | chunked-tree
+  # strategy: single-pass | chunked-tree | agentic | pageindex
+  #
+  #   single-pass:  whole tree in one LLM call; fastest, smallest docs.
+  #   chunked-tree: split the tree, reason over slices in parallel, merge.
+  #                 The default. Scales to any tree size by trading
+  #                 context for parallelism.
+  #   agentic:      iterative outline → expand → read → done loop.
+  #                 Picks per-section IDs via a tool-using model.
+  #   pageindex:    PageIndex-style page-based agentic loop. Three
+  #                 tools (get_document_structure / get_pages / done);
+  #                 the model navigates by INCLUSIVE PAGE RANGE
+  #                 rather than by section ID. Best for paginated
+  #                 documents (SEC filings, academic PDFs) where the
+  #                 per-section interface is too noisy.
   strategy: "chunked-tree"
 
   chunked_tree:
@@ -232,6 +245,54 @@ retrieval:
     # audit flows may bump this; tight memory budgets shrink it.
     ttl_seconds: 86400
 
+  # pageindex: PageIndex-style page-based agentic strategy and its
+  # dedicated POST /v1/answer/pageindex endpoint.
+  #
+  # The strategy runs a three-tool loop:
+  #   1. get_document_structure() — returns the TOC tree (titles +
+  #      page ranges, no body text).
+  #   2. get_pages(start_page, end_page) — returns the concatenated
+  #      content of every section whose page range overlaps.
+  #   3. done(answer, cited_pages, reasoning) — terminates with the
+  #      natural-language answer plus the cited inclusive ranges.
+  #
+  # Unlike /v1/answer there's no separate synthesis call — the
+  # model emits the final answer inside the done tool call. The
+  # response carries per-page-range citations with answer-span
+  # quotes, a deterministic trace_token (replayable via
+  # /v1/replay), and an optional reasoning_trace describing every
+  # tool call. Streaming via SSE is available with `stream:true`
+  # on the request body — one event per tool call so callers
+  # watch the navigation in real time.
+  #
+  # OPT-OUT: enabled by default. Set `enabled: false` to unwire the
+  # endpoint (it then returns 501). The strategy itself can still be
+  # selected via `retrieval.strategy: pageindex` even when this
+  # block is disabled.
+  #
+  # Works WITHOUT a persisted TOC tree (pre-PR-A state) — the
+  # strategy synthesises a TOC view from the section list when
+  # documents.toc_tree is NULL. No request fails because of a
+  # missing TOC.
+  pageindex:
+    enabled: true
+    # Cap on LLM turns per request, including the terminal done
+    # turn. The reference PageIndex demo converges in 3-5 hops on
+    # typical questions; 8 leaves buffer for retries on parse
+    # failures and the occasional extra get_pages call.
+    max_hops: 8
+    # Cap on the characters a single get_pages tool call returns.
+    # 16,000 chars ≈ 4K tokens — enough for a 5-7 page excerpt, well
+    # under any flagship model's context window. Higher values risk
+    # burning context budget on stray full-document fetches.
+    page_content_limit: 16000
+    # Override the navigation-loop model; empty inherits the
+    # request's model (which itself falls back to the engine
+    # default). Most deployments leave this blank — navigation
+    # and answer happen in the same loop, so a "small model for
+    # navigation, large for answer" split doesn't apply.
+    model: ""
+
 ingest:
   # The summarize and HyDE stages run concurrently. This caps the total
   # number of LLM calls in flight across both stages combined, so the