Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 76 additions & 17 deletions cmd/engine/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,24 +196,41 @@ func run() error {
}
q.Register(queue.KindIngestDocument, pipeline.Handler())

// /v1/answer/pageindex gets its OWN PageIndexStrategy instance,
// independent of whatever selection strategy is configured in
// retrieval.strategy. This way the endpoint's strategy is built
// (gated by retrieval.pageindex.enabled and a non-nil LLM
// client) even on a deployment using chunked-tree as its
// default selection path.
var pageIndexStrategy *retrieval.PageIndexStrategy
if cfg.Retrieval.PageIndex.Enabled && llmClient != nil {
pageIndexStrategy = buildPageIndexStrategy(cfg.Retrieval, llmClient, store)
logger.Info("retrieval: pageindex answer endpoint enabled",
"max_hops", pageIndexStrategy.MaxHops,
"page_content_limit", pageIndexStrategy.PageContentLimit,
"model_override", cfg.Retrieval.PageIndex.Model,
)
}

deps := api.Deps{
Logger: logger,
DB: pool,
Storage: store,
Queue: q,
Strategy: strategy,
Version: version,
MultiDoc: multiDoc,
LLM: llmClient,
LLMModel: modelFor(cfg.LLM),
AnswerSpan: cfg.Retrieval.AnswerSpan,
Answer: cfg.Retrieval.Answer,
Planner: planner,
Planning: cfg.Retrieval.Planning,
ReRanker: reRanker,
ReRank: cfg.Retrieval.ReRank,
Replay: replayStore,
Abstain: cfg.Retrieval.Abstain,
Logger: logger,
DB: pool,
Storage: store,
Queue: q,
Strategy: strategy,
Version: version,
MultiDoc: multiDoc,
LLM: llmClient,
LLMModel: modelFor(cfg.LLM),
AnswerSpan: cfg.Retrieval.AnswerSpan,
Answer: cfg.Retrieval.Answer,
Planner: planner,
Planning: cfg.Retrieval.Planning,
ReRanker: reRanker,
ReRank: cfg.Retrieval.ReRank,
Replay: replayStore,
Abstain: cfg.Retrieval.Abstain,
PageIndexStrategy: pageIndexStrategy,
PageIndex: cfg.Retrieval.PageIndex,
}

srv := &http.Server{
Expand Down Expand Up @@ -365,11 +382,36 @@ func buildStrategy(c config.RetrievalConfig, client llmgate.Client, store storag
}
a.ModelOverride = c.Agentic.Model
return a
case "pageindex":
return buildPageIndexStrategy(c, client, store)
default:
return retrieval.NewChunkedTree(client)
}
}

// buildPageIndexStrategy assembles a retrieval.PageIndexStrategy
// around the given LLM client, with a storagePageLoader over store
// as its PageLoader. It serves two callers: buildStrategy (when
// retrieval.strategy=pageindex) and the /v1/answer/pageindex
// endpoint setup, which wires its own instance regardless of the
// configured selection strategy.
//
// MaxHops and PageContentLimit are copied from c.PageIndex only
// when they are positive; otherwise the values set by
// NewPageIndexStrategy are kept. The model override is copied
// unconditionally, so an empty string is passed through as-is.
//
// No TOCProvider is set here. PR-A (toc-tree-builder) adds
// documents.toc_tree plus a DB-backed provider; until it lands the
// strategy degrades to its synthesised view, which is the
// documented fallback path.
func buildPageIndexStrategy(c config.RetrievalConfig, client llmgate.Client, store storage.Storage) *retrieval.PageIndexStrategy {
	strat := retrieval.NewPageIndexStrategy(client)
	strat.PageLoader = storagePageLoader{s: store}
	strat.ModelOverride = c.PageIndex.Model

	// Non-positive caps mean "not configured": keep the constructor's values.
	if hops := c.PageIndex.MaxHops; hops > 0 {
		strat.MaxHops = hops
	}
	if limit := c.PageIndex.PageContentLimit; limit > 0 {
		strat.PageContentLimit = limit
	}
	return strat
}

// storageFetcher adapts a storage.Storage to retrieval.ContentFetcher.
// The agentic strategy reads section bodies one at a time, so we
// materialize the full reader contents into a []byte here rather than
Expand All @@ -385,6 +427,23 @@ func (sf storageFetcher) Get(ctx context.Context, ref string) ([]byte, error) {
return io.ReadAll(rc)
}

// storagePageLoader exposes a storage.Storage as a
// retrieval.PageContentLoader. It mirrors storageFetcher but sits
// behind its own interface so that the agentic and pageindex
// callers can be wired independently. The PageIndex strategy
// materialises section bodies once per get_pages observation, so
// handing back the whole object as a []byte is the right shape.
type storagePageLoader struct{ s storage.Storage }

// Load fetches ref from the wrapped storage and returns the full
// contents of the reader it yields. The second value returned by
// Get is ignored, and the reader is closed before Load returns.
func (l storagePageLoader) Load(ctx context.Context, ref string) ([]byte, error) {
	body, _, getErr := l.s.Get(ctx, ref)
	if getErr != nil {
		return nil, getErr
	}
	defer body.Close()

	content, readErr := io.ReadAll(body)
	return content, readErr
}

// buildTLSConfig returns a *tls.Config when direct TLS is enabled, or nil
// when the engine should serve plaintext (behind a proxy). Returning nil
// leaves http.Server's TLSConfig unset, which is exactly what ListenAndServe
Expand Down
63 changes: 62 additions & 1 deletion config.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,20 @@ llm:
reasoning_model: "gemini-2.5-pro"

retrieval:
# strategy: single-pass | chunked-tree
# strategy: single-pass | chunked-tree | agentic | pageindex
#
# single-pass: whole tree in one LLM call; fastest, smallest docs.
# chunked-tree: split the tree, reason over slices in parallel, merge.
# The default. Scales to any tree size by trading
# context for parallelism.
# agentic: iterative outline → expand → read → done loop.
# Picks per-section IDs via a tool-using model.
# pageindex: PageIndex-style page-based agentic loop. Three
# tools (get_document_structure / get_pages / done);
# the model navigates by INCLUSIVE PAGE RANGE
# rather than by section ID. Best for paginated
# documents (SEC filings, academic PDFs) where the
# per-section interface is too noisy.
strategy: "chunked-tree"

chunked_tree:
Expand Down Expand Up @@ -232,6 +245,54 @@ retrieval:
# audit flows may bump this; tight memory budgets shrink it.
ttl_seconds: 86400

# pageindex: PageIndex-style page-based agentic strategy and its
# dedicated POST /v1/answer/pageindex endpoint.
#
# The strategy runs a three-tool loop:
# 1. get_document_structure() — returns the TOC tree (titles +
# page ranges, no body text).
# 2. get_pages(start_page, end_page) — returns the concatenated
# content of every section whose page range overlaps.
# 3. done(answer, cited_pages, reasoning) — terminates with the
# natural-language answer plus the cited inclusive ranges.
#
# Unlike /v1/answer there's no separate synthesis call — the
# model emits the final answer inside the done tool call. The
# response carries per-page-range citations with answer-span
# quotes, a deterministic trace_token (replayable via
# /v1/replay), and an optional reasoning_trace describing every
# tool call. Streaming via SSE is available with `stream:true`
# on the request body — one event is emitted per tool call, so
# callers can watch the navigation in real time.
#
# OPT-OUT: enabled by default. Set `enabled: false` to unwire
# the endpoint (it then returns 501). The strategy itself can
# still be selected by setting `retrieval.strategy: pageindex`
# even when this block is disabled.
#
# Works WITHOUT a persisted TOC tree (pre-PR-A state) — the
# strategy synthesises a TOC view from the section list when
# documents.toc_tree is NULL. No request fails because of a
# missing TOC.
pageindex:
enabled: true
# Cap on LLM turns per request, including the terminal done
# turn. The reference PageIndex demo converges in 3-5 hops on
# typical questions; 8 leaves buffer for retries on parse
# failures and the occasional extra get_pages call.
max_hops: 8
# Cap on the number of characters a single get_pages tool call
# returns. 16,000 chars ≈ 4K tokens — enough for a 5-7 page
# excerpt, well under any flagship model's context window.
# Higher values risk burning context budget on stray
# full-document fetches.
page_content_limit: 16000
# Override the navigation-loop model; empty inherits the
# request's model (which itself falls back to the engine
# default). Most deployments leave this blank — navigation
# and answer happen in the same loop, so a "small model for
# navigation, large for answer" split doesn't apply.
model: ""

ingest:
# The summarize and HyDE stages run concurrently. This caps the total
# number of LLM calls in flight across both stages combined, so the
Expand Down
Loading
Loading