diff --git a/cmd/engine/main.go b/cmd/engine/main.go index cade7fa..5e1968f 100644 --- a/cmd/engine/main.go +++ b/cmd/engine/main.go @@ -196,24 +196,41 @@ func run() error { } q.Register(queue.KindIngestDocument, pipeline.Handler()) + // /v1/answer/pageindex gets its OWN PageIndexStrategy instance, + // independent of whatever selection strategy is configured in + // retrieval.strategy. This way the endpoint is always available + // (gated by retrieval.pageindex.enabled), even on a deployment + // using chunked-tree as its default selection path. + var pageIndexStrategy *retrieval.PageIndexStrategy + if cfg.Retrieval.PageIndex.Enabled && llmClient != nil { + pageIndexStrategy = buildPageIndexStrategy(cfg.Retrieval, llmClient, store) + logger.Info("retrieval: pageindex answer endpoint enabled", + "max_hops", pageIndexStrategy.MaxHops, + "page_content_limit", pageIndexStrategy.PageContentLimit, + "model_override", cfg.Retrieval.PageIndex.Model, + ) + } + deps := api.Deps{ - Logger: logger, - DB: pool, - Storage: store, - Queue: q, - Strategy: strategy, - Version: version, - MultiDoc: multiDoc, - LLM: llmClient, - LLMModel: modelFor(cfg.LLM), - AnswerSpan: cfg.Retrieval.AnswerSpan, - Answer: cfg.Retrieval.Answer, - Planner: planner, - Planning: cfg.Retrieval.Planning, - ReRanker: reRanker, - ReRank: cfg.Retrieval.ReRank, - Replay: replayStore, - Abstain: cfg.Retrieval.Abstain, + Logger: logger, + DB: pool, + Storage: store, + Queue: q, + Strategy: strategy, + Version: version, + MultiDoc: multiDoc, + LLM: llmClient, + LLMModel: modelFor(cfg.LLM), + AnswerSpan: cfg.Retrieval.AnswerSpan, + Answer: cfg.Retrieval.Answer, + Planner: planner, + Planning: cfg.Retrieval.Planning, + ReRanker: reRanker, + ReRank: cfg.Retrieval.ReRank, + Replay: replayStore, + Abstain: cfg.Retrieval.Abstain, + PageIndexStrategy: pageIndexStrategy, + PageIndex: cfg.Retrieval.PageIndex, } srv := &http.Server{ @@ -365,11 +382,36 @@ func buildStrategy(c config.RetrievalConfig, client llmgate.Client, store storag } a.ModelOverride = c.Agentic.Model return a + case "pageindex": + return buildPageIndexStrategy(c, client, store) default: return retrieval.NewChunkedTree(client) } } +// buildPageIndexStrategy constructs the page-based agentic +// strategy with the storage-backed PageLoader and the configured +// caps. Used by buildStrategy when retrieval.strategy=pageindex AND +// by the /v1/answer/pageindex endpoint setup (which wires its own +// instance regardless of the selection strategy). +// +// The TOCProvider is left nil here. PR-A (toc-tree-builder) adds +// documents.toc_tree + a DB-backed provider; until it lands the +// strategy degrades to its synthesised view, which is the +// documented fallback path. +func buildPageIndexStrategy(c config.RetrievalConfig, client llmgate.Client, store storage.Storage) *retrieval.PageIndexStrategy { + p := retrieval.NewPageIndexStrategy(client) + p.PageLoader = storagePageLoader{s: store} + if c.PageIndex.MaxHops > 0 { + p.MaxHops = c.PageIndex.MaxHops + } + if c.PageIndex.PageContentLimit > 0 { + p.PageContentLimit = c.PageIndex.PageContentLimit + } + p.ModelOverride = c.PageIndex.Model + return p +} + // storageFetcher adapts a storage.Storage to retrieval.ContentFetcher. // The agentic strategy reads section bodies one at a time, so we // materialize the full reader contents into a []byte here rather than @@ -385,6 +427,23 @@ func (sf storageFetcher) Get(ctx context.Context, ref string) ([]byte, error) { return io.ReadAll(rc) } +// storagePageLoader adapts a storage.Storage to +// retrieval.PageContentLoader. Mirrors storageFetcher but lives +// behind a separate interface so the two callers (agentic / +// pageindex) can be wired independently. The PageIndex strategy +// materialises section bodies once per get_pages observation, so +// reading the full reader into a []byte is the right shape. +type storagePageLoader struct{ s storage.Storage } + +func (l storagePageLoader) Load(ctx context.Context, ref string) ([]byte, error) { + rc, _, err := l.s.Get(ctx, ref) + if err != nil { + return nil, err + } + defer rc.Close() + return io.ReadAll(rc) +} + // buildTLSConfig returns a *tls.Config when direct TLS is enabled, or nil // when the engine should serve plaintext (behind a proxy). Returning nil // leaves http.Server's TLSConfig unset, which is exactly what ListenAndServe diff --git a/config.example.yaml b/config.example.yaml index 7de0368..a7f2579 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -95,7 +95,20 @@ llm: reasoning_model: "gemini-2.5-pro" retrieval: - # strategy: single-pass | chunked-tree + # strategy: single-pass | chunked-tree | agentic | pageindex + # + # single-pass: whole tree in one LLM call; fastest, smallest docs. + # chunked-tree: split the tree, reason over slices in parallel, merge. + # The default. Scales to any tree size by trading + # context for parallelism. + # agentic: iterative outline → expand → read → done loop. + # Picks per-section IDs via a tool-using model. + # pageindex: PageIndex-style page-based agentic loop. Three + # tools (get_document_structure / get_pages / done); + # the model navigates by INCLUSIVE PAGE RANGE + # rather than by section ID. Best for paginated + # documents (SEC filings, academic PDFs) where the + # per-section interface is too noisy. strategy: "chunked-tree" chunked_tree: @@ -232,6 +245,54 @@ retrieval: # audit flows may bump this; tight memory budgets shrink it. ttl_seconds: 86400 + # pageindex: PageIndex-style page-based agentic strategy and its + # dedicated POST /v1/answer/pageindex endpoint. + # + # The strategy runs a three-tool loop: + # 1. get_document_structure() — returns the TOC tree (titles + + # page ranges, no body text). + # 2. get_pages(start_page, end_page) — returns the concatenated + # content of every section whose page range overlaps. + # 3. done(answer, cited_pages, reasoning) — terminates with the + # natural-language answer plus the cited inclusive ranges. + # + # Unlike /v1/answer there's no separate synthesis call — the + # model emits the final answer inside the done tool call. The + # response carries per-page-range citations with answer-span + # quotes, a deterministic trace_token (replayable via + # /v1/replay), and an optional reasoning_trace describing every + # tool call. Streaming via SSE is available with `stream:true` + # on the request body — one event per tool call so callers + # watch the navigation in real time. + # + # OPT-OUT. Default enabled. Disable to unwire the endpoint + # (returns 501); the strategy itself can still be selected by + # setting `retrieval.strategy: pageindex` even when this block + # is disabled. + # + # Works WITHOUT a persisted TOC tree (pre-PR-A state) — the + # strategy synthesises a TOC view from the section list when + # documents.toc_tree is NULL. No request fails because of a + # missing TOC. + pageindex: + enabled: true + # Cap on LLM turns per request, including the terminal done + # turn. The reference PageIndex demo converges in 3-5 hops on + # typical questions; 8 leaves buffer for retries on parse + # failures and the occasional extra get_pages call. + max_hops: 8 + # Cap on chars one get_pages tool call returns. 16,000 ≈ 4K + # tokens — enough for a 5-7 page excerpt, well under any + # flagship model's context window. Higher values risk burning + # context budget on stray full-document fetches. + page_content_limit: 16000 + # Override the navigation-loop model; empty inherits the + # request's model (which itself falls back to the engine + # default). Most deployments leave this blank — navigation + # and answer happen in the same loop, so a "small model for + # navigation, large for answer" split doesn't apply. + model: "" + ingest: # The summarize and HyDE stages run concurrently. This caps the total # number of LLM calls in flight across both stages combined, so the diff --git a/internal/api/pageindex.go b/internal/api/pageindex.go new file mode 100644 index 0000000..92fd540 --- /dev/null +++ b/internal/api/pageindex.go @@ -0,0 +1,517 @@ +package api + +import ( + "context" + "crypto/sha256" + "encoding/hex" + "encoding/json" + "errors" + "fmt" + "io" + "net/http" + "sort" + "strings" + "sync" + "time" + + "github.com/hallelx2/llmgate" + + "github.com/hallelx2/vectorless-engine/pkg/db" + "github.com/hallelx2/vectorless-engine/pkg/retrieval" + "github.com/hallelx2/vectorless-engine/pkg/tree" +) + +// loadTreeForPageIndex resolves the document tree for the +// pageindex answer endpoint. Routes through the optional +// PageIndexTreeLoader hook when set (tests), otherwise falls +// through to the real DB. +// +// Kept here rather than inlined in the handler so the test seam is +// obvious: production code path goes straight to d.DB.LoadTree; +// tests set d.PageIndexTreeLoader to an in-memory function. +func (d Deps) loadTreeForPageIndex(ctx context.Context, docID tree.DocumentID) (*tree.Tree, error) { + if d.PageIndexTreeLoader != nil { + return d.PageIndexTreeLoader(ctx, docID) + } + return d.DB.LoadTree(ctx, docID, standaloneOrgID, "") +} + +// pageIndexAnswerRequest is the body shape for /v1/answer/pageindex. +// +// The endpoint mirrors /v1/answer's shape but exposes the +// page-based loop's specific knobs (max_hops, max_pages_per_fetch) +// plus a streaming variant. Per-request fields override the +// PageIndexBlock config when present. +type pageIndexAnswerRequest struct { + DocumentID tree.DocumentID `json:"document_id"` + Query string `json:"query"` + Model string `json:"model"` + MaxHops int `json:"max_hops"` + MaxPagesPerFetch int `json:"max_pages_per_fetch"` // chars cap; named per the spec + Stream bool `json:"stream"` + IncludeReasoning bool `json:"reasoning"` +} + +// handleAnswerPageIndex runs the PageIndex agentic loop end-to-end +// and returns the model's answer + page-grounded citations in one +// round-trip. +// +// The loop owns the answer: there's no separate synthesis call. +// /v1/answer extracts spans per section and then synthesises; this +// endpoint asks the model to emit the answer directly inside the +// done action. Citations are per-page-range with answer-span +// quotes pulled from the cited content via the existing +// SpanExtractor. +// +// Differentiators surfaced on the response: +// - trace_token: replay any answer byte-for-byte (substrate +// reused from /v1/answer; the page-based input set is folded +// into the hash so cross-shape tokens never collide). +// - reasoning_trace: per-hop tool calls + arg summaries. Opt-in +// via request body "reasoning":true or query ?reasoning=true. +// - streaming (stream=true): SSE with one event per tool call so +// callers watch the navigation in real time. +// +// Body shape (canonical, non-streaming): +// +// POST /v1/answer/pageindex +// { "document_id": "...", "query": "...", "model"?: "...", +// "max_hops"?: 8, "max_pages_per_fetch"?: 16000, +// "stream"?: false, "reasoning"?: false } +// +// Response: see pageIndexAnswerResponse below. +func (d Deps) handleAnswerPageIndex(w http.ResponseWriter, r *http.Request) { + if d.LLM == nil { + writeErr(w, http.StatusNotImplemented, "answer/pageindex endpoint requires an LLM client") + return + } + if d.PageIndexStrategy == nil || !d.PageIndex.Enabled { + writeErr(w, http.StatusNotImplemented, "pageindex strategy not configured on this server (retrieval.pageindex.enabled=false)") + return + } + + var body pageIndexAnswerRequest + if err := json.NewDecoder(r.Body).Decode(&body); err != nil { + writeErr(w, http.StatusBadRequest, "invalid json: "+err.Error()) + return + } + if body.DocumentID == "" || body.Query == "" { + writeErr(w, http.StatusBadRequest, "document_id and query are required") + return + } + // Allow ?reasoning=true as an alternative to the body field. Same + // rationale as the existing /v1/query streaming flag — caller's + // choice of transport. + if r.URL.Query().Get("reasoning") == "true" { + body.IncludeReasoning = true + } + + t, err := d.loadTreeForPageIndex(r.Context(), body.DocumentID) + if err != nil { + if errors.Is(err, db.ErrNotFound) { + writeErr(w, http.StatusNotFound, "document not found") + return + } + writeErr(w, http.StatusInternalServerError, err.Error()) + return + } + + // Build a per-request strategy that wraps the engine's + // configured PageIndexStrategy. We do this because per-request + // overrides (max_hops, max_pages_per_fetch, model, OnEvent for + // streaming) must NOT mutate the shared Deps instance — Deps + // is read by many goroutines concurrently. + perReq := *d.PageIndexStrategy + if body.MaxHops > 0 { + perReq.MaxHops = body.MaxHops + } + if body.MaxPagesPerFetch > 0 { + perReq.PageContentLimit = body.MaxPagesPerFetch + } + // Per-request model override falls through to budget.ModelName + // the same way every other handler does. + + budget := retrieval.ContextBudget{ModelName: body.Model} + if budget.ModelName == "" { + budget.ModelName = d.LLMModel + } + + started := time.Now() + + // Stream variant: hijack the response writer for SSE and emit + // one event per tool call. + if body.Stream { + d.serveAnswerPageIndexStream(w, r, &perReq, t, body, budget, started) + return + } + + // Non-streaming: optionally capture reasoning trace via the + // OnEvent hook into an in-memory buffer. + var ( + traceMu sync.Mutex + trace []map[string]any + ) + if body.IncludeReasoning { + perReq.OnEvent = func(ev retrieval.PageIndexEvent) { + traceMu.Lock() + defer traceMu.Unlock() + trace = append(trace, eventToTraceMap(ev)) + } + } + + res, err := perReq.SelectWithCost(r.Context(), t, body.Query, budget) + if err != nil { + d.Logger.Error("answer/pageindex: strategy failed", "err", err, "document_id", body.DocumentID) + writeErr(w, http.StatusInternalServerError, "pageindex strategy failed: "+err.Error()) + return + } + + citations := d.buildPageIndexCitations(r.Context(), t, res, body.Query, body.Model) + + resp := map[string]any{ + "document_id": body.DocumentID, + "query": body.Query, + "answer": res.Reasoning, // strategy stores the agent's answer here + "citations": citations, + "strategy": perReq.Name(), + "model": budget.ModelName, + "hops_taken": res.HopsTaken, + "usage": map[string]any{ + "input_tokens": res.Usage.InputTokens, + "output_tokens": res.Usage.OutputTokens, + "total_tokens": res.Usage.TotalTokens, + "cost_usd": res.Usage.CostUSD, + "llm_calls": res.Usage.LLMCalls, + }, + "elapsed_ms": time.Since(started).Milliseconds(), + "trace_token": res.TraceToken, + "pages_read": res.PagesRead, + } + if body.IncludeReasoning && len(trace) > 0 { + resp["reasoning_trace"] = trace + } + + // Persist to the replay store. The trace token is keyed by + // document + sorted cited pages + model, so the same answer is + // fully replayable via the existing /v1/replay endpoint. + finalIDs := append([]tree.SectionID(nil), res.SelectedIDs...) + raw, err := marshalJSONForReplay(resp) + if err != nil { + writeJSON(w, http.StatusOK, resp) + return + } + d.writeJSONWithReplay(w, http.StatusOK, raw, res.TraceToken, retrieval.ReplayEntry{ + DocumentID: body.DocumentID, + Query: body.Query, + Model: budget.ModelName, + SelectedIDs: finalIDs, + }) +} + +// serveAnswerPageIndexStream handles the stream=true SSE variant. +// Each tool call emits one `event:` line so the caller can watch +// the navigation in real time. The final event ("answer") carries +// the full JSON response so the client doesn't need to make a +// second request. +// +// SSE format: `event: \ndata: \n\n` per the W3C spec. +func (d Deps) serveAnswerPageIndexStream(w http.ResponseWriter, r *http.Request, strat *retrieval.PageIndexStrategy, t *tree.Tree, body pageIndexAnswerRequest, budget retrieval.ContextBudget, started time.Time) { + flusher, ok := w.(http.Flusher) + if !ok { + writeErr(w, http.StatusInternalServerError, "streaming requires http.Flusher; response writer does not support it") + return + } + w.Header().Set("Content-Type", "text/event-stream") + w.Header().Set("Cache-Control", "no-cache") + w.Header().Set("Connection", "keep-alive") + w.WriteHeader(http.StatusOK) + + // emitSSE serialises ev to JSON and writes one SSE record. We + // swallow write errors — a disconnected client shouldn't kill + // the strategy mid-flight; the user closing their browser is a + // normal end-state. + var writeMu sync.Mutex + emitSSE := func(eventType string, payload any) { + raw, err := json.Marshal(payload) + if err != nil { + return + } + writeMu.Lock() + defer writeMu.Unlock() + fmt.Fprintf(w, "event: %s\ndata: %s\n\n", eventType, raw) + flusher.Flush() + } + + strat.OnEvent = func(ev retrieval.PageIndexEvent) { + emitSSE(ev.Type, ev) + } + + // Started event so the client knows the loop is running. + emitSSE("started", map[string]any{ + "document_id": body.DocumentID, + "query": body.Query, + "strategy": strat.Name(), + "model": budget.ModelName, + }) + + res, err := strat.SelectWithCost(r.Context(), t, body.Query, budget) + if err != nil { + emitSSE("error", map[string]string{"error": err.Error()}) + return + } + + citations := d.buildPageIndexCitations(r.Context(), t, res, body.Query, body.Model) + final := map[string]any{ + "document_id": body.DocumentID, + "query": body.Query, + "answer": res.Reasoning, + "citations": citations, + "strategy": strat.Name(), + "model": budget.ModelName, + "hops_taken": res.HopsTaken, + "usage": map[string]any{ + "input_tokens": res.Usage.InputTokens, + "output_tokens": res.Usage.OutputTokens, + "total_tokens": res.Usage.TotalTokens, + "cost_usd": res.Usage.CostUSD, + "llm_calls": res.Usage.LLMCalls, + }, + "elapsed_ms": time.Since(started).Milliseconds(), + "trace_token": res.TraceToken, + "pages_read": res.PagesRead, + } + emitSSE("answer", final) +} + +// buildPageIndexCitations transforms the strategy's PagesRead + +// the section tree into the response's citations array. +// +// One citation per cited page range (deduplicated). Each citation +// carries: +// - start_page / end_page +// - section_ids: every section whose [PageStart,PageEnd] overlaps +// the range +// - quote / quote_start / quote_end: pulled via the existing +// SpanExtractor over the concatenated cited-page content. If the +// extractor finds no match the quote field is empty (offsets -1). +func (d Deps) buildPageIndexCitations(ctx context.Context, t *tree.Tree, res *retrieval.Result, query, requestModel string) []map[string]any { + if res == nil { + return nil + } + // Build a citation per UNIQUE page range present in PagesRead. + // The set of pages the model "read" is a superset of what it + // cited — some get_pages calls don't end up in the final + // cited_pages list — but the union is the right cone of trust + // to surface as evidence. The trace token is computed over + // only the strictly-cited ranges, which the strategy already + // has, so citation drift doesn't break replay. + seen := make(map[[2]int]struct{}, len(res.PagesRead)) + citations := make([]map[string]any, 0, len(res.PagesRead)) + + for _, pr := range res.PagesRead { + key := [2]int{pr.StartPage, pr.EndPage} + if _, dup := seen[key]; dup { + continue + } + seen[key] = struct{}{} + + c := map[string]any{ + "start_page": pr.StartPage, + "end_page": pr.EndPage, + "section_ids": pr.SectionIDs, + } + + // Quote extraction is best-effort: an LLM blip or empty + // content returns no quote, which is a normal degradation + // path. We materialise the cited content from storage and + // run one SpanExtractor call per citation. + if d.LLM != nil { + content := d.materialiseCitedContent(ctx, t, pr.SectionIDs) + if strings.TrimSpace(content) != "" { + ext := d.pageIndexSpanExtractor(requestModel) + span, _, err := ext.Extract(ctx, content, query) + if err == nil && span != nil && span.Text != "" { + c["quote"] = span.Text + if span.Start >= 0 && span.End > span.Start { + c["quote_start"] = span.Start + c["quote_end"] = span.End + } + } + } + } + + citations = append(citations, c) + } + + // Sort citations by start_page so output ordering is stable + // across runs that fetch the same set of pages in different + // orders. Stable sort preserves the original-fetch order for + // citations sharing a start page (rare in practice). + sort.SliceStable(citations, func(i, j int) bool { + return citations[i]["start_page"].(int) < citations[j]["start_page"].(int) + }) + + return citations +} + +// materialiseCitedContent loads + concatenates every cited +// section's content. Used for answer-span extraction over the +// pages the model relied on, so the quote can have real byte +// offsets back into the cited evidence. +// +// Limited to 16K chars overall (the extractor's prompt budget +// dictates this), preferring leading sections in page order so +// the quote anchors near the start of the citation when there's +// too much text to fit. +func (d Deps) materialiseCitedContent(ctx context.Context, t *tree.Tree, sectionIDs []tree.SectionID) string { + if len(sectionIDs) == 0 { + return "" + } + var ( + b strings.Builder + budget = 16000 + ) + for _, id := range sectionIDs { + if b.Len() >= budget { + break + } + sec := t.FindByID(id) + if sec == nil || sec.ContentRef == "" { + continue + } + rc, _, err := d.Storage.Get(ctx, sec.ContentRef) + if err != nil { + continue + } + raw, err := io.ReadAll(rc) + rc.Close() + if err != nil { + continue + } + text := strings.TrimSpace(string(raw)) + remaining := budget - b.Len() + if remaining <= 0 { + break + } + if len(text) > remaining { + text = text[:remaining] + } + b.WriteString(text) + b.WriteString("\n\n") + } + return b.String() +} + +// pageIndexSpanExtractor builds a SpanExtractor configured for the +// /v1/answer/pageindex endpoint. Same fall-through pattern as the +// existing spanExtractor helper (config override → request model → +// engine default). +func (d Deps) pageIndexSpanExtractor(requestModel string) *retrieval.SpanExtractor { + model := d.AnswerSpan.Model + if model == "" { + model = requestModel + } + if model == "" { + model = d.LLMModel + } + ext := retrieval.NewSpanExtractor(d.LLM, model) + if d.AnswerSpan.MaxQuoteLen > 0 { + ext.MaxQuoteLen = d.AnswerSpan.MaxQuoteLen + } + return ext +} + +// eventToTraceMap converts a PageIndexEvent into the +// reasoning_trace entry shape. Only documented fields ship — +// nothing internal leaks via the trace. +func eventToTraceMap(ev retrieval.PageIndexEvent) map[string]any { + args := map[string]any{} + switch ev.Type { + case "get_pages": + if ev.StartPage > 0 { + args["start_page"] = ev.StartPage + } + if ev.EndPage > 0 { + args["end_page"] = ev.EndPage + } + case "done": + if len(ev.CitedPages) > 0 { + args["cited_pages"] = ev.CitedPages + } + } + entry := map[string]any{ + "hop": ev.Hop, + "tool": ev.Type, + } + if len(args) > 0 { + entry["args"] = args + } + if ev.Reasoning != "" { + entry["reasoning"] = ev.Reasoning + } + if ev.Note != "" { + entry["note"] = ev.Note + } + if ev.CharCount > 0 { + entry["result_chars"] = ev.CharCount + } + if len(ev.SectionIDs) > 0 { + entry["sections_touched"] = ev.SectionIDs + } + if ev.Answer != "" { + // The final-hop "done" event carries the answer; surfacing + // it in the trace lets a debugger see the agent's literal + // final-turn output alongside the formal response field. + entry["answer"] = ev.Answer + } + return entry +} + +// pageIndexTraceTokenFromCitations exposes the same hash a +// PageIndexStrategy emits to callers who want to recompute the +// token client-side. The page-range string form mirrors the one +// the strategy uses internally so the two stay in lock-step. +// +// Unused at the moment but useful for tests that want to assert +// the in-response trace_token against the canonical input set — +// kept here rather than exported from the retrieval package so +// the API layer owns its own input wiring. +func pageIndexTraceTokenFromCitations(docID tree.DocumentID, model string, ranges [][2]int) string { + strs := make([]string, 0, len(ranges)) + for _, r := range ranges { + if r[0] == r[1] { + strs = append(strs, fmt.Sprintf("%d", r[0])) + } else { + strs = append(strs, fmt.Sprintf("%d-%d", r[0], r[1])) + } + } + sort.Strings(strs) + h := sha256.New() + h.Write([]byte(string(docID))) + h.Write([]byte{0}) + h.Write([]byte("1-pages")) + h.Write([]byte{0}) + h.Write([]byte("pageindex:" + model)) + h.Write([]byte{0}) + h.Write([]byte(retrieval.SystemPromptVersion)) + for i, s := range strs { + if i == 0 { + h.Write([]byte{0}) + } else { + h.Write([]byte{0}) + } + h.Write([]byte("p:" + s)) + } + return hex.EncodeToString(h.Sum(nil)) +} + +// Compile-time guard: the PageIndex strategy must satisfy +// retrieval.CostStrategy so SelectWithCost works without a +// type-assert dance. +var _ retrieval.CostStrategy = (*retrieval.PageIndexStrategy)(nil) + +// Compile-time check that the Deps fields the handler reads are +// the only API-layer dependencies it pulls in. If a future edit +// adds a new dependency here the linter / vet will catch it via +// the unused-import path. +var _ llmgate.Client = (llmgate.Client)(nil) diff --git a/internal/api/pageindex_test.go b/internal/api/pageindex_test.go new file mode 100644 index 0000000..d76bfa4 --- /dev/null +++ b/internal/api/pageindex_test.go @@ -0,0 +1,531 @@ +package api + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "log/slog" + "net/http" + "net/http/httptest" + "strings" + "sync/atomic" + "testing" + "time" + + "github.com/go-chi/chi/v5" + "github.com/hallelx2/llmgate" + + "github.com/hallelx2/vectorless-engine/pkg/config" + "github.com/hallelx2/vectorless-engine/pkg/db" + "github.com/hallelx2/vectorless-engine/pkg/retrieval" + "github.com/hallelx2/vectorless-engine/pkg/storage" + "github.com/hallelx2/vectorless-engine/pkg/tree" +) + +// pageIndexScriptedLLM is the same shape as the strategy test's +// scripted LLM but mirrored here so the api package's tests don't +// reach into pkg/retrieval's test file. +type pageIndexScriptedLLM struct { + replies []string + calls int32 +} + +func (p *pageIndexScriptedLLM) Complete(ctx context.Context, req llmgate.Request) (*llmgate.Response, error) { + i := int(atomic.AddInt32(&p.calls, 1)) - 1 + if i >= len(p.replies) { + return nil, fmt.Errorf("scripted LLM exhausted at call %d", i+1) + } + return &llmgate.Response{Content: p.replies[i]}, nil +} + +func (p *pageIndexScriptedLLM) CountTokens(ctx context.Context, t string) (int, error) { + return len(t) / 4, nil +} + +// inMemoryStorage is a minimal storage.Storage backed by a map. +// Only Get is meaningful for the pageindex handler tests. +type inMemoryStorage struct { + data map[string][]byte +} + +func (m *inMemoryStorage) Put(ctx context.Context, key string, r io.Reader, meta storage.Metadata) error { + b, err := io.ReadAll(r) + if err != nil { + return err + } + if m.data == nil { + m.data = map[string][]byte{} + } + m.data[key] = b + return nil +} + +func (m *inMemoryStorage) Get(ctx context.Context, key string) (io.ReadCloser, storage.Metadata, error) { + b, ok := m.data[key] + if !ok { + return nil, storage.Metadata{}, storage.ErrNotFound + } + return io.NopCloser(bytes.NewReader(b)), storage.Metadata{Key: key, Size: int64(len(b))}, nil +} + +func (m *inMemoryStorage) Delete(ctx context.Context, key string) error { return nil } + +func (m *inMemoryStorage) Exists(ctx context.Context, key string) (bool, error) { + _, ok := m.data[key] + return ok, nil +} + +func (m *inMemoryStorage) SignedURL(ctx context.Context, key string, expiry time.Duration) (string, error) { + return "", nil +} + +// pageIndexHandlerRouter wires only the endpoint under test. We +// don't want middleware noise interfering with the assertion path. +func pageIndexHandlerRouter(d Deps) http.Handler { + r := chi.NewRouter() + r.Route("/v1", func(r chi.Router) { + r.Post("/answer/pageindex", d.handleAnswerPageIndex) + }) + return r +} + +// buildPageIndexTestTree mirrors the strategy tests' tree so +// assertions about which section IDs surface in citations stay +// consistent across the two suites. +func buildPageIndexTestTree() *tree.Tree { + a1 := &tree.Section{ID: "sec_a1", ParentID: "sec_a", Title: "Install", Summary: "install steps", ContentRef: "a1_ref", PageStart: 1, PageEnd: 2} + a2 := &tree.Section{ID: "sec_a2", ParentID: "sec_a", Title: "Config", Summary: "config keys", ContentRef: "a2_ref", PageStart: 3, PageEnd: 4} + b1 := &tree.Section{ID: "sec_b1", ParentID: "sec_b", Title: "Querying", Summary: "how to query", ContentRef: "b1_ref", PageStart: 5, PageEnd: 7} + b2 := &tree.Section{ID: "sec_b2", ParentID: "sec_b", Title: "Debt", Summary: "long-term debt", ContentRef: "b2_ref", PageStart: 8, PageEnd: 9} + a := &tree.Section{ID: "sec_a", ParentID: "sec_root", Title: "Setup", Summary: "setup", Children: []*tree.Section{a1, a2}, PageStart: 1, PageEnd: 4} + b := &tree.Section{ID: "sec_b", ParentID: "sec_root", Title: "Usage", Summary: "usage", Children: []*tree.Section{b1, b2}, PageStart: 5, PageEnd: 9} + root := &tree.Section{ID: "sec_root", Title: "Atlas", Children: []*tree.Section{a, b}, PageStart: 1, PageEnd: 9} + return &tree.Tree{DocumentID: "doc_x", Title: "Atlas", Root: root} +} + +// newTestDeps wires the minimum surface for the pageindex handler +// to run end-to-end against httptest. The strategy is constructed +// directly (no DB / cache wrapper) so per-test LLM scripting +// drives behaviour deterministically. +func newTestDeps(t *testing.T, replies ...string) (Deps, *pageIndexScriptedLLM, *inMemoryStorage) { + t.Helper() + + llm := &pageIndexScriptedLLM{replies: replies} + store := &inMemoryStorage{data: map[string][]byte{ + "a1_ref": []byte("Install steps: run vle ingest..."), + "a2_ref": []byte("Config keys: VLE_FOO, VLE_BAR."), + "b1_ref": []byte("How to query the API."), + "b2_ref": []byte("Debt registration is in line items A and B."), + }} + strat := retrieval.NewPageIndexStrategy(llm) + strat.PageLoader = pageStorageLoader{s: store} + + deps := Deps{ + Logger: slog.Default(), + Storage: store, + LLM: llm, + LLMModel: "test-model", + Strategy: strat, // unrelated to /v1/answer/pageindex; populated for sanity + PageIndexStrategy: strat, + PageIndex: config.PageIndexBlock{Enabled: true, MaxHops: 8, PageContentLimit: 16000}, + AnswerSpan: config.AnswerSpanBlock{Enabled: false}, + Replay: retrieval.NewLRUReplayStore(retrieval.LRUReplayConfig{ + MaxEntries: 16, + TTL: 5 * time.Minute, + }), + PageIndexTreeLoader: func(ctx context.Context, docID tree.DocumentID) (*tree.Tree, error) { + if docID != "doc_x" { + return nil, fmt.Errorf("unknown document %q (test loader only knows doc_x)", docID) + } + return buildPageIndexTestTree(), nil + }, + } + return deps, llm, store +} + +// pageStorageLoader adapts the in-memory storage to the +// PageContentLoader interface the strategy expects. The +// production engine uses an identical adapter in cmd/engine/main.go; +// duplicating it here keeps the test self-contained. +type pageStorageLoader struct{ s storage.Storage } + +func (l pageStorageLoader) Load(ctx context.Context, ref string) ([]byte, error) { + rc, _, err := l.s.Get(ctx, ref) + if err != nil { + return nil, err + } + defer rc.Close() + return io.ReadAll(rc) +} + +// TestHandleAnswerPageIndexHappyPath: the canonical 3-tool +// sequence ends with a JSON response carrying answer, citations, +// hops_taken, trace_token, pages_read, and a usage block. The +// LLM is NOT called for span extraction in this test path because +// AnswerSpan.Enabled is false at the config-block level — but the +// citations still surface section_ids and page ranges. +func TestHandleAnswerPageIndexHappyPath(t *testing.T) { + t.Parallel() + + deps, _, _ := newTestDeps(t, + `{"tool":"get_document_structure","reasoning":"orient"}`, + `{"tool":"get_pages","start_page":1,"end_page":2,"reasoning":"install lives here"}`, + `{"tool":"done","answer":"Run vle ingest.","cited_pages":[[1,2]],"reasoning":"install on pages 1-2"}`, + ) + + body := strings.NewReader(`{"document_id":"doc_x","query":"how do I install?","model":"test-model"}`) + req := httptest.NewRequest(http.MethodPost, "/v1/answer/pageindex", body) + rec := httptest.NewRecorder() + pageIndexHandlerRouter(deps).ServeHTTP(rec, req) + + if rec.Code != http.StatusOK { + t.Fatalf("status = %d, body = %s", rec.Code, rec.Body.String()) + } + var resp map[string]any + if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil { + t.Fatalf("unmarshal response: %v\n%s", err, rec.Body.String()) + } + if resp["answer"].(string) != "Run vle ingest." { + t.Errorf("answer = %v, want \"Run vle ingest.\"", resp["answer"]) + } + if resp["strategy"].(string) != "pageindex" { + t.Errorf("strategy = %v, want pageindex", resp["strategy"]) + } + if resp["hops_taken"].(float64) != 3 { + t.Errorf("hops_taken = %v, want 3", resp["hops_taken"]) + } + if resp["trace_token"].(string) == "" { + t.Error("trace_token must be non-empty on success") + } + cits, ok := resp["citations"].([]any) + if !ok || len(cits) == 0 { + t.Fatalf("citations missing or empty: %v", resp["citations"]) + } + first := cits[0].(map[string]any) + if first["start_page"].(float64) != 1 || first["end_page"].(float64) != 2 { + t.Errorf("first citation page range = %v-%v, want 1-2", first["start_page"], first["end_page"]) + } + secs, ok := first["section_ids"].([]any) + if !ok || len(secs) == 0 { + t.Errorf("first citation must list section_ids, got %v", first["section_ids"]) + } + // pages_read must surface the get_pages invocation + pages, ok := resp["pages_read"].([]any) + if !ok || len(pages) != 1 { + t.Errorf("pages_read = %v, want 1 entry", resp["pages_read"]) + } +} + +// TestHandleAnswerPageIndexReasoningTrace: with reasoning=true, +// the response carries a reasoning_trace array describing each +// tool call. Each entry must have hop + tool + (optional) args. +func TestHandleAnswerPageIndexReasoningTrace(t *testing.T) { + t.Parallel() + + deps, _, _ := newTestDeps(t, + `{"tool":"get_document_structure","reasoning":"orient"}`, + `{"tool":"get_pages","start_page":3,"end_page":4,"reasoning":"look at config"}`, + `{"tool":"done","answer":"Config keys are VLE_*","cited_pages":[[3,4]],"reasoning":"config on 3-4"}`, + ) + + body := strings.NewReader(`{"document_id":"doc_x","query":"config?","reasoning":true}`) + req := httptest.NewRequest(http.MethodPost, "/v1/answer/pageindex", body) + rec := httptest.NewRecorder() + pageIndexHandlerRouter(deps).ServeHTTP(rec, req) + + var resp map[string]any + _ = json.Unmarshal(rec.Body.Bytes(), &resp) + trace, ok := resp["reasoning_trace"].([]any) + if !ok || len(trace) != 3 { + t.Fatalf("reasoning_trace = %v, want 3 entries", resp["reasoning_trace"]) + } + // Every entry must have a hop number and a tool tag. + for i, raw := range trace { + entry := raw.(map[string]any) + if _, ok := entry["hop"]; !ok { + t.Errorf("trace entry %d missing hop", i) + } + if _, ok := entry["tool"]; !ok { + t.Errorf("trace entry %d missing tool", i) + } + } + // The get_pages entry must surface its args. + if args, ok := trace[1].(map[string]any)["args"].(map[string]any); !ok { + t.Errorf("get_pages trace entry missing args, got %v", trace[1]) + } else { + if args["start_page"].(float64) != 3 || args["end_page"].(float64) != 4 { + t.Errorf("trace args = %v, want start=3 end=4", args) + } + } +} + +// TestHandleAnswerPageIndexReasoningTraceQueryParam: the +// ?reasoning=true query param is an alternative to the body field. +// Some clients prefer it for GET-friendliness when prototyping. +func TestHandleAnswerPageIndexReasoningTraceQueryParam(t *testing.T) { + t.Parallel() + + deps, _, _ := newTestDeps(t, + `{"tool":"done","answer":"x","cited_pages":[]}`, + ) + + body := strings.NewReader(`{"document_id":"doc_x","query":"q"}`) + req := httptest.NewRequest(http.MethodPost, "/v1/answer/pageindex?reasoning=true", body) + rec := httptest.NewRecorder() + pageIndexHandlerRouter(deps).ServeHTTP(rec, req) + + var resp map[string]any + _ = json.Unmarshal(rec.Body.Bytes(), &resp) + if _, ok := resp["reasoning_trace"]; !ok { + t.Errorf("?reasoning=true must produce reasoning_trace, got: %v", resp) + } +} + +// TestHandleAnswerPageIndexBadRequest: missing document_id / +// query → 400. +func TestHandleAnswerPageIndexBadRequest(t *testing.T) { + t.Parallel() + + deps, _, _ := newTestDeps(t) + + for _, body := range []string{ + `{}`, + `{"document_id":"doc_x"}`, // missing query + `{"query":"q"}`, // missing document_id + `not-json`, + } { + req := httptest.NewRequest(http.MethodPost, "/v1/answer/pageindex", strings.NewReader(body)) + rec := httptest.NewRecorder() + pageIndexHandlerRouter(deps).ServeHTTP(rec, req) + if rec.Code != http.StatusBadRequest { + t.Errorf("body %q: status = %d, want 400", body, rec.Code) + } + } +} + +// TestHandleAnswerPageIndexDocumentNotFound: a tree-loader that +// returns ErrNotFound bubbles up as 404. The test loader rejects +// unknown doc IDs. +func TestHandleAnswerPageIndexDocumentNotFound(t *testing.T) { + t.Parallel() + + deps, _, _ := newTestDeps(t) + // Re-wire the loader to return ErrNotFound for the right error + // path. The default test loader returns a generic error + // (different status — also valid but less specific). + deps.PageIndexTreeLoader = func(ctx context.Context, docID tree.DocumentID) (*tree.Tree, error) { + return nil, dbNotFoundError() + } + + body := strings.NewReader(`{"document_id":"missing","query":"q"}`) + req := httptest.NewRequest(http.MethodPost, "/v1/answer/pageindex", body) + rec := httptest.NewRecorder() + pageIndexHandlerRouter(deps).ServeHTTP(rec, req) + if rec.Code != http.StatusNotFound { + t.Errorf("status = %d, want 404 (body: %s)", rec.Code, rec.Body.String()) + } +} + +// TestHandleAnswerPageIndexDisabled: when PageIndex.Enabled=false +// or PageIndexStrategy is nil, the endpoint returns 501. Two +// failure modes, both must produce the same status. +func TestHandleAnswerPageIndexDisabled(t *testing.T) { + t.Parallel() + + // Mode 1: config disabled. + deps, _, _ := newTestDeps(t) + deps.PageIndex.Enabled = false + + body := strings.NewReader(`{"document_id":"doc_x","query":"q"}`) + req := httptest.NewRequest(http.MethodPost, "/v1/answer/pageindex", body) + rec := httptest.NewRecorder() + pageIndexHandlerRouter(deps).ServeHTTP(rec, req) + if rec.Code != http.StatusNotImplemented { + t.Errorf("config disabled: status = %d, want 501", rec.Code) + } + + // Mode 2: strategy nil. + deps2, _, _ := newTestDeps(t) + deps2.PageIndexStrategy = nil + + body = strings.NewReader(`{"document_id":"doc_x","query":"q"}`) + req = httptest.NewRequest(http.MethodPost, "/v1/answer/pageindex", body) + rec = httptest.NewRecorder() + pageIndexHandlerRouter(deps2).ServeHTTP(rec, req) + if rec.Code != http.StatusNotImplemented { + t.Errorf("strategy nil: status = %d, want 501", rec.Code) + } +} + +// TestHandleAnswerPageIndexNoLLM: no LLM client → 501. +func TestHandleAnswerPageIndexNoLLM(t *testing.T) { + t.Parallel() + + deps, _, _ := newTestDeps(t) + deps.LLM = nil + + body := strings.NewReader(`{"document_id":"doc_x","query":"q"}`) + req := httptest.NewRequest(http.MethodPost, "/v1/answer/pageindex", body) + rec := httptest.NewRecorder() + pageIndexHandlerRouter(deps).ServeHTTP(rec, req) + if rec.Code != http.StatusNotImplemented { + t.Errorf("status = %d, want 501", rec.Code) + } +} + +// TestHandleAnswerPageIndexReplayPersisted: the response is +// stored in the replay store under its trace_token, and the +// existing /v1/replay handler returns the byte-identical body. +func TestHandleAnswerPageIndexReplayPersisted(t *testing.T) { + t.Parallel() + + deps, _, _ := newTestDeps(t, + `{"tool":"done","answer":"X","cited_pages":[[5,7]]}`, + ) + + body := strings.NewReader(`{"document_id":"doc_x","query":"replay-me"}`) + req := httptest.NewRequest(http.MethodPost, "/v1/answer/pageindex", body) + rec := httptest.NewRecorder() + pageIndexHandlerRouter(deps).ServeHTTP(rec, req) + if rec.Code != http.StatusOK { + t.Fatalf("first call: status = %d, body = %s", rec.Code, rec.Body.String()) + } + originalBody := rec.Body.Bytes() + + var resp map[string]any + _ = json.Unmarshal(originalBody, &resp) + token := resp["trace_token"].(string) + if token == "" { + t.Fatal("trace_token must be populated for replay") + } + + // Now hit /v1/replay with the same token + query + doc id. + r2 := chi.NewRouter() + r2.Route("/v1", func(r chi.Router) { + r.Post("/replay", deps.handleReplay) + }) + replayBody := strings.NewReader(fmt.Sprintf(`{"trace_token":%q,"query":"replay-me","document_id":"doc_x"}`, token)) + req2 := httptest.NewRequest(http.MethodPost, "/v1/replay", replayBody) + rec2 := httptest.NewRecorder() + r2.ServeHTTP(rec2, req2) + if rec2.Code != http.StatusOK { + t.Fatalf("replay status = %d, body = %s", rec2.Code, rec2.Body.String()) + } + // The original /v1/answer/pageindex response carries a trailing + // newline from marshalJSONForReplay; the replay path returns + // the exact stored bytes, so we compare with the newline. + if !bytes.Equal(originalBody, rec2.Body.Bytes()) { + t.Errorf("replay bytes differ from original\nORIG: %s\nREP : %s", originalBody, rec2.Body.Bytes()) + } +} + +// TestHandleAnswerPageIndexStreaming: with stream=true, the +// response is SSE with one event per tool call plus a started + +// answer event. The data payloads are JSON. +func TestHandleAnswerPageIndexStreaming(t *testing.T) { + t.Parallel() + + deps, _, _ := newTestDeps(t, + `{"tool":"get_document_structure"}`, + `{"tool":"get_pages","start_page":1,"end_page":2}`, + `{"tool":"done","answer":"streamed","cited_pages":[[1,2]]}`, + ) + + body := strings.NewReader(`{"document_id":"doc_x","query":"q","stream":true}`) + req := httptest.NewRequest(http.MethodPost, "/v1/answer/pageindex", body) + rec := httptest.NewRecorder() + pageIndexHandlerRouter(deps).ServeHTTP(rec, req) + if rec.Code != http.StatusOK { + t.Fatalf("stream status = %d", rec.Code) + } + if ct := rec.Header().Get("Content-Type"); ct != "text/event-stream" { + t.Errorf("Content-Type = %q, want text/event-stream", ct) + } + out := rec.Body.String() + for _, want := range []string{ + "event: started", + "event: get_document_structure", + "event: get_pages", + "event: done", + "event: answer", + } { + if !strings.Contains(out, want) { + t.Errorf("missing SSE %q in stream body:\n%s", want, out) + } + } +} + +// TestHandleAnswerPageIndexPerRequestOverrides: max_hops and +// max_pages_per_fetch on the body override the engine's config. +// We can't measure max_pages_per_fetch from outside (it shapes +// content size, not response shape), but we can verify max_hops +// caps the loop. Set max_hops=1 and a script that emits +// 5 turns — the strategy must stop after 1. +func TestHandleAnswerPageIndexPerRequestOverrides(t *testing.T) { + t.Parallel() + + // 6 replies but max_hops=1 → only the first runs as a normal + // hop, then forceDone kicks in (2 LLM calls total counting the + // force-done turn). The model never emits a valid done, so + // the response answer is empty. + deps, _, _ := newTestDeps(t, + `{"tool":"get_pages","start_page":1,"end_page":2}`, + `{"tool":"get_pages","start_page":3,"end_page":4}`, + `{"tool":"get_pages","start_page":5,"end_page":6}`, + `{"tool":"get_pages","start_page":7,"end_page":9}`, + `{"tool":"get_pages","start_page":1,"end_page":1}`, + ) + + body := strings.NewReader(`{"document_id":"doc_x","query":"q","max_hops":1}`) + req := httptest.NewRequest(http.MethodPost, "/v1/answer/pageindex", body) + rec := httptest.NewRecorder() + pageIndexHandlerRouter(deps).ServeHTTP(rec, req) + if rec.Code != http.StatusOK { + t.Fatalf("status = %d, body = %s", rec.Code, rec.Body.String()) + } + + var resp map[string]any + _ = json.Unmarshal(rec.Body.Bytes(), &resp) + // hops_taken includes the forced done turn, so cap=1 → at most + // 2 actual calls. + if hops, ok := resp["hops_taken"].(float64); !ok || hops > 2 { + t.Errorf("hops_taken = %v, want <=2 (max_hops=1 + 1 force-done)", resp["hops_taken"]) + } +} + +// TestHandleAnswerPageIndexTOCFallback: with a tree that has +// page metadata but no persisted TOC, the synthesised TOC drives +// the get_document_structure tool. This test runs end-to-end and +// asserts the response shape; the strategy-level test covers the +// synthesis logic directly. +func TestHandleAnswerPageIndexTOCFallback(t *testing.T) { + t.Parallel() + + deps, _, _ := newTestDeps(t, + `{"tool":"get_document_structure"}`, + `{"tool":"done","answer":"saw the toc","cited_pages":[]}`, + ) + // PageIndexStrategy.TOC is left nil — the synthesised path is + // the default for any deployment without PR-A merged. + + body := strings.NewReader(`{"document_id":"doc_x","query":"what is in the doc?"}`) + req := httptest.NewRequest(http.MethodPost, "/v1/answer/pageindex", body) + rec := httptest.NewRecorder() + pageIndexHandlerRouter(deps).ServeHTTP(rec, req) + if rec.Code != http.StatusOK { + t.Fatalf("status = %d, body = %s", rec.Code, rec.Body.String()) + } + var resp map[string]any + _ = json.Unmarshal(rec.Body.Bytes(), &resp) + if resp["answer"].(string) != "saw the toc" { + t.Errorf("answer = %v, want 'saw the toc'", resp["answer"]) + } +} + +// dbNotFoundError returns the real db.ErrNotFound sentinel so the +// handler's errors.Is(err, db.ErrNotFound) check fires. +func dbNotFoundError() error { + return db.ErrNotFound +} diff --git a/internal/api/server.go b/internal/api/server.go index b65ab78..e0dd375 100644 --- a/internal/api/server.go +++ b/internal/api/server.go @@ -98,6 +98,28 @@ type Deps struct { // carries abstained=true and an empty sections / citations list // rather than risk hallucinating an answer from weak evidence. Abstain config.AbstainBlock + + // PageIndexStrategy is the dedicated page-based agentic strategy + // instance used by /v1/answer/pageindex. Wired in main.go from + // the same storage backend the rest of the engine uses, even + // when the selection strategy chosen by retrieval.strategy is + // something else. Nil disables the endpoint (returns 501) along + // with PageIndex.Enabled=false. + PageIndexStrategy *retrieval.PageIndexStrategy + + // PageIndex carries the server-side config for the page-based + // answer endpoint. The body-level fields max_hops / + // max_pages_per_fetch on /v1/answer/pageindex override + // PageIndex.MaxHops / PageIndex.PageContentLimit per request. + PageIndex config.PageIndexBlock + + // PageIndexTreeLoader is a test seam that overrides how the + // /v1/answer/pageindex handler resolves the document tree. + // Nil routes through d.DB.LoadTree (the production path). + // Tests set this to a deterministic in-memory function so the + // handler can run end-to-end via httptest without a real + // Postgres backend. + PageIndexTreeLoader func(ctx context.Context, docID tree.DocumentID) (*tree.Tree, error) } // Router builds and returns the chi router wired with v1 routes. @@ -124,6 +146,7 @@ func Router(d Deps) http.Handler { r.Post("/query", d.handleQuery) r.Post("/query/multi", d.handleQueryMulti) r.Post("/answer", d.handleAnswer) + r.Post("/answer/pageindex", d.handleAnswerPageIndex) r.Post("/replay", d.handleReplay) }) diff --git a/openapi.yaml b/openapi.yaml index 89dc769..ddd8022 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -279,6 +279,93 @@ paths: "501": description: Endpoint not available — no LLM client configured + /v1/answer/pageindex: + post: + tags: [Query] + summary: PageIndex-style page-based agentic answer + operationId: answerPageIndex + description: | + Quote-grounded answer endpoint backed by the page-based + agentic strategy. The model navigates the document via a + three-tool loop: + + 1. get_document_structure() — returns the TOC tree + (titles + page ranges, no body text). + 2. get_pages(start_page, end_page) — returns the + concatenated content of every section whose page + range overlaps the requested range. + 3. done(answer, cited_pages, reasoning) — terminates + with the natural-language answer plus the inclusive + page ranges the answer relies on. + + Unlike /v1/answer this endpoint runs no separate + synthesis call — the model produces the final answer + inside the done tool call. Citations are per-page-range + with answer-span quotes pulled from the cited content. + + Differentiators: + - `trace_token` substrate is reused: the page-based + input set (sorted cited page ranges) is folded into + the same sha256 the per-section endpoints use, so + /v1/replay accepts page-index responses uniformly. + - `reasoning_trace` lists every tool call the agent + made, opt-in via body `reasoning:true` or query + `?reasoning=true`. + - Streaming (`stream:true`) returns Server-Sent + Events with one event per tool call. + + Gracefully degrades when no LLM-built TOC tree has + been persisted for the document yet — the strategy + synthesises a TOC view from the section list rather + than failing the request. + parameters: + - name: reasoning + in: query + required: false + description: | + Alternative to the body's `reasoning` field. When set + to `true` the response carries a `reasoning_trace` + array describing every tool call. + schema: + type: boolean + requestBody: + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/PageIndexAnswerRequest" + responses: + "200": + description: | + Synthesised answer plus page-grounded citations. When + `stream` is `true`, the response is Server-Sent + Events (Content-Type `text/event-stream`) rather than + JSON; see the description for event types. + content: + application/json: + schema: + $ref: "#/components/schemas/PageIndexAnswerResponse" + text/event-stream: + schema: + type: string + description: | + SSE event stream. Event types emitted in + order: `started`, one event per tool call + (`get_document_structure`, `get_pages`, + `done`), and finally `answer` carrying the + full PageIndexAnswerResponse payload. On + error a single `error` event is emitted. + "400": + description: Missing or invalid request body. + "404": + $ref: "#/components/responses/NotFound" + "501": + description: | + Endpoint not available. Either the server has no LLM + client configured, or + `retrieval.pageindex.enabled=false`, or the + PageIndexStrategy instance was not wired at boot. + /v1/replay: post: tags: [Query] @@ -578,7 +665,7 @@ components: type: string strategy: type: string - enum: [single-pass, chunked-tree, agentic] + enum: [single-pass, chunked-tree, agentic, pageindex] model: type: string sections: @@ -948,6 +1035,249 @@ components: re-rank ran. Higher means the section is more directly relevant to the query. + PageIndexAnswerRequest: + type: object + description: | + Body for POST /v1/answer/pageindex. The endpoint exposes + per-request overrides for the page-based loop's caps + (max_hops, max_pages_per_fetch) alongside the standard + document_id / query / model fields. + required: [document_id, query] + properties: + document_id: + type: string + query: + type: string + description: Natural-language query. + model: + type: string + description: | + Override the LLM model used by the navigation loop. + Falls back to `retrieval.pageindex.model`, then to the + engine's default model. + max_hops: + type: integer + description: | + Cap on the number of LLM turns the loop consumes, + counting the terminal `done` turn. Overrides + `retrieval.pageindex.max_hops` for this request only. + Default at the server level is 8. + max_pages_per_fetch: + type: integer + description: | + Cap on the characters one `get_pages` tool call may + return. Keeps a stray full-document fetch from + torching the model's context window. Overrides + `retrieval.pageindex.page_content_limit`. Default + 16000. + stream: + type: boolean + description: | + When true, the response is Server-Sent Events: one + event per tool call (get_document_structure / + get_pages / done) followed by a terminal `answer` + event carrying the full PageIndexAnswerResponse + payload. Lets the caller WATCH the agent navigate + in real time rather than waiting for the final + answer. + reasoning: + type: boolean + description: | + When true, the response carries a `reasoning_trace` + array describing every tool call the agent made + (hop, tool tag, args, optional sections-touched and + result-chars metadata). Equivalent to the + `?reasoning=true` query parameter. + + PageIndexAnswerResponse: + type: object + description: | + Non-streaming response shape. The `answer` field carries + the agent's natural-language reply produced inside the + terminal `done` tool call (no separate synthesis call). + Citations are per-page-range (not per-section) with + answer-span quotes pulled from the cited content. + properties: + document_id: + type: string + query: + type: string + answer: + type: string + description: | + Natural-language answer the agent emitted inside its + terminal `done` tool call. Empty when the loop hit + max_hops without the model finalising — callers + should fall through to a manual /v1/query or retry. + citations: + type: array + items: + $ref: "#/components/schemas/PageIndexCitation" + strategy: + type: string + enum: [pageindex] + model: + type: string + hops_taken: + type: integer + description: | + Number of LLM turns the loop consumed, counting any + forced terminal turn fired when max_hops was hit. + usage: + type: object + properties: + input_tokens: {type: integer} + output_tokens: {type: integer} + total_tokens: {type: integer} + cost_usd: {type: number} + llm_calls: {type: integer} + elapsed_ms: + type: integer + trace_token: + type: string + description: | + Deterministic 64-char hex sha256 token over the + document, system-prompt version, "pageindex:" model + tag, and sorted cited page ranges (e.g. ["1-2","5-7"]). + Two runs that cite the same pages — even via + different navigation paths — collapse to the same + token. Pass to /v1/replay with the original `query` + and `document_id` to fetch the byte-identical + response. + pages_read: + type: array + items: + $ref: "#/components/schemas/PageReadEntry" + description: | + Every `get_pages` invocation the loop made, + including ones whose ranges the model did not end + up citing. Useful for cost debugging and for + building a "navigation timeline" UI. + reasoning_trace: + type: array + items: + $ref: "#/components/schemas/PageIndexTraceEntry" + description: | + Per-hop tool calls + arg summaries. Present only + when the request opted in via `reasoning:true` or + `?reasoning=true`. + + PageIndexCitation: + type: object + description: | + One citation behind the agent's answer. The pages + identify the cited range; `section_ids` lists every + section whose page range overlaps it; `quote` is the + verbatim answer-span the extractor judged most + relevant. When the extractor finds nothing in the + cited content the `quote` field is omitted (no + sentinel: absent means absent). + properties: + start_page: + type: integer + description: Inclusive first page in the citation. + end_page: + type: integer + description: Inclusive last page in the citation. + section_ids: + type: array + items: + type: string + description: | + Every section whose [page_start,page_end] overlaps + this citation, in document order. + quote: + type: string + description: | + Verbatim quote from the cited content judged most + relevant to the query. Omitted when no quote was + extracted (e.g. extractor returned nothing). + quote_start: + type: integer + description: | + Byte offset of the quote in the concatenated cited + content. -1 when the extractor returned a quote but + could not locate it verbatim (typically a + paraphrase). + quote_end: + type: integer + + PageReadEntry: + type: object + description: | + One `get_pages` call the agent made during navigation. + Surfaces what the model actually fetched (sections + touched + char count) rather than what it cited, so a + reviewer can audit the full evidence cone. + properties: + start_page: + type: integer + end_page: + type: integer + section_ids: + type: array + items: + type: string + char_count: + type: integer + description: | + Number of characters returned to the model after + clipping at `page_content_limit`. Bytes-on-the-wire, + not bytes-requested. + + PageIndexTraceEntry: + type: object + description: | + One tool call in the navigation timeline. Lets a + reviewer watch the agent's reasoning without paying for + a full streaming connection. + properties: + hop: + type: integer + description: 1-indexed turn number. + tool: + type: string + enum: [get_document_structure, get_pages, done] + reasoning: + type: string + description: | + One-sentence rationale the agent emitted alongside + the tool call. Populated only when the model + actually filled the `reasoning` field on the call. + args: + type: object + description: | + Tool-specific arguments. For `get_pages`, + `start_page` and `end_page`; for `done`, + `cited_pages`. Omitted for `get_document_structure` + (no args). + result_chars: + type: integer + description: | + Number of characters the tool result returned. + Populated for `get_pages` and (when present) + `get_document_structure`. + sections_touched: + type: array + items: + type: string + description: | + Section IDs the `get_pages` call materialised + content from. + answer: + type: string + description: | + The agent's final-turn answer string. Present only + on the terminal `done` entry; mirrors the + top-level `answer` field for debuggers reading the + trace. + note: + type: string + description: | + Optional human-readable note (e.g. "invalid range" + when the model asked for pages past the document's + end). + ReplayRequest: type: object description: | diff --git a/pkg/config/config.go b/pkg/config/config.go index d4f2723..0f0e291 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -289,6 +289,54 @@ type RetrievalConfig struct { ReRank ReRankBlock `yaml:"rerank"` Replay ReplayBlock `yaml:"replay"` Abstain AbstainBlock `yaml:"abstain"` + PageIndex PageIndexBlock `yaml:"pageindex"` +} + +// PageIndexBlock configures the PageIndex page-based agentic +// strategy and its dedicated /v1/answer/pageindex endpoint. +// +// The strategy is registered as a Strategy choice +// (retrieval.strategy: pageindex) AND is wired into the +// /v1/answer/pageindex endpoint regardless of which selection +// strategy the server runs by default. The Enabled flag controls +// the endpoint only — flipping it off does not unregister the +// strategy, so a deployment that wants the strategy available +// to /v1/query but not the dedicated answer endpoint can still +// disable the endpoint here. +// +// Defaults are tuned to match the reference PageIndex demo: 8 +// hops covers structure → 3 navigation calls → done + buffer, +// and 16,000 chars of get_pages content fits a 5-7 page excerpt +// comfortably under any flagship model's context window. +// +// Per-request overrides on /v1/answer/pageindex (max_hops, +// max_pages_per_fetch) win over this block; this block is the +// server-side default. +type PageIndexBlock struct { + // Enabled toggles the /v1/answer/pageindex endpoint. Default: + // true. When false, the endpoint returns 501. The + // PageIndexStrategy itself stays registered as a selection + // strategy regardless — disabling here only unwires the + // dedicated answer surface. + Enabled bool `yaml:"enabled"` + + // MaxHops caps the number of LLM turns one /v1/answer/pageindex + // request consumes, including the terminal done turn. Default: + // 8. Set to 0 to use the strategy's built-in default. + MaxHops int `yaml:"max_hops"` + + // PageContentLimit caps how many chars a single get_pages tool + // call returns. Default: 16000. Keeps one stray full-document + // fetch from torching the model's context window. + PageContentLimit int `yaml:"page_content_limit"` + + // Model overrides the LLM model used for the navigation loop. + // Empty means use the request's model (which itself falls back + // to the engine default). Useful when navigation should run on + // a fast/cheap model while answering benefits from a stronger + // one — though the PageIndex protocol does both in the same + // loop, so most deployments leave this blank. + Model string `yaml:"model"` } // AbstainBlock configures the Phase 2.4 abstention behaviour. @@ -563,6 +611,11 @@ func Default() Config { Enabled: true, Below: 0.4, }, + PageIndex: PageIndexBlock{ + Enabled: true, + MaxHops: 8, + PageContentLimit: 16000, + }, }, Ingest: IngestConfig{ GlobalLLMConcurrency: 12, @@ -867,6 +920,27 @@ func applyEnvOverrides(c *Config) { c.Retrieval.Abstain.Below = f } } + if v := os.Getenv("VLE_RETRIEVAL_PAGEINDEX_ENABLED"); v != "" { + switch strings.ToLower(strings.TrimSpace(v)) { + case "1", "true", "yes", "on": + c.Retrieval.PageIndex.Enabled = true + case "0", "false", "no", "off": + c.Retrieval.PageIndex.Enabled = false + } + } + if v := os.Getenv("VLE_RETRIEVAL_PAGEINDEX_MAX_HOPS"); v != "" { + if n, err := strconv.Atoi(v); err == nil && n >= 0 { + c.Retrieval.PageIndex.MaxHops = n + } + } + if v := os.Getenv("VLE_RETRIEVAL_PAGEINDEX_PAGE_CONTENT_LIMIT"); v != "" { + if n, err := strconv.Atoi(v); err == nil && n >= 0 { + c.Retrieval.PageIndex.PageContentLimit = n + } + } + if v := os.Getenv("VLE_RETRIEVAL_PAGEINDEX_MODEL"); v != "" { + c.Retrieval.PageIndex.Model = v + } } // Validate checks that required fields for the selected drivers are set. @@ -920,7 +994,7 @@ func (c Config) Validate() error { } switch c.Retrieval.Strategy { - case "single-pass", "chunked-tree", "agentic": + case "single-pass", "chunked-tree", "agentic", "pageindex": default: return fmt.Errorf("unknown retrieval.strategy: %q", c.Retrieval.Strategy) } @@ -1000,5 +1074,12 @@ func (c Config) Validate() error { return fmt.Errorf("retrieval.abstain.below must be in [0.0, 1.0], got %v", c.Retrieval.Abstain.Below) } + if c.Retrieval.PageIndex.MaxHops < 0 { + return fmt.Errorf("retrieval.pageindex.max_hops must be >= 0, got %d", c.Retrieval.PageIndex.MaxHops) + } + if c.Retrieval.PageIndex.PageContentLimit < 0 { + return fmt.Errorf("retrieval.pageindex.page_content_limit must be >= 0, got %d", c.Retrieval.PageIndex.PageContentLimit) + } + return nil } diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index e4ba3a6..43cd902 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -525,7 +525,7 @@ func TestValidateLLMDrivers(t *testing.T) { func TestValidateRetrievalStrategy(t *testing.T) { t.Parallel() - for _, s := range []string{"single-pass", "chunked-tree"} { + for _, s := range []string{"single-pass", "chunked-tree", "agentic", "pageindex"} { cfg := Default() cfg.Database.URL = "postgres://localhost/test" cfg.Retrieval.Strategy = s @@ -542,6 +542,126 @@ func TestValidateRetrievalStrategy(t *testing.T) { } } +// TestPageIndexDefaults locks in the PageIndex block's defaults so +// a regression on shipping values is loud. Endpoint enabled by +// default, 8 hops, 16K char limit. +func TestPageIndexDefaults(t *testing.T) { + t.Parallel() + cfg := Default() + if !cfg.Retrieval.PageIndex.Enabled { + t.Error("retrieval.pageindex.enabled should default to true (opt-out)") + } + if cfg.Retrieval.PageIndex.MaxHops != 8 { + t.Errorf("max_hops = %d, want 8", cfg.Retrieval.PageIndex.MaxHops) + } + if cfg.Retrieval.PageIndex.PageContentLimit != 16000 { + t.Errorf("page_content_limit = %d, want 16000", cfg.Retrieval.PageIndex.PageContentLimit) + } + if cfg.Retrieval.PageIndex.Model != "" { + t.Errorf("model default should be empty (inherit), got %q", cfg.Retrieval.PageIndex.Model) + } +} + +// TestPageIndexEnvOverride exercises every env knob the PageIndex +// block exposes. +func TestPageIndexEnvOverride(t *testing.T) { + prevEnabled := os.Getenv("VLE_RETRIEVAL_PAGEINDEX_ENABLED") + prevHops := os.Getenv("VLE_RETRIEVAL_PAGEINDEX_MAX_HOPS") + prevLimit := os.Getenv("VLE_RETRIEVAL_PAGEINDEX_PAGE_CONTENT_LIMIT") + prevModel := os.Getenv("VLE_RETRIEVAL_PAGEINDEX_MODEL") + defer func() { + os.Setenv("VLE_RETRIEVAL_PAGEINDEX_ENABLED", prevEnabled) + os.Setenv("VLE_RETRIEVAL_PAGEINDEX_MAX_HOPS", prevHops) + os.Setenv("VLE_RETRIEVAL_PAGEINDEX_PAGE_CONTENT_LIMIT", prevLimit) + os.Setenv("VLE_RETRIEVAL_PAGEINDEX_MODEL", prevModel) + }() + + os.Setenv("VLE_RETRIEVAL_PAGEINDEX_ENABLED", "false") + os.Setenv("VLE_RETRIEVAL_PAGEINDEX_MAX_HOPS", "12") + os.Setenv("VLE_RETRIEVAL_PAGEINDEX_PAGE_CONTENT_LIMIT", "32000") + os.Setenv("VLE_RETRIEVAL_PAGEINDEX_MODEL", "gemini-2.0-flash") + + cfg := Default() + applyEnvOverrides(&cfg) + + if cfg.Retrieval.PageIndex.Enabled { + t.Error("VLE_RETRIEVAL_PAGEINDEX_ENABLED=false should disable") + } + if cfg.Retrieval.PageIndex.MaxHops != 12 { + t.Errorf("max_hops = %d, want 12", cfg.Retrieval.PageIndex.MaxHops) + } + if cfg.Retrieval.PageIndex.PageContentLimit != 32000 { + t.Errorf("page_content_limit = %d, want 32000", cfg.Retrieval.PageIndex.PageContentLimit) + } + if cfg.Retrieval.PageIndex.Model != "gemini-2.0-flash" { + t.Errorf("model = %q, want gemini-2.0-flash", cfg.Retrieval.PageIndex.Model) + } +} + +// TestPageIndexEnvOverrideEnable: toggle on from an explicitly +// disabled state. +func TestPageIndexEnvOverrideEnable(t *testing.T) { + prev := os.Getenv("VLE_RETRIEVAL_PAGEINDEX_ENABLED") + defer os.Setenv("VLE_RETRIEVAL_PAGEINDEX_ENABLED", prev) + + cfg := Default() + cfg.Retrieval.PageIndex.Enabled = false + os.Setenv("VLE_RETRIEVAL_PAGEINDEX_ENABLED", "true") + applyEnvOverrides(&cfg) + if !cfg.Retrieval.PageIndex.Enabled { + t.Error("VLE_RETRIEVAL_PAGEINDEX_ENABLED=true should enable from disabled") + } +} + +// TestPageIndexEnvOverrideRejectsBad: garbled numerics preserve the +// default rather than silently zeroing the cap. +func TestPageIndexEnvOverrideRejectsBad(t *testing.T) { + prevHops := os.Getenv("VLE_RETRIEVAL_PAGEINDEX_MAX_HOPS") + prevLimit := os.Getenv("VLE_RETRIEVAL_PAGEINDEX_PAGE_CONTENT_LIMIT") + defer func() { + os.Setenv("VLE_RETRIEVAL_PAGEINDEX_MAX_HOPS", prevHops) + os.Setenv("VLE_RETRIEVAL_PAGEINDEX_PAGE_CONTENT_LIMIT", prevLimit) + }() + + os.Setenv("VLE_RETRIEVAL_PAGEINDEX_MAX_HOPS", "abc") + os.Setenv("VLE_RETRIEVAL_PAGEINDEX_PAGE_CONTENT_LIMIT", "not-a-number") + + cfg := Default() + applyEnvOverrides(&cfg) + if cfg.Retrieval.PageIndex.MaxHops != 8 { + t.Errorf("garbage max_hops env should preserve default 8, got %d", cfg.Retrieval.PageIndex.MaxHops) + } + if cfg.Retrieval.PageIndex.PageContentLimit != 16000 { + t.Errorf("garbage page_content_limit env should preserve default, got %d", cfg.Retrieval.PageIndex.PageContentLimit) + } +} + +// TestValidatePageIndexNegatives: negatives rejected by Validate. +func TestValidatePageIndexNegatives(t *testing.T) { + t.Parallel() + cfg := Default() + cfg.Database.URL = "postgres://localhost/test" + cfg.Retrieval.PageIndex.MaxHops = -1 + if err := cfg.Validate(); err == nil { + t.Error("negative max_hops should fail validation") + } + + cfg2 := Default() + cfg2.Database.URL = "postgres://localhost/test" + cfg2.Retrieval.PageIndex.PageContentLimit = -1 + if err := cfg2.Validate(); err == nil { + t.Error("negative page_content_limit should fail validation") + } + + cfg3 := Default() + cfg3.Database.URL = "postgres://localhost/test" + cfg3.Retrieval.PageIndex.MaxHops = 0 + cfg3.Retrieval.PageIndex.PageContentLimit = 0 + if err := cfg3.Validate(); err != nil { + t.Errorf("zero values should pass (defaults applied at runtime): %v", err) + } +} + func TestValidateTLS(t *testing.T) { t.Parallel() diff --git a/pkg/retrieval/pageindex_strategy.go b/pkg/retrieval/pageindex_strategy.go new file mode 100644 index 0000000..6df2ed2 --- /dev/null +++ b/pkg/retrieval/pageindex_strategy.go @@ -0,0 +1,1083 @@ +package retrieval + +import ( + "context" + "encoding/json" + "fmt" + "log" + "sort" + "strconv" + "strings" + + "github.com/hallelx2/llmgate" + + "github.com/hallelx2/vectorless-engine/pkg/tree" +) + +// PageIndexStrategy is a page-based agentic retrieval loop modelled on +// PageIndex's three-tool reasoning protocol. +// +// The model navigates by PAGE RANGE rather than by section ID. Each +// turn it emits one of: +// +// - get_document_structure() — returns the document's TOC tree +// (titles + page ranges only, no text), so the model can pick +// which pages to look at. +// - get_pages(start_page, end_page) — returns the concatenated text +// of every section whose [page_start, page_end] overlaps the +// requested range, clipped to PageContentLimit chars. +// - done(answer, cited_pages, reasoning) — terminates with the final +// answer string and the list of page ranges the answer relies on. +// +// This is a SUPERSET of the older AgenticStrategy's protocol: the +// loop owns the answer, not just the selection. SelectWithCost +// surfaces both the picked section IDs (the intersection of every +// cited page range with the document's section map) and the literal +// answer string via Result.Reasoning. The /v1/answer/pageindex +// endpoint reads the answer; the legacy /v1/query callers still get +// a section list. +// +// # Protocol choice +// +// PageIndex's original demo wires the model via the OpenAI Agents +// SDK's native tool-calling surface. llmgate v0.2.0 declares ToolDef +// / ToolCall as scaffolding but does not populate ToolCalls on +// responses, so this strategy uses the same JSON-action text +// protocol AgenticStrategy already proved (see pkg/retrieval/agentic.go). +// When llmgate wires native tool calling the surface here is the +// same — only the request/response plumbing changes. +type PageIndexStrategy struct { + // LLM is the shared client used for every turn. + LLM llmgate.Client + + // TOC is the source for get_document_structure observations. + // Implementations read documents.toc_tree (the column PR-A adds) + // or synthesise a tree from the section list. Nil triggers the + // built-in fallback that mirrors the section tree. + TOC TOCProvider + + // PageLoader materialises section content for get_pages + // observations. Nil disables the get_pages tool — the model + // would then only see structure observations. + PageLoader PageContentLoader + + // MaxHops caps the number of LLM turns one Select consumes, + // including the terminal "done" turn. Zero means use + // defaultPageIndexMaxHops. + MaxHops int + + // PageContentLimit caps how many chars a single get_pages + // observation returns. Zero means use defaultPageContentLimit. + // Limits like this keep one stray request from torching the + // context window: a 50-page get_pages on an SEC filing can + // easily blow past 200K chars otherwise. + PageContentLimit int + + // ModelOverride, if non-empty, replaces the budget's ModelName + // for every turn. Useful for routing the navigation loop to a + // cheaper or faster model than the rest of the engine. + ModelOverride string + + // OnEvent, when non-nil, is invoked synchronously once per + // tool call so callers (e.g. the /v1/answer/pageindex SSE + // handler) can stream the navigation in real time. The hook + // runs inside the loop, after the tool result is computed but + // before the next LLM hop. Implementations MUST be cheap and + // MUST NOT block; a blocked hook stalls retrieval. + OnEvent func(PageIndexEvent) +} + +// PageIndexEvent is a single observable step in the strategy's +// navigation loop. Consumers convert these to whatever wire format +// they need (SSE, gRPC stream, console log). +// +// Type carries the tool tag (get_document_structure / get_pages / +// done). For get_pages, StartPage/EndPage/CharCount/SectionIDs are +// populated; for done, Answer + CitedPages are populated. The Hop +// field is the 1-indexed turn number so consumers can interleave +// hops from concurrent requests. +type PageIndexEvent struct { + Hop int `json:"hop"` + Type string `json:"type"` + Reasoning string `json:"reasoning,omitempty"` + StartPage int `json:"start_page,omitempty"` + EndPage int `json:"end_page,omitempty"` + CharCount int `json:"char_count,omitempty"` + SectionIDs []tree.SectionID `json:"section_ids,omitempty"` + Answer string `json:"answer,omitempty"` + CitedPages [][2]int `json:"cited_pages,omitempty"` + Note string `json:"note,omitempty"` +} + +// defaultPageIndexMaxHops bounds the loop. Eight turns is enough for +// structure → 3 get_pages → done with two retry hops on stray bad +// JSON, while keeping latency and cost predictable. The reference +// PageIndex demo converges in 3-5 hops on typical questions. +const defaultPageIndexMaxHops = 8 + +// defaultPageContentLimit is the per-call chars cap. 16,000 chars +// is roughly 4K tokens at GPT/Claude tokenisers — comfortably below +// any flagship model's context but enough text for a 5-7 page +// excerpt. Matches PageIndex's reference behaviour. +const defaultPageContentLimit = 16000 + +// strategyNamePageIndex is the stable identifier for config +// (retrieval.strategy: pageindex) and telemetry. +const strategyNamePageIndex = "pageindex" + +// Compile-time interface checks. +var ( + _ Strategy = (*PageIndexStrategy)(nil) + _ CostStrategy = (*PageIndexStrategy)(nil) +) + +// TOCProvider returns a JSON document-structure tree for the LLM's +// get_document_structure tool. Implementations should return a +// pretty-printable JSON array/object representing titles + page +// ranges. Nodes that carry full text MUST be stripped before return — +// the model is supposed to navigate by structure first and pull text +// only via get_pages. +// +// Returning (nil, ErrNoTOC) signals "no TOC available; fall back to +// the synthesised view". Other errors propagate. +type TOCProvider interface { + GetTOC(ctx context.Context, docID tree.DocumentID) ([]byte, error) +} + +// PageContentLoader returns the raw content bytes for one section, +// keyed by its ContentRef. Strategies that need to materialise text +// at run-time depend on this rather than on a concrete storage +// driver — same shape as ContentFetcher; we keep them distinct so +// the two callers (agentic / pageindex) can be wired independently +// in main.go. +type PageContentLoader interface { + Load(ctx context.Context, ref string) ([]byte, error) +} + +// ErrNoTOC signals that no LLM-built TOC tree has been persisted for +// the document yet. The strategy treats it as a graceful-degrade +// signal: it synthesises a TOC view from the section list rather +// than failing the request. Pre-merge of PR-A (which adds +// documents.toc_tree) every request will degrade through this path. +var ErrNoTOC = fmt.Errorf("retrieval: no TOC tree persisted for document") + +// NewPageIndexStrategy constructs a PageIndexStrategy with sensible +// defaults. The TOC + PageLoader are nil here; the engine wires them +// in main.go from the DB pool + storage backend. Tests pass scripted +// implementations directly. +func NewPageIndexStrategy(client llmgate.Client) *PageIndexStrategy { + return &PageIndexStrategy{ + LLM: client, + MaxHops: defaultPageIndexMaxHops, + PageContentLimit: defaultPageContentLimit, + } +} + +// Name implements Strategy. +func (s *PageIndexStrategy) Name() string { return strategyNamePageIndex } + +// Select implements Strategy. +func (s *PageIndexStrategy) Select(ctx context.Context, t *tree.Tree, query string, budget ContextBudget) ([]tree.SectionID, error) { + r, err := s.SelectWithCost(ctx, t, query, budget) + if err != nil { + return nil, err + } + return r.SelectedIDs, nil +} + +// SelectWithCost implements CostStrategy. +// +// The returned Result populates: +// +// - SelectedIDs: section IDs whose [PageStart,PageEnd] overlaps any +// cited page range. This keeps the per-section-id contract for +// callers (/v1/query, /v1/answer) that don't yet know about pages. +// - Reasoning: the agent's final answer string (the "answer" field +// of the done action). /v1/answer/pageindex reads this directly +// and skips synthesis. +// - PagesRead: an entry per get_pages call. +// - HopsTaken / Usage / TraceToken: standard. +func (s *PageIndexStrategy) SelectWithCost(ctx context.Context, t *tree.Tree, query string, budget ContextBudget) (*Result, error) { + if t == nil || t.Root == nil { + return &Result{}, nil + } + + model := s.ModelOverride + if model == "" { + model = budget.ModelName + } + maxHops := s.MaxHops + if maxHops <= 0 { + maxHops = defaultPageIndexMaxHops + } + pageLimit := s.PageContentLimit + if pageLimit <= 0 { + pageLimit = defaultPageContentLimit + } + + // Pre-flatten the tree into an ordinal section list ordered by + // page. The get_pages observation iterates this twice per call; + // pre-computing keeps the inner loop O(N) instead of O(N · depth). + sections := flattenSectionsByPage(t) + maxPage := maxKnownPage(sections) + + msgs := []llmgate.Message{ + {Role: llmgate.RoleSystem, Content: pageIndexSystemPrompt}, + {Role: llmgate.RoleUser, Content: s.initialUserPrompt(t, query, maxPage)}, + } + + var ( + totalUsage Usage + hopsTaken int + pagesRead []PageReadEntry + + // finalAnswer / finalCitedPages / finalReasoning are populated + // when the model emits a done action. citedRanges drives the + // final SelectedIDs (section IDs overlapping any cited range). + finalAnswer string + finalReasoning string + citedRanges []pageRange + ) + + for hop := 0; hop < maxHops; hop++ { + req := llmgate.Request{ + Model: model, + Messages: msgs, + MaxTokens: 1536, // answers can be longer than agentic's selections + Temperature: 0, + } + resp, err := s.LLM.Complete(ctx, req) + if err != nil { + return nil, fmt.Errorf("pageindex hop %d: %w", hop+1, err) + } + hopsTaken++ + totalUsage.Add(Usage{ + InputTokens: resp.Usage.InputTokens, + OutputTokens: resp.Usage.OutputTokens, + TotalTokens: resp.Usage.TotalTokens, + CostUSD: resp.Usage.CostUSD, + LLMCalls: 1, + }) + + // Record the assistant turn before parsing so the next prompt + // has the model's own context (matches AgenticStrategy). + msgs = append(msgs, llmgate.Message{ + Role: llmgate.RoleAssistant, + Content: resp.Content, + }) + + action, parseErr := ParsePageIndexAction(resp.Content) + if parseErr != nil { + log.Printf("retrieval: pageindex hop %d action parse failed: %v", hop+1, parseErr) + msgs = append(msgs, llmgate.Message{ + Role: llmgate.RoleUser, + Content: pageIndexParseRetryPrompt, + }) + continue + } + + switch action.Action { + case pageActionDone: + finalAnswer = strings.TrimSpace(action.Answer) + finalReasoning = strings.TrimSpace(action.Reasoning) + citedRanges = normaliseRanges(action.CitedPages, maxPage) + selectedIDs := sectionsOverlapping(sections, citedRanges) + s.emit(PageIndexEvent{ + Hop: hopsTaken, + Type: pageActionDone, + Reasoning: finalReasoning, + Answer: finalAnswer, + CitedPages: action.CitedPages, + }) + return &Result{ + SelectedIDs: selectedIDs, + Reasoning: finalAnswer, // /v1/answer/pageindex reads this + ModelUsed: model, + Usage: totalUsage, + HopsTaken: hopsTaken, + PagesRead: pagesRead, + TraceToken: computePageIndexTraceToken(t.DocumentID, model, citedRanges), + }, nil + + case pageActionStructure: + obs := s.renderStructure(ctx, t) + msgs = append(msgs, llmgate.Message{ + Role: llmgate.RoleUser, + Content: wrapPageObservation("get_document_structure", obs), + }) + s.emit(PageIndexEvent{ + Hop: hopsTaken, + Type: pageActionStructure, + Reasoning: action.Reasoning, + CharCount: len(obs), + }) + + case pageActionGetPages: + start, end, ok := clampRange(action.StartPage, action.EndPage, maxPage) + if !ok { + msgs = append(msgs, llmgate.Message{ + Role: llmgate.RoleUser, + Content: wrapPageObservation("get_pages", + fmt.Sprintf("invalid range start=%d end=%d (document has %d pages). Pages are 1-indexed inclusive.", + action.StartPage, action.EndPage, maxPage)), + }) + s.emit(PageIndexEvent{ + Hop: hopsTaken, + Type: pageActionGetPages, + Reasoning: action.Reasoning, + StartPage: action.StartPage, + EndPage: action.EndPage, + Note: "invalid range", + }) + continue + } + text, sectionIDs := s.renderPages(ctx, sections, start, end, pageLimit) + pagesRead = append(pagesRead, PageReadEntry{ + StartPage: start, + EndPage: end, + SectionIDs: sectionIDs, + CharCount: len(text), + }) + msgs = append(msgs, llmgate.Message{ + Role: llmgate.RoleUser, + Content: wrapPageObservation("get_pages", + fmt.Sprintf("pages %d-%d (%d sections, %d chars):\n%s", start, end, len(sectionIDs), len(text), text)), + }) + s.emit(PageIndexEvent{ + Hop: hopsTaken, + Type: pageActionGetPages, + Reasoning: action.Reasoning, + StartPage: start, + EndPage: end, + CharCount: len(text), + SectionIDs: sectionIDs, + }) + + default: + msgs = append(msgs, llmgate.Message{ + Role: llmgate.RoleUser, + Content: wrapPageObservation(action.Action, + fmt.Sprintf("unsupported tool %q. Use one of: get_document_structure, get_pages, done.", action.Action)), + }) + s.emit(PageIndexEvent{ + Hop: hopsTaken, + Type: action.Action, + Note: "unsupported tool", + }) + } + } + + // Ran out of hops without a done action. Force a terminal turn: + // give the model one final chance with an explicit "you MUST emit + // done now" prompt. If that also fails to parse or the model + // ignores the rule, we return whatever pages have been read so + // the caller at least sees the navigation footprint and an empty + // answer rather than a 500. + finalAnswer, finalReasoning, citedRanges = s.forceDone(ctx, &msgs, &totalUsage, &hopsTaken, model, maxPage) + selectedIDs := sectionsOverlapping(sections, citedRanges) + log.Printf("retrieval: pageindex strategy hit max_hops=%d; forced done", maxHops) + _ = finalReasoning + return &Result{ + SelectedIDs: selectedIDs, + Reasoning: finalAnswer, + ModelUsed: model, + Usage: totalUsage, + HopsTaken: hopsTaken, + PagesRead: pagesRead, + TraceToken: computePageIndexTraceToken(t.DocumentID, model, citedRanges), + }, nil +} + +// emit dispatches one event to the registered OnEvent hook, if any. +// Hooks run synchronously inside the navigation loop and MUST be +// cheap; callers that need to do I/O should buffer first and write +// outside the strategy's critical path. +func (s *PageIndexStrategy) emit(ev PageIndexEvent) { + if s.OnEvent != nil { + s.OnEvent(ev) + } +} + +// initialUserPrompt is the very first user turn. It explains the +// task, tells the model which page range exists ("the document has N +// pages"), and reminds it of the action protocol. Mirrors +// AgenticStrategy.initialUserPrompt. +func (s *PageIndexStrategy) initialUserPrompt(t *tree.Tree, query string, maxPage int) string { + var b strings.Builder + if t.Title != "" { + b.WriteString("Document: ") + b.WriteString(t.Title) + b.WriteString("\n") + } + if maxPage > 0 { + fmt.Fprintf(&b, "Pages: 1-%d (inclusive)\n", maxPage) + } else { + b.WriteString("Pages: unknown (this document carries no page metadata; rely on get_document_structure for navigation hints).\n") + } + b.WriteString("\nUser query:\n") + b.WriteString(query) + b.WriteString("\n\nReply with a JSON action. The tools you may use are:\n") + b.WriteString(pageIndexActionHelp) + return b.String() +} + +// renderStructure produces the get_document_structure observation. +// First tries the persisted TOC tree (PR-A's documents.toc_tree +// JSONB); if that's nil or errors, falls back to a synthesised view +// derived from the section list. The fallback keeps this strategy +// useful even before PR-A merges. +func (s *PageIndexStrategy) renderStructure(ctx context.Context, t *tree.Tree) string { + if s.TOC != nil { + raw, err := s.TOC.GetTOC(ctx, t.DocumentID) + if err == nil && len(raw) > 0 { + return string(raw) + } + // Log and degrade — the strategy must keep going. + if err != nil { + log.Printf("retrieval: pageindex TOC fetch failed (degrading to synthesised view): %v", err) + } + } + return synthesiseTOC(t) +} + +// renderPages assembles the get_pages observation: concatenates the +// content of every section whose page range overlaps [start, end], +// clipped to pageLimit. Returns the rendered text plus the list of +// section IDs that contributed, in page order. SectionIDs feeds back +// into the PageReadEntry so callers can audit which sections the +// model actually read. +func (s *PageIndexStrategy) renderPages(ctx context.Context, sections []sectionPageEntry, start, end, pageLimit int) (string, []tree.SectionID) { + if s.PageLoader == nil { + // Without a loader we can still emit a useful observation + // from titles + summaries, so the model can keep navigating. + return s.renderPagesNoLoader(sections, start, end, pageLimit) + } + + var ( + b strings.Builder + sectionIDs []tree.SectionID + written int + ) + for _, sec := range sections { + if !overlaps(sec.start, sec.end, start, end) { + continue + } + sectionIDs = append(sectionIDs, sec.id) + + // Header line so the model can ground its citations to a + // specific section + page range. + header := fmt.Sprintf("\n--- section_id=%s title=%q pages=%d-%d ---\n", sec.id, sec.title, sec.start, sec.end) + remaining := pageLimit - written + if remaining <= 0 { + break + } + if len(header) > remaining { + b.WriteString(header[:remaining]) + written += remaining + break + } + b.WriteString(header) + written += len(header) + + // Body — preferred source: storage via PageLoader. Fall back + // to the section summary when there's no ContentRef (internal + // nodes) or the loader errors. + body := s.loadSectionBody(ctx, sec) + remaining = pageLimit - written + if remaining <= 0 { + break + } + if len(body) > remaining { + b.WriteString(body[:remaining]) + written += remaining + break + } + b.WriteString(body) + written += len(body) + } + return b.String(), sectionIDs +} + +// renderPagesNoLoader is the degraded-mode get_pages observation +// used when the strategy has no PageLoader (e.g. in tests, or when +// storage is wired but momentarily unavailable). Titles + summaries +// still let the model triangulate which range to ask about next. +func (s *PageIndexStrategy) renderPagesNoLoader(sections []sectionPageEntry, start, end, pageLimit int) (string, []tree.SectionID) { + var ( + b strings.Builder + sectionIDs []tree.SectionID + ) + for _, sec := range sections { + if !overlaps(sec.start, sec.end, start, end) { + continue + } + sectionIDs = append(sectionIDs, sec.id) + fmt.Fprintf(&b, "section_id=%s title=%q pages=%d-%d summary=%q\n", sec.id, sec.title, sec.start, sec.end, sec.summary) + if b.Len() >= pageLimit { + break + } + } + out := b.String() + if len(out) > pageLimit { + out = out[:pageLimit] + } + return out, sectionIDs +} + +func (s *PageIndexStrategy) loadSectionBody(ctx context.Context, sec sectionPageEntry) string { + if sec.contentRef == "" { + if sec.summary != "" { + return fmt.Sprintf("(summary, no content loaded)\n%s", sec.summary) + } + return "" + } + data, err := s.PageLoader.Load(ctx, sec.contentRef) + if err != nil { + log.Printf("retrieval: pageindex load failed for section %s: %v", sec.id, err) + if sec.summary != "" { + return fmt.Sprintf("(content load failed: %v; using summary)\n%s", err, sec.summary) + } + return fmt.Sprintf("(content load failed: %v)", err) + } + return strings.TrimSpace(string(data)) +} + +// forceDone runs one final hop with a hard "emit done NOW" prompt so +// the loop can exit gracefully on a stubborn model. Returns +// (answer, reasoning, cited_ranges). When the model still doesn't +// emit a valid done action, the empty values flow back and the +// caller sees a hop-capped Result. +func (s *PageIndexStrategy) forceDone(ctx context.Context, msgs *[]llmgate.Message, totalUsage *Usage, hopsTaken *int, model string, maxPage int) (string, string, []pageRange) { + *msgs = append(*msgs, llmgate.Message{ + Role: llmgate.RoleUser, + Content: "You have used your tool-call budget. Reply NOW with one JSON object: {\"tool\":\"done\",\"answer\":\" max { + max = s.end + } + } + return max +} + +// overlaps reports whether two inclusive ranges intersect. +func overlaps(aStart, aEnd, bStart, bEnd int) bool { + if aStart <= 0 || aEnd <= 0 || bStart <= 0 || bEnd <= 0 { + return false + } + return aStart <= bEnd && bStart <= aEnd +} + +// clampRange validates a model-emitted [start,end] against the +// document's actual page range. Returns (start, end, ok=false) when +// the range is unusable (zero pages, inverted, or entirely past the +// document). When the range partially overlaps the document the ends +// are clamped to [1, maxPage] and the call returns ok=true so the +// model can keep navigating from a slightly-corrected range rather +// than spinning on the same error. +func clampRange(start, end, maxPage int) (int, int, bool) { + if start <= 0 && end <= 0 { + return 0, 0, false + } + if start <= 0 { + start = 1 + } + if end <= 0 { + end = start + } + if start > end { + start, end = end, start + } + if maxPage > 0 { + if start > maxPage { + return start, end, false + } + if end > maxPage { + end = maxPage + } + } + return start, end, true +} + +// normaliseRanges collapses raw model-emitted ranges (which may be +// flipped, zero-pages, or duplicated) into a sorted, deduplicated +// list of valid inclusive ranges clamped to [1,maxPage]. Bad ranges +// are silently dropped — the trace token must compute over a stable +// canonical form regardless of how the model orders its citations. +func normaliseRanges(raw [][2]int, maxPage int) []pageRange { + if len(raw) == 0 { + return nil + } + seen := make(map[pageRange]struct{}, len(raw)) + out := make([]pageRange, 0, len(raw)) + for _, r := range raw { + s, e, ok := clampRange(r[0], r[1], maxPage) + if !ok { + continue + } + pr := pageRange{Start: s, End: e} + if _, dup := seen[pr]; dup { + continue + } + seen[pr] = struct{}{} + out = append(out, pr) + } + sort.Slice(out, func(i, j int) bool { + if out[i].Start != out[j].Start { + return out[i].Start < out[j].Start + } + return out[i].End < out[j].End + }) + return out +} + +// sectionsOverlapping returns the IDs of every section whose +// [PageStart, PageEnd] overlaps any of the cited ranges. Preserves +// document order (because sections is page-sorted) and deduplicates. +// This is the bridge that turns the model's page-based citations +// into the section-ID list every other endpoint already expects. +func sectionsOverlapping(sections []sectionPageEntry, ranges []pageRange) []tree.SectionID { + if len(ranges) == 0 || len(sections) == 0 { + return nil + } + seen := make(map[tree.SectionID]struct{}, len(sections)) + out := make([]tree.SectionID, 0, len(sections)) + for _, sec := range sections { + for _, r := range ranges { + if overlaps(sec.start, sec.end, r.Start, r.End) { + if _, dup := seen[sec.id]; !dup { + seen[sec.id] = struct{}{} + out = append(out, sec.id) + } + break + } + } + } + return out +} + +// computePageIndexTraceToken builds the replay token for a +// PageIndex run. Page-based strategies don't pick section IDs the +// way agentic/single-pass do, so the token's "identity" inputs are +// the document, the model, and the sorted cited page ranges. Two +// runs that cite the same pages (even via different navigation +// paths) collapse to the same token — same property as +// ComputeTraceToken offers for section IDs. +// +// The hashing primitive (sha256, NUL separators, lowercase hex) is +// reused so /v1/replay handles both shapes uniformly. +func computePageIndexTraceToken(docID tree.DocumentID, model string, ranges []pageRange) string { + strs := make([]string, len(ranges)) + for i, r := range ranges { + strs[i] = r.String() + } + sort.Strings(strs) + // Trace-token IDs are constructed from sorted page-range strings + // rather than section IDs. We feed them through the existing + // ComputeTraceToken helper for shape consistency — its + // sort-then-hash semantics happens to be exactly what we want + // here too. The strategy's stable identifier ("pageindex") is + // folded into the "model" position so a page-based run and a + // section-based run on the same doc/model don't collide. + tagged := make([]tree.SectionID, len(strs)) + for i, s := range strs { + tagged[i] = tree.SectionID("p:" + s) + } + return ComputeTraceToken(docID, traceDocVersionV1+"-pages", strategyNamePageIndex+":"+model, tagged) +} + +// --- action protocol --- + +// PageIndexAction is the LLM-chosen next step in the loop. The model +// emits one of these per turn as a JSON object on the +// 'tool' tag. The Action field is uppercase-tolerant on input; +// ParsePageIndexAction lowercases before dispatch. +type PageIndexAction struct { + // Action is the dispatch tag (alias: tool). One of: + // get_document_structure, get_pages, done. + Action string `json:"tool"` + + // ActionAlt lets the model use "action" instead of "tool". Some + // providers struggle to consistently emit the same key when both + // shapes are documented. We accept either; ActionAlt wins iff + // Action is empty. + ActionAlt string `json:"action,omitempty"` + + // StartPage / EndPage are the inclusive 1-indexed range a + // get_pages call targets. + StartPage int `json:"start_page,omitempty"` + EndPage int `json:"end_page,omitempty"` + + // Pages is an alternate shape some models reach for: a + // "5-7"-style string. ParsePageIndexAction splits it into + // StartPage/EndPage when present. + Pages string `json:"pages,omitempty"` + + // Answer is the natural-language answer for a done action. + Answer string `json:"answer,omitempty"` + + // CitedPages is the list of inclusive page ranges the answer + // relies on for a done action. Each entry is [start, end]; a + // single page can be expressed as [5,5]. + CitedPages [][2]int `json:"cited_pages,omitempty"` + + // Reasoning is the per-call explanation the system prompt + // asks the model to emit. Surfaced into the reasoning_trace + // when the endpoint is called with ?reasoning=true. + Reasoning string `json:"reasoning,omitempty"` +} + +// Action tag constants. Mirrors PageIndex's reference SDK tool +// names so prompt-engineering work over there translates over. +const ( + pageActionStructure = "get_document_structure" + pageActionGetPages = "get_pages" + pageActionDone = "done" +) + +// pageIndexParseRetryPrompt nudges the model back onto the +// JSON-action protocol after a parse failure. Aligned with +// AgenticStrategy's retry path — same wording so behaviour stays +// consistent. +const pageIndexParseRetryPrompt = "Your last reply was not a valid JSON tool call. Reply with EXACTLY one JSON object: {\"tool\":\"get_document_structure|get_pages|done\", ...}. No prose, no markdown fences." + +// ParsePageIndexAction is the tolerant JSON decoder for the +// page-based protocol. Behaviour mirrors ParseAction (the older +// agentic protocol's parser): strip code fences, peel prose +// wrappers, isolate the first balanced JSON object, and +// case-fold the action tag. +// +// Additional tolerance vs ParseAction: +// - "tool" or "action" can name the action. +// - Pages can be a "5-7" string instead of explicit +// start_page/end_page. +// - cited_pages can be either [[5,7],[10,10]] (preferred) or +// ["5-7","10"] (tolerated). +func ParsePageIndexAction(raw string) (PageIndexAction, error) { + raw = strings.TrimSpace(raw) + if raw == "" { + return PageIndexAction{}, fmt.Errorf("empty response") + } + if strings.HasPrefix(raw, "```") { + if i := strings.Index(raw, "\n"); i >= 0 { + raw = raw[i+1:] + } + raw = strings.TrimSuffix(raw, "```") + raw = strings.TrimSpace(raw) + } + if i := strings.Index(raw, "{"); i > 0 { + raw = raw[i:] + } + if j := strings.LastIndex(raw, "}"); j >= 0 && j < len(raw)-1 { + raw = raw[:j+1] + } + + // We decode in two passes so a flexibly-typed cited_pages field + // (either [[1,2],[5,7]] or ["1-2","5-7"]) doesn't tank the whole + // action. + // + // Pass 1: decode into a map[string]json.RawMessage so each field + // can be parsed independently. This is more tolerant than a + // single-pass typed decode because a single bad field doesn't + // invalidate the rest of the JSON. + var fields map[string]json.RawMessage + if err := json.Unmarshal([]byte(raw), &fields); err != nil { + return PageIndexAction{}, fmt.Errorf("decode pageindex action: %w", err) + } + + var a PageIndexAction + if v, ok := fields["tool"]; ok { + _ = json.Unmarshal(v, &a.Action) + } + if a.Action == "" { + if v, ok := fields["action"]; ok { + _ = json.Unmarshal(v, &a.Action) + } + } + a.Action = strings.ToLower(strings.TrimSpace(a.Action)) + if a.Action == "" { + return PageIndexAction{}, fmt.Errorf("missing 'tool' or 'action' field") + } + + if v, ok := fields["start_page"]; ok { + _ = json.Unmarshal(v, &a.StartPage) + } + if v, ok := fields["end_page"]; ok { + _ = json.Unmarshal(v, &a.EndPage) + } + if v, ok := fields["pages"]; ok { + _ = json.Unmarshal(v, &a.Pages) + } + if v, ok := fields["answer"]; ok { + _ = json.Unmarshal(v, &a.Answer) + } + if v, ok := fields["reasoning"]; ok { + _ = json.Unmarshal(v, &a.Reasoning) + } + + // cited_pages: try the typed shape first ([[1,2],[5,7]]); fall + // back to the string-shape (["1-2","5-7"]) when the typed + // decode fails or is empty. + if v, ok := fields["cited_pages"]; ok && len(v) > 0 { + if err := json.Unmarshal(v, &a.CitedPages); err != nil || len(a.CitedPages) == 0 { + a.CitedPages = nil + var asStrings []string + if err := json.Unmarshal(v, &asStrings); err == nil { + for _, p := range asStrings { + s, e, ok := parsePageRangeString(p) + if !ok { + continue + } + a.CitedPages = append(a.CitedPages, [2]int{s, e}) + } + } + } + } + + // Pages-string → start/end normalisation. Only fills in when + // the typed fields weren't already populated. + if a.Pages != "" && a.StartPage == 0 && a.EndPage == 0 { + s, e, ok := parsePageRangeString(a.Pages) + if ok { + a.StartPage = s + a.EndPage = e + } + } + + return a, nil +} + +// parsePageRangeString parses "5", "5-7", or "5,7" (the loosest +// shape the model is allowed to emit). Returns (start, end, true) +// on success; (0, 0, false) otherwise. "5,7" is treated as +// start=5,end=7 (we don't support multi-range here — that's what +// cited_pages is for). +func parsePageRangeString(s string) (int, int, bool) { + s = strings.TrimSpace(s) + if s == "" { + return 0, 0, false + } + sep := -1 + for i, c := range s { + if c == '-' || c == ',' { + sep = i + break + } + } + if sep < 0 { + n, err := strconv.Atoi(s) + if err != nil || n <= 0 { + return 0, 0, false + } + return n, n, true + } + a, err1 := strconv.Atoi(strings.TrimSpace(s[:sep])) + b, err2 := strconv.Atoi(strings.TrimSpace(s[sep+1:])) + if err1 != nil || err2 != nil || a <= 0 || b <= 0 { + return 0, 0, false + } + return a, b, true +} + +// wrapPageObservation formats a tool's result so the model can +// clearly see which tool produced which observation. Same shape as +// AgenticStrategy.wrapObservation but with tool-call wording. +func wrapPageObservation(tool, body string) string { + return fmt.Sprintf("Tool result (%s):\n%s\n\nNext JSON tool call?", tool, body) +} + +// --- system prompt --- + +// pageIndexSystemPrompt instructs the model on the navigation loop. +// The wording is a faithful port of the reference PageIndex demo's +// AGENT_SYSTEM_PROMPT (see PageIndex/examples/agentic_vectorless_rag_demo.py:44-52), +// adapted to the JSON-action protocol vle uses in lieu of native +// llmgate tool calling. +// +// Key invariants that show up in tests: +// - Always call get_document_structure first. +// - Use tight page ranges; never fetch the whole document. +// - Emit a one-sentence reason before each tool call. +// - Answer only from tool output (no priors). +// - End with a done action carrying answer + cited_pages. +const pageIndexSystemPrompt = `You are a document QA assistant navigating a paginated document. + +TOOL USE PROTOCOL: +- Reply with EXACTLY one JSON object per turn. No prose, no markdown fences. +- Always call get_document_structure first to see titles + page ranges. +- Call get_pages with TIGHT page ranges (e.g. {"tool":"get_pages","start_page":5,"end_page":7}). Never fetch the whole document. +- Before each tool call, populate the "reasoning" field with ONE short sentence explaining why you're calling it. +- When you have enough evidence, emit done with the natural-language answer, the page ranges you relied on, and a one-line reasoning trace. + +RULES: +- Answer based ONLY on tool output. Do not invent facts. +- Cite by page range, not by section title. +- Be concise. Single-paragraph answers when possible. +- If nothing in the document answers the query, emit done with answer="The document does not address this query." and an empty cited_pages array.` + +// pageIndexActionHelp is the one-shot reminder appended to the +// initial user prompt so the model gets concrete examples without us +// needing to maintain a separate few-shot block. +const pageIndexActionHelp = `- {"tool":"get_document_structure","reasoning":"orient by titles"} — fetch the TOC tree (titles + page ranges, no body text) +- {"tool":"get_pages","start_page":5,"end_page":7,"reasoning":"section on debt"} — fetch text covering pages 5-7 +- {"tool":"done","answer":"...","cited_pages":[[5,7],[12,12]],"reasoning":"the answer is grounded on these pages"} — final answer + +Reply with ONLY the JSON object.` diff --git a/pkg/retrieval/pageindex_strategy_test.go b/pkg/retrieval/pageindex_strategy_test.go new file mode 100644 index 0000000..e6b9e0d --- /dev/null +++ b/pkg/retrieval/pageindex_strategy_test.go @@ -0,0 +1,750 @@ +package retrieval_test + +import ( + "context" + "errors" + "strings" + "sync" + "sync/atomic" + "testing" + + "github.com/hallelx2/llmgate" + + "github.com/hallelx2/vectorless-engine/pkg/retrieval" + "github.com/hallelx2/vectorless-engine/pkg/tree" +) + +// pageScriptedLLM is a scriptedLLM for the PageIndex strategy. +// Each Complete call returns the next canned response. When the +// script is exhausted, loopReply (if set) is returned on every +// subsequent call — the hop-cap test uses this to simulate a model +// that never emits done. +type pageScriptedLLM struct { + replies []string + loopReply string + + calls int32 + + mu sync.Mutex + lastPrompts []string +} + +func (p *pageScriptedLLM) Complete(ctx context.Context, req llmgate.Request) (*llmgate.Response, error) { + i := int(atomic.AddInt32(&p.calls, 1)) - 1 + + var userMsg string + for _, msg := range req.Messages { + if msg.Role == llmgate.RoleUser { + userMsg = msg.Content + } + } + p.mu.Lock() + p.lastPrompts = append(p.lastPrompts, userMsg) + p.mu.Unlock() + + if i < len(p.replies) { + return &llmgate.Response{Content: p.replies[i]}, nil + } + if p.loopReply != "" { + return &llmgate.Response{Content: p.loopReply}, nil + } + return nil, errors.New("pageScriptedLLM: replies exhausted") +} + +func (p *pageScriptedLLM) CountTokens(ctx context.Context, t string) (int, error) { + return len(t) / 4, nil +} + +// pageMapLoader is an in-memory PageContentLoader backed by a map. +type pageMapLoader struct{ data map[string]string } + +func (m pageMapLoader) Load(ctx context.Context, ref string) ([]byte, error) { + v, ok := m.data[ref] + if !ok { + return nil, errors.New("not found") + } + return []byte(v), nil +} + +// pageStaticTOC is a TOCProvider that returns a canned JSON blob. +// Tests use this to assert the get_document_structure observation +// surfaces the persisted TOC ahead of the synthesised fallback. +type pageStaticTOC struct{ blob []byte } + +func (p pageStaticTOC) GetTOC(ctx context.Context, _ tree.DocumentID) ([]byte, error) { + return p.blob, nil +} + +// pageErroringTOC simulates documents.toc_tree being NULL (no +// LLM-built TOC yet). The strategy must degrade to the synthesised +// view rather than failing the request. +type pageErroringTOC struct{} + +func (pageErroringTOC) GetTOC(ctx context.Context, _ tree.DocumentID) ([]byte, error) { + return nil, retrieval.ErrNoTOC +} + +// buildPagedTree mirrors buildAgenticTree but stamps page_start / +// page_end on every section so PageIndexStrategy can navigate. The +// shape: +// +// sec_root → [sec_a (1-4), sec_b (5-9)] +// sec_a → [sec_a1 (1-2 install), sec_a2 (3-4 config)] +// sec_b → [sec_b1 (5-7 querying), sec_b2 (8-9 debt)] +func buildPagedTree() *tree.Tree { + a1 := &tree.Section{ID: "sec_a1", ParentID: "sec_a", Title: "Install", Summary: "install steps", ContentRef: "a1_ref", PageStart: 1, PageEnd: 2} + a2 := &tree.Section{ID: "sec_a2", ParentID: "sec_a", Title: "Config", Summary: "config keys", ContentRef: "a2_ref", PageStart: 3, PageEnd: 4} + b1 := &tree.Section{ID: "sec_b1", ParentID: "sec_b", Title: "Querying", Summary: "how to query", ContentRef: "b1_ref", PageStart: 5, PageEnd: 7} + b2 := &tree.Section{ID: "sec_b2", ParentID: "sec_b", Title: "Debt", Summary: "long-term debt", ContentRef: "b2_ref", PageStart: 8, PageEnd: 9} + a := &tree.Section{ID: "sec_a", ParentID: "sec_root", Title: "Setup", Summary: "setup section", Children: []*tree.Section{a1, a2}, PageStart: 1, PageEnd: 4} + b := &tree.Section{ID: "sec_b", ParentID: "sec_root", Title: "Usage", Summary: "usage section", Children: []*tree.Section{b1, b2}, PageStart: 5, PageEnd: 9} + root := &tree.Section{ID: "sec_root", Title: "Atlas", Children: []*tree.Section{a, b}, PageStart: 1, PageEnd: 9} + return &tree.Tree{DocumentID: "doc_x", Title: "Atlas", Root: root} +} + +// TestPageIndexHappyPath drives the canonical 3-tool sequence: +// structure → get_pages → done. We assert the strategy: +// - returns the answer string in Result.Reasoning +// - lists the section IDs whose page range overlaps the citation +// - records the get_pages call in PagesRead +// - tracks HopsTaken correctly +// - computes a non-empty TraceToken keyed by the cited pages +func TestPageIndexHappyPath(t *testing.T) { + t.Parallel() + + tr := buildPagedTree() + llm := &pageScriptedLLM{ + replies: []string{ + `{"tool":"get_document_structure","reasoning":"orient"}`, + `{"tool":"get_pages","start_page":1,"end_page":2,"reasoning":"install lives near the front"}`, + `{"tool":"done","answer":"Run vle ingest then start the server.","cited_pages":[[1,2]],"reasoning":"install steps live on pages 1-2"}`, + }, + } + loader := pageMapLoader{data: map[string]string{ + "a1_ref": "Install steps: run vle ingest...", + "a2_ref": "Config keys: VLE_*", + "b1_ref": "How to query the API.", + "b2_ref": "Debt registration is in line items A and B.", + }} + + s := retrieval.NewPageIndexStrategy(llm) + s.PageLoader = loader + + res, err := s.SelectWithCost(context.Background(), tr, "how do I install?", retrieval.ContextBudget{MaxTokens: 100000}) + if err != nil { + t.Fatalf("SelectWithCost: %v", err) + } + if res.HopsTaken != 3 { + t.Errorf("HopsTaken = %d, want 3", res.HopsTaken) + } + if res.Usage.LLMCalls != 3 { + t.Errorf("Usage.LLMCalls = %d, want 3", res.Usage.LLMCalls) + } + if !strings.Contains(res.Reasoning, "vle ingest") { + t.Errorf("Reasoning (answer) must contain the model's reply, got %q", res.Reasoning) + } + if len(res.SelectedIDs) == 0 { + t.Fatalf("SelectedIDs must include sections covering pages 1-2, got %v", res.SelectedIDs) + } + // sec_a1 (1-2) is the leaf — must be in the list. sec_a (1-4) + // and the synthetic sec_root (1-9) overlap too because page + // ranges intersect. The strategy's job is to surface ANY section + // whose [page_start,page_end] overlaps the citation; the API + // layer narrows further if it cares. + wantIDs := map[tree.SectionID]bool{"sec_a1": true, "sec_a": true, "sec_root": true} + for _, id := range res.SelectedIDs { + if !wantIDs[id] { + t.Errorf("unexpected section ID %q (only sections overlapping pages 1-2 may appear)", id) + } + } + if _, ok := indexOfSection(res.SelectedIDs, "sec_a1"); !ok { + t.Errorf("sec_a1 must be in SelectedIDs, got %v", res.SelectedIDs) + } + if len(res.PagesRead) != 1 { + t.Fatalf("PagesRead = %v, want 1 entry", res.PagesRead) + } + if res.PagesRead[0].StartPage != 1 || res.PagesRead[0].EndPage != 2 { + t.Errorf("PagesRead[0] = %+v, want 1-2", res.PagesRead[0]) + } + if res.PagesRead[0].CharCount == 0 { + t.Errorf("PagesRead[0].CharCount must be non-zero, got %d", res.PagesRead[0].CharCount) + } + if res.TraceToken == "" { + t.Errorf("TraceToken must be populated on success") + } + + // Assert the second prompt — the one that follows the + // get_document_structure call — actually surfaced the + // synthesised TOC (since no TOC provider was wired). It must + // contain section titles. + llm.mu.Lock() + defer llm.mu.Unlock() + if len(llm.lastPrompts) < 2 { + t.Fatalf("expected at least 2 prompts captured, got %d", len(llm.lastPrompts)) + } + if !strings.Contains(llm.lastPrompts[1], "Install") { + t.Errorf("get_document_structure observation should include section titles; got:\n%s", llm.lastPrompts[1]) + } + if !strings.Contains(llm.lastPrompts[2], "Install steps: run vle ingest") { + t.Errorf("get_pages observation should include loaded content; got:\n%s", llm.lastPrompts[2]) + } +} + +// TestPageIndexMultiRangeDone covers a done with two cited ranges: +// the strategy must surface every section that overlaps EITHER +// range. This is the FinanceBench-shaped pattern: an answer that +// pulls evidence from two unrelated parts of a 10-K. +func TestPageIndexMultiRangeDone(t *testing.T) { + t.Parallel() + + tr := buildPagedTree() + llm := &pageScriptedLLM{ + replies: []string{ + `{"tool":"get_document_structure"}`, + `{"tool":"get_pages","start_page":3,"end_page":4}`, + `{"tool":"get_pages","start_page":8,"end_page":9}`, + `{"tool":"done","answer":"Config is X. Debt is Y.","cited_pages":[[3,4],[8,9]]}`, + }, + } + s := retrieval.NewPageIndexStrategy(llm) + s.PageLoader = pageMapLoader{data: map[string]string{ + "a2_ref": "Config keys: VLE_*", + "b2_ref": "Debt registration is in line items A and B.", + }} + + res, err := s.SelectWithCost(context.Background(), tr, "config and debt?", retrieval.ContextBudget{MaxTokens: 100000}) + if err != nil { + t.Fatalf("SelectWithCost: %v", err) + } + if res.HopsTaken != 4 { + t.Errorf("HopsTaken = %d, want 4", res.HopsTaken) + } + if len(res.PagesRead) != 2 { + t.Fatalf("PagesRead = %v, want 2 entries", res.PagesRead) + } + wantSecs := map[tree.SectionID]bool{ + "sec_a2": true, "sec_b2": true, // direct leaf overlaps + "sec_a": true, "sec_b": true, // parents overlap too + "sec_root": true, // doc-wide root overlaps every range + } + got := make(map[tree.SectionID]bool, len(res.SelectedIDs)) + for _, id := range res.SelectedIDs { + got[id] = true + if !wantSecs[id] { + t.Errorf("unexpected section ID %q", id) + } + } + // Leaves are the load-bearing requirement; parents are + // allowed-not-required (a future tightening could skip them, and + // the strategy contract stays useful either way). + for _, id := range []tree.SectionID{"sec_a2", "sec_b2"} { + if !got[id] { + t.Errorf("missing section ID %q from SelectedIDs", id) + } + } +} + +// TestPageIndexMaxHopsForcesDone confirms a runaway loop is killed: +// the model emits get_pages on every turn but never done. The +// strategy must cap at MaxHops, force a done on the last hop, and +// surface a Result with HopsTaken == MaxHops+1 (the +1 for the +// forced terminal call). +func TestPageIndexMaxHopsForcesDone(t *testing.T) { + t.Parallel() + + tr := buildPagedTree() + llm := &pageScriptedLLM{ + // Every loop reply is a fresh get_pages — never done. + loopReply: `{"tool":"get_pages","start_page":1,"end_page":2}`, + } + s := retrieval.NewPageIndexStrategy(llm) + s.PageLoader = pageMapLoader{data: map[string]string{"a1_ref": "install"}} + s.MaxHops = 3 + + res, err := s.SelectWithCost(context.Background(), tr, "q", retrieval.ContextBudget{MaxTokens: 100000}) + if err != nil { + t.Fatalf("SelectWithCost: %v", err) + } + if res.HopsTaken < 3 { + t.Errorf("HopsTaken = %d, want >= 3 (cap hit)", res.HopsTaken) + } + // The model never emits done so even after force-done attempt + // the answer should be empty (force-done's response is also a + // get_pages, which fails to parse as done). + if strings.TrimSpace(res.Reasoning) != "" { + t.Errorf("answer must be empty when model never finalises, got %q", res.Reasoning) + } + // The get_pages calls that fired BEFORE the cap should still be + // surfaced in PagesRead so callers can see what the model tried. + if len(res.PagesRead) == 0 { + t.Error("PagesRead must capture pre-cap navigation footprint") + } +} + +// TestPageIndexMaxHopsForceDoneSucceeds covers the recovery path: +// the loop hit MaxHops, but on the forced-done turn the model +// actually emits a valid done. The strategy must collect the +// answer + citations from that final turn rather than dropping them. +func TestPageIndexMaxHopsForceDoneSucceeds(t *testing.T) { + t.Parallel() + + tr := buildPagedTree() + llm := &pageScriptedLLM{ + replies: []string{ + `{"tool":"get_pages","start_page":1,"end_page":2}`, + `{"tool":"get_pages","start_page":3,"end_page":4}`, + // Once force-done fires, this becomes the model's response. + `{"tool":"done","answer":"forced answer","cited_pages":[[1,2]]}`, + }, + } + s := retrieval.NewPageIndexStrategy(llm) + s.PageLoader = pageMapLoader{data: map[string]string{"a1_ref": "install", "a2_ref": "config"}} + s.MaxHops = 2 + + res, err := s.SelectWithCost(context.Background(), tr, "q", retrieval.ContextBudget{MaxTokens: 100000}) + if err != nil { + t.Fatalf("SelectWithCost: %v", err) + } + if res.Reasoning != "forced answer" { + t.Errorf("forced-done answer = %q, want %q", res.Reasoning, "forced answer") + } + if len(res.SelectedIDs) == 0 { + t.Error("forced-done citations must populate SelectedIDs") + } +} + +// TestPageIndexTOCFallback exercises the graceful-degradation path: +// when the persisted TOC provider returns ErrNoTOC (pre-PR-A +// state), the strategy synthesises a TOC view from the section +// tree. The model must still receive section titles + page ranges. +func TestPageIndexTOCFallback(t *testing.T) { + t.Parallel() + + tr := buildPagedTree() + llm := &pageScriptedLLM{ + replies: []string{ + `{"tool":"get_document_structure"}`, + `{"tool":"done","answer":"see structure","cited_pages":[]}`, + }, + } + s := retrieval.NewPageIndexStrategy(llm) + s.PageLoader = pageMapLoader{data: map[string]string{}} + s.TOC = pageErroringTOC{} // mimic documents.toc_tree IS NULL + + res, err := s.SelectWithCost(context.Background(), tr, "what's in the doc?", retrieval.ContextBudget{MaxTokens: 100000}) + if err != nil { + t.Fatalf("SelectWithCost: %v", err) + } + if res.HopsTaken != 2 { + t.Errorf("HopsTaken = %d, want 2", res.HopsTaken) + } + + llm.mu.Lock() + defer llm.mu.Unlock() + if len(llm.lastPrompts) < 2 { + t.Fatalf("expected 2 prompts, got %d", len(llm.lastPrompts)) + } + // The fallback synthesised TOC must include each leaf title. + obs := llm.lastPrompts[1] + for _, want := range []string{"Install", "Config", "Querying", "Debt", "page_start"} { + if !strings.Contains(obs, want) { + t.Errorf("synthesised TOC missing %q in observation:\n%s", want, obs) + } + } +} + +// TestPageIndexTOCFromProvider asserts the persisted TOC wins over +// the synthesised view: when the provider returns bytes, those +// bytes are surfaced verbatim. +func TestPageIndexTOCFromProvider(t *testing.T) { + t.Parallel() + + tr := buildPagedTree() + llm := &pageScriptedLLM{ + replies: []string{ + `{"tool":"get_document_structure"}`, + `{"tool":"done","answer":"from persisted TOC","cited_pages":[]}`, + }, + } + s := retrieval.NewPageIndexStrategy(llm) + s.TOC = pageStaticTOC{blob: []byte(`[{"title":"OVERRIDDEN","page_start":1,"page_end":99}]`)} + + _, err := s.SelectWithCost(context.Background(), tr, "q", retrieval.ContextBudget{MaxTokens: 100000}) + if err != nil { + t.Fatalf("SelectWithCost: %v", err) + } + + llm.mu.Lock() + defer llm.mu.Unlock() + if !strings.Contains(llm.lastPrompts[1], "OVERRIDDEN") { + t.Errorf("persisted TOC blob must be surfaced verbatim, got:\n%s", llm.lastPrompts[1]) + } + if strings.Contains(llm.lastPrompts[1], "Install") { + t.Errorf("persisted TOC should win — the synthesised one mustn't leak through") + } +} + +// TestPageIndexBadJSONGraceful: persistent prose responses must +// trigger a retry prompt and then bail cleanly at MaxHops. +func TestPageIndexBadJSONGraceful(t *testing.T) { + t.Parallel() + + tr := buildPagedTree() + llm := &pageScriptedLLM{ + loopReply: "I think the answer is on page 5.", // never JSON + } + s := retrieval.NewPageIndexStrategy(llm) + s.PageLoader = pageMapLoader{data: map[string]string{}} + s.MaxHops = 3 + + res, err := s.SelectWithCost(context.Background(), tr, "q", retrieval.ContextBudget{MaxTokens: 100000}) + if err != nil { + t.Fatalf("want nil error on persistent parse failure, got %v", err) + } + if strings.TrimSpace(res.Reasoning) != "" { + t.Errorf("answer must be empty when no done emitted, got %q", res.Reasoning) + } + if len(res.PagesRead) != 0 { + t.Errorf("PagesRead must be empty when every turn fails to parse, got %v", res.PagesRead) + } +} + +// TestPageIndexClampInvalidRange: a model that asks for pages past +// the document's end gets a recoverable error observation and can +// keep going. The strategy must NOT crash on out-of-range input. +func TestPageIndexClampInvalidRange(t *testing.T) { + t.Parallel() + + tr := buildPagedTree() // max page is 9 + llm := &pageScriptedLLM{ + replies: []string{ + `{"tool":"get_pages","start_page":100,"end_page":105}`, // past the end + `{"tool":"done","answer":"recovered","cited_pages":[[1,1]]}`, + }, + } + s := retrieval.NewPageIndexStrategy(llm) + s.PageLoader = pageMapLoader{data: map[string]string{"a1_ref": "install"}} + + res, err := s.SelectWithCost(context.Background(), tr, "q", retrieval.ContextBudget{MaxTokens: 100000}) + if err != nil { + t.Fatalf("SelectWithCost: %v", err) + } + if res.Reasoning != "recovered" { + t.Errorf("recovery answer = %q, want %q", res.Reasoning, "recovered") + } + + llm.mu.Lock() + defer llm.mu.Unlock() + // The bad get_pages must surface "invalid range" so the model + // has something to react to. + if !strings.Contains(llm.lastPrompts[1], "invalid range") { + t.Errorf("out-of-range get_pages should produce an 'invalid range' observation; got:\n%s", llm.lastPrompts[1]) + } +} + +// TestPageIndexClampPartialOverlap: a range that overlaps the +// document but extends past the end is silently clamped — the +// model gets useful content (not an error) for the in-range part. +func TestPageIndexClampPartialOverlap(t *testing.T) { + t.Parallel() + + tr := buildPagedTree() // max page is 9 + llm := &pageScriptedLLM{ + replies: []string{ + `{"tool":"get_pages","start_page":8,"end_page":50}`, // 8 is valid, 50 is past + `{"tool":"done","answer":"got it","cited_pages":[[8,9]]}`, + }, + } + s := retrieval.NewPageIndexStrategy(llm) + s.PageLoader = pageMapLoader{data: map[string]string{"b2_ref": "Debt content."}} + + res, err := s.SelectWithCost(context.Background(), tr, "q", retrieval.ContextBudget{MaxTokens: 100000}) + if err != nil { + t.Fatalf("SelectWithCost: %v", err) + } + if len(res.PagesRead) != 1 { + t.Fatalf("PagesRead = %v, want 1 entry", res.PagesRead) + } + if res.PagesRead[0].EndPage != 9 { + t.Errorf("end page should be clamped to 9, got %d", res.PagesRead[0].EndPage) + } +} + +// TestPageIndexEmptyTree exercises the early-return guard. +func TestPageIndexEmptyTree(t *testing.T) { + t.Parallel() + + llm := &pageScriptedLLM{} + s := retrieval.NewPageIndexStrategy(llm) + + res, err := s.SelectWithCost(context.Background(), &tree.Tree{}, "q", retrieval.ContextBudget{}) + if err != nil { + t.Fatal(err) + } + if len(res.SelectedIDs) != 0 { + t.Errorf("empty tree should yield empty selection, got %v", res.SelectedIDs) + } + if atomic.LoadInt32(&llm.calls) != 0 { + t.Errorf("empty tree should make 0 LLM calls, got %d", llm.calls) + } +} + +// TestPageIndexNoLoaderFallback: PageLoader=nil falls back to a +// title+summary rendering of get_pages. The model still gets a +// useful observation so it can keep navigating. +func TestPageIndexNoLoaderFallback(t *testing.T) { + t.Parallel() + + tr := buildPagedTree() + llm := &pageScriptedLLM{ + replies: []string{ + `{"tool":"get_pages","start_page":1,"end_page":2}`, + `{"tool":"done","answer":"titles only","cited_pages":[[1,2]]}`, + }, + } + s := retrieval.NewPageIndexStrategy(llm) // no PageLoader + + _, err := s.SelectWithCost(context.Background(), tr, "q", retrieval.ContextBudget{MaxTokens: 100000}) + if err != nil { + t.Fatalf("SelectWithCost: %v", err) + } + + llm.mu.Lock() + defer llm.mu.Unlock() + obs := llm.lastPrompts[1] + if !strings.Contains(obs, "Install") || !strings.Contains(obs, "install steps") { + t.Errorf("loader-less get_pages should fall back to title + summary; got:\n%s", obs) + } +} + +// TestPageIndexContentClippedAtLimit: a get_pages call that would +// produce more chars than PageContentLimit must be clipped. +func TestPageIndexContentClippedAtLimit(t *testing.T) { + t.Parallel() + + tr := buildPagedTree() + bigBody := strings.Repeat("X", 5_000) + loader := pageMapLoader{data: map[string]string{ + "a1_ref": bigBody, "a2_ref": bigBody, "b1_ref": bigBody, "b2_ref": bigBody, + }} + llm := &pageScriptedLLM{ + replies: []string{ + `{"tool":"get_pages","start_page":1,"end_page":9}`, + `{"tool":"done","answer":"big","cited_pages":[[1,1]]}`, + }, + } + s := retrieval.NewPageIndexStrategy(llm) + s.PageLoader = loader + s.PageContentLimit = 1000 + + res, err := s.SelectWithCost(context.Background(), tr, "q", retrieval.ContextBudget{MaxTokens: 100000}) + if err != nil { + t.Fatalf("SelectWithCost: %v", err) + } + if res.PagesRead[0].CharCount > 1000 { + t.Errorf("get_pages output must respect PageContentLimit=1000, got %d", res.PagesRead[0].CharCount) + } +} + +// TestPageIndexNoCitationsClearsSelection: an empty cited_pages +// list must produce an empty SelectedIDs (no implicit "default to +// everything we visited"). This is the "no useful evidence found" +// path the system prompt prescribes. +func TestPageIndexNoCitationsClearsSelection(t *testing.T) { + t.Parallel() + + tr := buildPagedTree() + llm := &pageScriptedLLM{ + replies: []string{ + `{"tool":"get_pages","start_page":1,"end_page":2}`, + `{"tool":"done","answer":"The document does not address this query.","cited_pages":[]}`, + }, + } + s := retrieval.NewPageIndexStrategy(llm) + s.PageLoader = pageMapLoader{data: map[string]string{"a1_ref": "install"}} + + res, err := s.SelectWithCost(context.Background(), tr, "q", retrieval.ContextBudget{MaxTokens: 100000}) + if err != nil { + t.Fatalf("SelectWithCost: %v", err) + } + if len(res.SelectedIDs) != 0 { + t.Errorf("empty cited_pages should yield empty SelectedIDs, got %v", res.SelectedIDs) + } + if !strings.Contains(res.Reasoning, "does not address") { + t.Errorf("refusal answer must propagate to Reasoning, got %q", res.Reasoning) + } +} + +// TestPageIndexTraceTokenStable: two runs that emit identical +// cited_pages produce identical trace tokens. Replay's substrate. +func TestPageIndexTraceTokenStable(t *testing.T) { + t.Parallel() + + tr := buildPagedTree() + mkRun := func() string { + llm := &pageScriptedLLM{ + replies: []string{ + `{"tool":"done","answer":"X","cited_pages":[[1,2],[8,9]]}`, + }, + } + s := retrieval.NewPageIndexStrategy(llm) + s.PageLoader = pageMapLoader{} + res, _ := s.SelectWithCost(context.Background(), tr, "q", retrieval.ContextBudget{ModelName: "gpt-4o-mini"}) + return res.TraceToken + } + t1 := mkRun() + t2 := mkRun() + if t1 == "" || t1 != t2 { + t.Errorf("trace tokens must be stable across runs; got %q vs %q", t1, t2) + } +} + +// TestPageIndexTraceTokenOrderInvariant: two runs that cite the +// same pages in different orders must produce identical tokens. +func TestPageIndexTraceTokenOrderInvariant(t *testing.T) { + t.Parallel() + + tr := buildPagedTree() + mkRun := func(reply string) string { + llm := &pageScriptedLLM{replies: []string{reply}} + s := retrieval.NewPageIndexStrategy(llm) + res, _ := s.SelectWithCost(context.Background(), tr, "q", retrieval.ContextBudget{ModelName: "gpt-4o-mini"}) + return res.TraceToken + } + t1 := mkRun(`{"tool":"done","answer":"X","cited_pages":[[1,2],[8,9]]}`) + t2 := mkRun(`{"tool":"done","answer":"X","cited_pages":[[8,9],[1,2]]}`) + if t1 != t2 { + t.Errorf("trace tokens must be order-invariant; got %q vs %q", t1, t2) + } +} + +// TestParsePageIndexActionTolerance covers the input shapes the +// parser accepts: +// - "tool" key (canonical) +// - "action" key (alt) +// - "pages":"5-7" string +// - cited_pages as string list ["5-7","10"] +// - markdown fences + prose prefix +// - case-insensitive tool tag +func TestParsePageIndexActionTolerance(t *testing.T) { + t.Parallel() + cases := []struct { + name string + in string + tool string + start int + end int + cited [][2]int + }{ + { + name: "canonical_structure", + in: `{"tool":"get_document_structure","reasoning":"orient"}`, + tool: "get_document_structure", + }, + { + name: "canonical_pages", + in: `{"tool":"get_pages","start_page":5,"end_page":7}`, + tool: "get_pages", + start: 5, end: 7, + }, + { + name: "alt_action_key", + in: `{"action":"get_pages","start_page":5,"end_page":7}`, + tool: "get_pages", + start: 5, end: 7, + }, + { + name: "pages_string_range", + in: `{"tool":"get_pages","pages":"5-7"}`, + tool: "get_pages", + start: 5, end: 7, + }, + { + name: "pages_string_single", + in: `{"tool":"get_pages","pages":"12"}`, + tool: "get_pages", + start: 12, end: 12, + }, + { + name: "code_fence", + in: "```json\n{\"tool\":\"get_pages\",\"start_page\":3,\"end_page\":4}\n```", + tool: "get_pages", + start: 3, end: 4, + }, + { + name: "prose_before", + in: `Sure: {"tool":"get_pages","start_page":1,"end_page":1}`, + tool: "get_pages", + start: 1, end: 1, + }, + { + name: "case_insensitive", + in: `{"tool":"GET_PAGES","start_page":2,"end_page":3}`, + tool: "get_pages", + start: 2, end: 3, + }, + { + name: "done_with_citations", + in: `{"tool":"done","answer":"hi","cited_pages":[[1,2],[5,7]]}`, + tool: "done", + cited: [][2]int{{1, 2}, {5, 7}}, + }, + { + name: "done_with_string_citations", + in: `{"tool":"done","answer":"hi","cited_pages":["1-2","5-7"]}`, + tool: "done", + cited: [][2]int{{1, 2}, {5, 7}}, + }, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got, err := retrieval.ParsePageIndexAction(c.in) + if err != nil { + t.Fatalf("parse: %v", err) + } + if got.Action != c.tool { + t.Errorf("Action = %q, want %q", got.Action, c.tool) + } + if got.StartPage != c.start { + t.Errorf("StartPage = %d, want %d", got.StartPage, c.start) + } + if got.EndPage != c.end { + t.Errorf("EndPage = %d, want %d", got.EndPage, c.end) + } + if len(got.CitedPages) != len(c.cited) { + t.Fatalf("CitedPages len = %d, want %d (got %v)", len(got.CitedPages), len(c.cited), got.CitedPages) + } + for i := range c.cited { + if got.CitedPages[i] != c.cited[i] { + t.Errorf("CitedPages[%d] = %v, want %v", i, got.CitedPages[i], c.cited[i]) + } + } + }) + } +} + +func TestParsePageIndexActionRejectsGarbage(t *testing.T) { + t.Parallel() + for _, in := range []string{ + "", + "I think it's page 5.", + `{"reasoning":"no tool field"}`, + } { + _, err := retrieval.ParsePageIndexAction(in) + if err == nil { + t.Errorf("want error parsing %q", in) + } + } +} + +// indexOfSection is a tiny helper that says "is needle in haystack +// and where". Mirrors slices.Index for readability — keeps the tests +// stdlib-agnostic on older Go versions. +func indexOfSection(haystack []tree.SectionID, needle tree.SectionID) (int, bool) { + for i, id := range haystack { + if id == needle { + return i, true + } + } + return -1, false +} diff --git a/pkg/retrieval/strategy.go b/pkg/retrieval/strategy.go index 8f428f6..a3db98a 100644 --- a/pkg/retrieval/strategy.go +++ b/pkg/retrieval/strategy.go @@ -89,6 +89,27 @@ type Result struct { // regardless of reasoning path. Empty when the strategy did not // populate it (e.g. tests, fallback paths). TraceToken string `json:"trace_token,omitempty"` + + // PagesRead records the page ranges the strategy actually fetched + // during navigation. Page-based strategies (e.g. pageindex) + // populate this; section-by-section strategies leave it nil. + // Useful for the API layer's reasoning-trace surfaces and for + // cost/coverage debugging: a 10-K answer that read pages 50-55 + + // 102-104 leaves a concrete page footprint behind. + PagesRead []PageReadEntry `json:"pages_read,omitempty"` +} + +// PageReadEntry is one get_pages tool call that materialised during a +// page-based retrieval loop. StartPage and EndPage are inclusive, +// 1-indexed. SectionIDs lists every section whose [PageStart,PageEnd] +// overlapped the requested range. CharCount records the size of the +// returned text after PageContentLimit clipping so cost reporting can +// reflect bytes-on-the-wire, not bytes-requested. +type PageReadEntry struct { + StartPage int `json:"start_page"` + EndPage int `json:"end_page"` + SectionIDs []tree.SectionID `json:"section_ids,omitempty"` + CharCount int `json:"char_count,omitempty"` } // Usage is the aggregated token + cost accounting across all LLM calls