Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions cmd/engine/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,28 @@ func run() error {
}
}

// ReRanker: opt-in Phase 2.3. Instantiated whenever an LLM client
// is wired — the per-request `enable_rerank` body field overrides
// the config, mirroring the planner pattern.
var reRanker *retrieval.ReRanker
if llmClient != nil {
reRankModel := cfg.Retrieval.ReRank.Model
if reRankModel == "" {
reRankModel = modelFor(cfg.LLM)
}
reRanker = retrieval.NewReRanker(llmClient, reRankModel)
if cfg.Retrieval.ReRank.MaxContentChars > 0 {
reRanker.MaxContentChars = cfg.Retrieval.ReRank.MaxContentChars
}
if cfg.Retrieval.ReRank.Enabled {
logger.Info("retrieval: rerank enabled",
"model", reRankModel,
"max_content_chars", reRanker.MaxContentChars,
"top_k", cfg.Retrieval.ReRank.TopK,
)
}
}

pipeline := ingest.NewPipeline(ingest.Pipeline{
DB: pool,
Storage: store,
Expand Down Expand Up @@ -157,6 +179,8 @@ func run() error {
Answer: cfg.Retrieval.Answer,
Planner: planner,
Planning: cfg.Retrieval.Planning,
ReRanker: reRanker,
ReRank: cfg.Retrieval.ReRank,
}

srv := &http.Server{
Expand Down
30 changes: 30 additions & 0 deletions config.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,36 @@ retrieval:
# isolation (plan returned, but retrieval uses the original query).
decompose: true

# rerank: Phase 2.3 content-aware re-rank pass. After the retrieval
# strategy returns candidate sections and their content is loaded,
# one extra LLM call scores each section (0-100) against the query
# and the engine reorders descending by score.
#
# This is the safety net for the case where the strategy reasoned
# over title + summary + HyDE candidate questions and got fooled
# by surface-level matches. Reading the actual content closes that
# gap. ~3-5k input tokens per query on gemini-2.5-flash; ~$0.0003
# per call at typical rates.
#
# OPT-IN. Default disabled. Per-request `enable_rerank` body field
# overrides this block. Failures never drop sections — at worst the
# strategy's order is preserved.
rerank:
enabled: false
# Override the re-rank model; empty inherits the request's model
# (or the engine default). Keep this on a small/fast model — the
# re-rank prompt is short and shouldn't burn the flagship model.
model: ""
# Per-candidate content budget. Higher = more context for the
# model to judge with, lower = tighter cost. 2000 chars ≈ 500
# tokens, comfortable for typical section sizes.
max_content_chars: 2000
# Truncate the post-rerank candidate list to the top K. 0 means
# keep all candidates (re-rank only reorders). Useful when the
# strategy returns a wide candidate list and you want the
# re-rank pass to do the final selection.
top_k: 0

ingest:
# The summarize and HyDE stages run concurrently. This caps the total
# number of LLM calls in flight across both stages combined, so the
Expand Down
169 changes: 168 additions & 1 deletion internal/api/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,16 @@ type Deps struct {
// `enable_planning` field on /v1/query and /v1/answer overrides
// Planning.Enabled.
Planning config.PlanningBlock

// ReRanker runs Phase 2.3 content-aware re-rank on the strategy's
// candidate sections (one extra LLM call per query). Nil disables
// re-rank even when a request opts in via `enable_rerank`.
ReRanker *retrieval.ReRanker

// ReRank carries the server-side re-rank config. The body-level
// `enable_rerank` field on /v1/query and /v1/answer overrides
// ReRank.Enabled. TopK truncates the post-rerank candidate list.
ReRank config.ReRankBlock
}

// Router builds and returns the chi router wired with v1 routes.
Expand Down Expand Up @@ -400,6 +410,10 @@ func (d Deps) handleQuery(w http.ResponseWriter, r *http.Request) {
// planner. A pointer so we can distinguish "absent" from
// "explicit false" — absent falls back to the server config.
EnablePlanning *bool `json:"enable_planning"`
// EnableReRank opts this request into the Phase 2.3
// content-aware re-rank pass. Pointer for the same reason as
// EnablePlanning. Overrides retrieval.rerank.enabled.
EnableReRank *bool `json:"enable_rerank"`
}
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
writeErr(w, http.StatusBadRequest, "invalid json: "+err.Error())
Expand Down Expand Up @@ -471,6 +485,15 @@ func (d Deps) handleQuery(w http.ResponseWriter, r *http.Request) {
enriched = append(enriched, sectionWithContent{sec: sec, content: content})
}

// Optional: content-aware re-rank pass. One LLM call that scores
// each loaded section against the query and re-orders the slice
// descending by score. TopK truncates the survivors. Failures
// never drop sections — at worst the strategy's order is
// preserved (see retrieval.ReRanker.ReRank).
if d.reRankEnabled(body.EnableReRank) {
enriched, _ = d.runReRank(r.Context(), enriched, body.Query, body.Model)
}

// Optional: per-section answer-span extraction. Opt-in via config —
// one LLM call per returned section. Failures are non-fatal; the
// section is returned without a span.
Expand Down Expand Up @@ -499,11 +522,18 @@ func (d Deps) handleQuery(w http.ResponseWriter, r *http.Request) {
}

// sectionWithContent bundles a tree section with its loaded content
// and an optional answer-span. Used by /v1/query and /v1/answer.
// and optional re-rank score / answer-span. Used by /v1/query and
// /v1/answer.
type sectionWithContent struct {
sec *tree.Section
content string
span *retrieval.AnswerSpan

// hasScore reports whether score was populated by a re-rank pass.
// Distinct from score == 0 since 0 is a legitimate score the
// model can return.
hasScore bool
score float64
}

// sectionWithContentToMap renders the section as the API map shape.
Expand All @@ -528,6 +558,9 @@ func sectionWithContentToMap(e sectionWithContent) map[string]any {
if e.span != nil {
s["answer_span"] = e.span
}
if e.hasScore {
s["score"] = e.score
}
return s
}

Expand Down Expand Up @@ -620,6 +653,10 @@ func (d Deps) handleAnswer(w http.ResponseWriter, r *http.Request) {
// EnablePlanning opts this request into the Phase 2.1 query
// planner. See handleQuery for the same field's semantics.
EnablePlanning *bool `json:"enable_planning"`
// EnableReRank opts this request into the Phase 2.3 re-rank
// pass. Synthesis then sees the re-ranked top-k. Overrides
// retrieval.rerank.enabled.
EnableReRank *bool `json:"enable_rerank"`
}
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
writeErr(w, http.StatusBadRequest, "invalid json: "+err.Error())
Expand Down Expand Up @@ -699,6 +736,16 @@ func (d Deps) handleAnswer(w http.ResponseWriter, r *http.Request) {
enriched = append(enriched, sectionWithContent{sec: sec, content: content})
}

// Optional: content-aware re-rank before synthesis sees the
// evidence. When TopK is set the synthesis prompt only ever sees
// the post-rerank top-k, keeping the answer focused on the
// best-evidence sections.
if d.reRankEnabled(body.EnableReRank) {
var reRankUsage retrieval.Usage
enriched, reRankUsage = d.runReRank(r.Context(), enriched, body.Query, body.Model)
totalUsage.Add(reRankUsage)
}

// Always extract spans for /v1/answer — they ground each citation.
spanExtractor := d.spanExtractor(body.Model)
runSpansConcurrent(r.Context(), spanExtractor, body.Query, enriched, d.AnswerSpan.MaxConcurrency, d.Logger)
Expand Down Expand Up @@ -747,6 +794,9 @@ func (d Deps) handleAnswer(w http.ResponseWriter, r *http.Request) {
c["quote_end"] = e.span.End
}
}
if e.hasScore {
c["score"] = e.score
}
citations = append(citations, c)
}

Expand Down Expand Up @@ -1132,6 +1182,123 @@ func (d Deps) shouldDecompose(plan *retrieval.Plan) bool {
return d.Planning.Decompose
}

// --- re-rank helpers ---

// reRankEnabled reports whether the request should go through the
// re-rank pass. The per-request body field (when present) wins over
// the server-side config; a nil body field falls back to the config.
//
// Returns false when no LLM client is wired or when no ReRanker is
// configured, regardless of intent — re-rank without an LLM is
// physically impossible.
func (d Deps) reRankEnabled(bodyOverride *bool) bool {
if d.ReRanker == nil || d.LLM == nil {
return false
}
if bodyOverride != nil {
return *bodyOverride
}
return d.ReRank.Enabled
}

// runReRank executes the re-rank pass over the loaded section slice
// and returns the reordered slice plus the LLM Usage spent. On any
// failure the original slice is returned (with the same hasScore
// values it had on input — i.e. unchanged) so the caller never has
// to think about partial state. The error is LOGGED, not returned —
// re-rank is best-effort and a failure must never abort the request.
//
// requestModel is the model the request asked for. When the
// ReRanker has its own Model set (the config-level override), that
// wins; the request model is the fall-through.
func (d Deps) runReRank(ctx context.Context, enriched []sectionWithContent, query, requestModel string) ([]sectionWithContent, retrieval.Usage) {
if d.ReRanker == nil || d.LLM == nil || len(enriched) == 0 {
return enriched, retrieval.Usage{}
}

// Apply the model fall-through: config override → request model →
// engine default. We don't mutate d.ReRanker since Deps is shared
// across requests; instead build a shallow copy with the chosen
// model. This is the same pattern spanExtractor() uses.
ranker := *d.ReRanker
if ranker.Model == "" {
if requestModel != "" {
ranker.Model = requestModel
} else {
ranker.Model = d.LLMModel
}
}

candidates := make([]retrieval.SectionContent, len(enriched))
for i, e := range enriched {
candidates[i] = retrieval.SectionContent{
ID: e.sec.ID,
Title: e.sec.Title,
Content: e.content,
}
}

scored, usage, err := ranker.ReRank(ctx, query, candidates)
if err != nil {
if d.Logger != nil {
d.Logger.Warn("rerank: failed; preserving strategy order", "err", err)
}
// ReRank returns input order on error so we *could* apply it
// (it'd just stamp score=0 on everything). Skip — the caller
// shouldn't see score=0 on every section when re-rank
// physically failed.
return enriched, usage
}
if len(scored) == 0 {
return enriched, usage
}

reordered := reorderByScore(enriched, scored)
if d.ReRank.TopK > 0 && len(reordered) > d.ReRank.TopK {
reordered = reordered[:d.ReRank.TopK]
}
return reordered, usage
}

// reorderByScore takes the loaded section slice and the model's
// scored output (already sorted descending by score by the
// ReRanker), and returns a new slice in the same order as scored
// with each entry carrying the per-section score.
//
// Defensive: every input enriched section appears in the output
// exactly once, in the order dictated by scored. If scored is
// missing an input ID (shouldn't happen — ReRank's contract is to
// surface every input ID), that section is appended at the end with
// hasScore=false so the response stays complete.
func reorderByScore(enriched []sectionWithContent, scored []retrieval.ScoredSection) []sectionWithContent {
byID := make(map[tree.SectionID]int, len(enriched))
for i, e := range enriched {
byID[e.sec.ID] = i
}

out := make([]sectionWithContent, 0, len(enriched))
taken := make([]bool, len(enriched))
for _, s := range scored {
idx, ok := byID[s.ID]
if !ok || taken[idx] {
continue
}
taken[idx] = true
e := enriched[idx]
e.hasScore = true
e.score = s.Score
out = append(out, e)
}
// Append anything ReRank didn't surface — invariant says this
// should be empty, but a defence-in-depth check costs nothing.
for i, e := range enriched {
if !taken[i] {
out = append(out, e)
}
}
return out
}

// writePlanHints appends a short, model-readable "Planner notes" block
// describing the structured plan. Synthesis uses this to orient itself
// before reading the evidence.
Expand Down
38 changes: 38 additions & 0 deletions openapi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -462,6 +462,19 @@ components:
retrieval fans out one selection call per sub-question
and unions the results. Overrides the server's
`retrieval.planning.enabled` setting for this request only.
enable_rerank:
type: boolean
description: |
Opt this request into the Phase 2.3 content-aware re-rank
pass. After the retrieval strategy returns candidate
section IDs and the engine loads their content, one extra
LLM call scores each section (0-100) against the query and
sections are reordered descending by score. When
`retrieval.rerank.top_k` is set on the server, only the
top-k sections survive. Failures preserve the strategy's
original order — re-rank never drops sections. Overrides
the server's `retrieval.rerank.enabled` setting for this
request only.

QueryResponse:
type: object
Expand Down Expand Up @@ -513,6 +526,14 @@ components:
description: Full section content from storage.
answer_span:
$ref: "#/components/schemas/AnswerSpan"
score:
type: number
description: |
Re-rank relevance score on a 0-100 scale, populated only
when the request opted into the Phase 2.3 re-rank pass
(`enable_rerank`) or the server has
`retrieval.rerank.enabled=true`. Sections are returned
sorted descending by score. Omitted when no re-rank ran.

AnswerSpan:
type: object
Expand Down Expand Up @@ -560,6 +581,14 @@ components:
full semantics. When enabled, the synthesis prompt also sees
the planner's structured intent and entity hints, and the
response carries a top-level `plan` field.
enable_rerank:
type: boolean
description: |
Opt this request into the Phase 2.3 content-aware re-rank
pass. See QueryRequest.enable_rerank for full semantics.
When the pass runs, the synthesis prompt sees the
re-ranked top-k (capped by `retrieval.rerank.top_k`), and
each citation in the response carries a `score` field.

AnswerResponse:
type: object
Expand Down Expand Up @@ -647,6 +676,9 @@ components:
found one). `quote_start`/`quote_end` give byte offsets into
the source section's content. `page_start`/`page_end` are the
section's page range — omitted for non-paginated formats.
`score` carries the re-rank relevance score on a 0-100 scale,
present only when the request opted into the Phase 2.3
re-rank pass.
properties:
section_id:
type: string
Expand All @@ -662,3 +694,9 @@ components:
type: integer
page_end:
type: integer
score:
type: number
description: |
Re-rank relevance score on a 0-100 scale. Omitted when no
re-rank ran. Higher means the section is more directly
relevant to the query.
Loading
Loading