hallelx2 · hallelx2 · May 27, 2026 · May 27, 2026 · May 27, 2026 · May 27, 2026
diff --git a/cmd/engine/main.go b/cmd/engine/main.go
@@ -129,6 +129,28 @@ func run() error {
 		}
 	}
 
+	// ReRanker: opt-in Phase 2.3. Instantiated whenever an LLM client
+	// is wired — the per-request `enable_rerank` body field overrides
+	// the config, mirroring the planner pattern.
+	var reRanker *retrieval.ReRanker
+	if llmClient != nil {
+		reRankModel := cfg.Retrieval.ReRank.Model
+		if reRankModel == "" {
+			reRankModel = modelFor(cfg.LLM)
+		}
+		reRanker = retrieval.NewReRanker(llmClient, reRankModel)
+		if cfg.Retrieval.ReRank.MaxContentChars > 0 {
+			reRanker.MaxContentChars = cfg.Retrieval.ReRank.MaxContentChars
+		}
+		if cfg.Retrieval.ReRank.Enabled {
+			logger.Info("retrieval: rerank enabled",
+				"model", reRankModel,
+				"max_content_chars", reRanker.MaxContentChars,
+				"top_k", cfg.Retrieval.ReRank.TopK,
+			)
+		}
+	}
+
 	pipeline := ingest.NewPipeline(ingest.Pipeline{
 		DB:                   pool,
 		Storage:              store,
@@ -157,6 +179,8 @@ func run() error {
 		Answer:     cfg.Retrieval.Answer,
 		Planner:    planner,
 		Planning:   cfg.Retrieval.Planning,
+		ReRanker:   reRanker,
+		ReRank:     cfg.Retrieval.ReRank,
 	}
 
 	srv := &http.Server{

diff --git a/config.example.yaml b/config.example.yaml
@@ -152,6 +152,36 @@ retrieval:
     # isolation (plan returned, but retrieval uses the original query).
     decompose: true
 
+  # rerank: Phase 2.3 content-aware re-rank pass. After the retrieval
+  # strategy returns candidate sections and their content is loaded,
+  # one extra LLM call scores each section (0-100) against the query
+  # and the engine reorders descending by score.
+  #
+  # This is the safety net for the case where the strategy reasoned
+  # over title + summary + HyDE candidate questions and got fooled
+  # by surface-level matches. Reading the actual content closes that
+  # gap. ~3-5k input tokens per query on gemini-2.5-flash; ~$0.0003
+  # per call at typical rates.
+  #
+  # OPT-IN. Default disabled. Per-request `enable_rerank` body field
+  # overrides this block. Failures never drop sections — at worst the
+  # strategy's order is preserved.
+  rerank:
+    enabled: false
+    # Override the re-rank model; empty falls back to the engine's
+    # configured LLM model (resolved at startup). Keep this on a
+    # small/fast model — the prompt shouldn't burn the flagship model.
+    model: ""
+    # Per-candidate content budget. Higher = more context for the
+    # model to judge with, lower = tighter cost. 2000 chars ≈ 500
+    # tokens, comfortable for typical section sizes.
+    max_content_chars: 2000
+    # Truncate the post-rerank candidate list to the top K. 0 means
+    # keep all candidates (re-rank only reorders). Useful when the
+    # strategy returns a wide candidate list and you want the
+    # re-rank pass to do the final selection.
+    top_k: 0
+
 ingest:
   # The summarize and HyDE stages run concurrently. This caps the total
   # number of LLM calls in flight across both stages combined, so the

diff --git a/internal/api/server.go b/internal/api/server.go
@@ -73,6 +73,16 @@ type Deps struct {
 	// `enable_planning` field on /v1/query and /v1/answer overrides
 	// Planning.Enabled.
 	Planning config.PlanningBlock
+
+	// ReRanker runs Phase 2.3 content-aware re-rank on the strategy's
+	// candidate sections (one extra LLM call per query). Nil disables
+	// re-rank even when a request opts in via `enable_rerank`.
+	ReRanker *retrieval.ReRanker
+
+	// ReRank carries the server-side re-rank config. The body-level
+	// `enable_rerank` field on /v1/query and /v1/answer overrides
+	// ReRank.Enabled. TopK truncates the post-rerank candidate list.
+	ReRank config.ReRankBlock
 }
 
 // Router builds and returns the chi router wired with v1 routes.
@@ -400,6 +410,10 @@ func (d Deps) handleQuery(w http.ResponseWriter, r *http.Request) {
 		// planner. A pointer so we can distinguish "absent" from
 		// "explicit false" — absent falls back to the server config.
 		EnablePlanning *bool `json:"enable_planning"`
+		// EnableReRank opts this request into the Phase 2.3
+		// content-aware re-rank pass. Pointer for the same reason as
+		// EnablePlanning. Overrides retrieval.rerank.enabled.
+		EnableReRank *bool `json:"enable_rerank"`
 	}
 	if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
 		writeErr(w, http.StatusBadRequest, "invalid json: "+err.Error())
@@ -471,6 +485,15 @@ func (d Deps) handleQuery(w http.ResponseWriter, r *http.Request) {
 		enriched = append(enriched, sectionWithContent{sec: sec, content: content})
 	}
 
+	// Optional: content-aware re-rank pass. One LLM call that scores
+	// each loaded section against the query and re-orders the slice
+	// descending by score. TopK truncates the survivors. Failures
+	// never drop sections — at worst the strategy's order is
+	// preserved (see retrieval.ReRanker.ReRank).
+	if d.reRankEnabled(body.EnableReRank) {
+		enriched, _ = d.runReRank(r.Context(), enriched, body.Query, body.Model)
+	}
+
 	// Optional: per-section answer-span extraction. Opt-in via config —
 	// one LLM call per returned section. Failures are non-fatal; the
 	// section is returned without a span.
@@ -499,11 +522,18 @@ func (d Deps) handleQuery(w http.ResponseWriter, r *http.Request) {
 }
 
 // sectionWithContent bundles a tree section with its loaded content
-// and an optional answer-span. Used by /v1/query and /v1/answer.
+// and optional re-rank score / answer-span. Used by /v1/query and
+// /v1/answer.
 type sectionWithContent struct {
 	sec     *tree.Section
 	content string
 	span    *retrieval.AnswerSpan
+
+	// hasScore reports whether score was populated by a re-rank pass.
+	// Distinct from score == 0 since 0 is a legitimate score the
+	// model can return.
+	hasScore bool
+	score    float64 // re-rank relevance score; only meaningful when hasScore is true
 }
 
 // sectionWithContentToMap renders the section as the API map shape.
@@ -528,6 +558,9 @@ func sectionWithContentToMap(e sectionWithContent) map[string]any {
 	if e.span != nil {
 		s["answer_span"] = e.span
 	}
+	if e.hasScore {
+		s["score"] = e.score
+	}
 	return s
 }
 
@@ -620,6 +653,10 @@ func (d Deps) handleAnswer(w http.ResponseWriter, r *http.Request) {
 		// EnablePlanning opts this request into the Phase 2.1 query
 		// planner. See handleQuery for the same field's semantics.
 		EnablePlanning *bool `json:"enable_planning"`
+		// EnableReRank opts this request into the Phase 2.3 re-rank
+		// pass. Synthesis then sees the re-ranked top-k. Overrides
+		// retrieval.rerank.enabled.
+		EnableReRank *bool `json:"enable_rerank"`
 	}
 	if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
 		writeErr(w, http.StatusBadRequest, "invalid json: "+err.Error())
@@ -699,6 +736,16 @@ func (d Deps) handleAnswer(w http.ResponseWriter, r *http.Request) {
 		enriched = append(enriched, sectionWithContent{sec: sec, content: content})
 	}
 
+	// Optional: content-aware re-rank before synthesis sees the
+	// evidence. When TopK is set the synthesis prompt only ever sees
+	// the post-rerank top-k, keeping the answer focused on the
+	// best-evidence sections.
+	if d.reRankEnabled(body.EnableReRank) {
+		var reRankUsage retrieval.Usage
+		enriched, reRankUsage = d.runReRank(r.Context(), enriched, body.Query, body.Model)
+		totalUsage.Add(reRankUsage)
+	}
+
 	// Always extract spans for /v1/answer — they ground each citation.
 	spanExtractor := d.spanExtractor(body.Model)
 	runSpansConcurrent(r.Context(), spanExtractor, body.Query, enriched, d.AnswerSpan.MaxConcurrency, d.Logger)
@@ -747,6 +794,9 @@ func (d Deps) handleAnswer(w http.ResponseWriter, r *http.Request) {
 				c["quote_end"] = e.span.End
 			}
 		}
+		if e.hasScore {
+			c["score"] = e.score
+		}
 		citations = append(citations, c)
 	}
 
@@ -1132,6 +1182,123 @@ func (d Deps) shouldDecompose(plan *retrieval.Plan) bool {
 	return d.Planning.Decompose
 }
 
+// --- re-rank helpers ---
+
+// reRankEnabled reports whether the request should go through the
+// re-rank pass. A non-nil bodyOverride (the request's `enable_rerank`
+// field) wins in both directions — true opts in, false opts out —
+// and a nil bodyOverride falls back to d.ReRank.Enabled.
+//
+// Always returns false when d.ReRanker or d.LLM is nil, regardless
+// of the request or config: re-rank cannot run without an LLM.
+func (d Deps) reRankEnabled(bodyOverride *bool) bool {
+	if d.ReRanker == nil || d.LLM == nil {
+		return false
+	}
+	if bodyOverride != nil {
+		return *bodyOverride
+	}
+	return d.ReRank.Enabled
+}
+
+// runReRank executes the re-rank pass over the loaded section slice
+// and returns the reordered slice plus the LLM Usage spent. If the
+// pass fails or yields no scores, the input slice is returned as-is
+// (no scores stamped, no TopK truncation). The error is LOGGED, not
+// returned — re-rank is best-effort and must never abort the request.
+// On success, d.ReRank.TopK > 0 truncates the reordered slice.
+//
+// requestModel is used only when the ReRanker's own Model is empty;
+// d.LLMModel is the last resort. NOTE(review): cmd/engine/main.go
+// pre-fills Model from the engine config, so confirm this is reachable.
+func (d Deps) runReRank(ctx context.Context, enriched []sectionWithContent, query, requestModel string) ([]sectionWithContent, retrieval.Usage) {
+	if d.ReRanker == nil || d.LLM == nil || len(enriched) == 0 {
+		return enriched, retrieval.Usage{}
+	}
+
+	// Apply the model fall-through: config override → request model →
+	// engine default. We don't mutate d.ReRanker since Deps is shared
+	// across requests; instead build a shallow copy with the chosen
+	// model. This is the same pattern spanExtractor() uses.
+	ranker := *d.ReRanker
+	if ranker.Model == "" {
+		if requestModel != "" {
+			ranker.Model = requestModel
+		} else {
+			ranker.Model = d.LLMModel
+		}
+	}
+
+	candidates := make([]retrieval.SectionContent, len(enriched))
+	for i, e := range enriched {
+		candidates[i] = retrieval.SectionContent{
+			ID:      e.sec.ID,
+			Title:   e.sec.Title,
+			Content: e.content,
+		}
+	}
+
+	scored, usage, err := ranker.ReRank(ctx, query, candidates)
+	if err != nil {
+		if d.Logger != nil {
+			d.Logger.Warn("rerank: failed; preserving strategy order", "err", err)
+		}
+		// ReRank returns input order on error so we *could* apply it
+		// (it'd just stamp score=0 on everything). Skip — the caller
+		// shouldn't see score=0 on every section when re-rank
+		// physically failed.
+		return enriched, usage
+	}
+	if len(scored) == 0 {
+		return enriched, usage
+	}
+
+	reordered := reorderByScore(enriched, scored)
+	if d.ReRank.TopK > 0 && len(reordered) > d.ReRank.TopK {
+		reordered = reordered[:d.ReRank.TopK]
+	}
+	return reordered, usage
+}
+
+// reorderByScore takes the loaded section slice and the model's
+// scored output (expected to arrive sorted descending by score from
+// the ReRanker) and returns a new slice that follows scored's order,
+// stamping hasScore/score on each matched entry. The function does
+// not sort; it trusts the order of scored.
+//
+// Defensive: IDs in scored that are unknown or repeated are skipped,
+// and any enriched entry scored never matched is appended at the end
+// in input order with hasScore untouched, so every input section
+// appears in the output exactly once.
+func reorderByScore(enriched []sectionWithContent, scored []retrieval.ScoredSection) []sectionWithContent {
+	byID := make(map[tree.SectionID]int, len(enriched))
+	for i, e := range enriched {
+		byID[e.sec.ID] = i
+	}
+
+	out := make([]sectionWithContent, 0, len(enriched))
+	taken := make([]bool, len(enriched))
+	for _, s := range scored {
+		idx, ok := byID[s.ID]
+		if !ok || taken[idx] {
+			continue
+		}
+		taken[idx] = true
+		e := enriched[idx]
+		e.hasScore = true
+		e.score = s.Score
+		out = append(out, e)
+	}
+	// Append anything ReRank didn't surface — invariant says this
+	// should be empty, but a defence-in-depth check costs nothing.
+	for i, e := range enriched {
+		if !taken[i] {
+			out = append(out, e)
+		}
+	}
+	return out
+}
+
 // writePlanHints appends a short, model-readable "Planner notes" block
 // describing the structured plan. Synthesis uses this to orient itself
 // before reading the evidence.

diff --git a/openapi.yaml b/openapi.yaml
@@ -462,6 +462,19 @@ components:
             retrieval fans out one selection call per sub-question
             and unions the results. Overrides the server's
             `retrieval.planning.enabled` setting for this request only.
+        enable_rerank:
+          type: boolean
+          description: |
+            Opt this request into the Phase 2.3 content-aware re-rank
+            pass. After the retrieval strategy returns candidate
+            section IDs and the engine loads their content, one extra
+            LLM call scores each section (0-100) against the query and
+            sections are reordered descending by score. When
+            `retrieval.rerank.top_k` is set on the server, only the
+            top-k sections survive. A failed re-rank call preserves the
+            strategy's original order and drops nothing. Overrides
+            the server's `retrieval.rerank.enabled` setting for this
+            request only.
 
     QueryResponse:
       type: object
@@ -513,6 +526,14 @@ components:
           description: Full section content from storage.
         answer_span:
           $ref: "#/components/schemas/AnswerSpan"
+        score:
+          type: number
+          description: |
+            Re-rank relevance score on a 0-100 scale, populated only
+            when the request opted into the Phase 2.3 re-rank pass
+            (`enable_rerank`) or the server has
+            `retrieval.rerank.enabled=true`. Sections are returned
+            sorted descending by score. Omitted when no re-rank ran.
 
     AnswerSpan:
       type: object
@@ -560,6 +581,14 @@ components:
             full semantics. When enabled, the synthesis prompt also sees
             the planner's structured intent and entity hints, and the
             response carries a top-level `plan` field.
+        enable_rerank:
+          type: boolean
+          description: |
+            Opt this request into the Phase 2.3 content-aware re-rank
+            pass. See QueryRequest.enable_rerank for full semantics.
+            When the pass runs, the synthesis prompt sees the
+            re-ranked top-k (capped by `retrieval.rerank.top_k`), and
+            each citation in the response carries a `score` field.
 
     AnswerResponse:
       type: object
@@ -647,6 +676,9 @@ components:
         found one). `quote_start`/`quote_end` give byte offsets into
         the source section's content. `page_start`/`page_end` are the
         section's page range — omitted for non-paginated formats.
+        `score` carries the re-rank relevance score on a 0-100 scale,
+        present only when the Phase 2.3 re-rank pass ran (request
+        opt-in or server `retrieval.rerank.enabled=true`).
       properties:
         section_id:
           type: string
@@ -662,3 +694,9 @@ components:
           type: integer
         page_end:
           type: integer
+        score:
+          type: number
+          description: |
+            Re-rank relevance score on a 0-100 scale. Omitted when no
+            re-rank ran. Higher means the section is more directly
+            relevant to the query.