Skip to content

Commit ab8f7cd

Browse files
committed
fix(explore): resolve search quality regressions
1 parent 14b2325 commit ab8f7cd

2 files changed

Lines changed: 243 additions & 109 deletions

File tree

src/explore.zig

Lines changed: 100 additions & 109 deletions
Original file line number | Diff line number | Diff line change
@@ -1556,54 +1556,74 @@ pub const Explorer = struct {
15561556
self.mu.lockShared();
15571557
defer self.mu.unlockShared();
15581558

1559+
if (max_results == 0) return try allocator.alloc(SearchResult, 0);
1560+
15591561
var result_list: std.ArrayList(SearchResult) = .empty;
15601562
errdefer result_list.deinit(allocator);
15611563

15621564
// searched tracks which paths have been scanned — shared across all tiers.
15631565
var searched = std.StringHashMap(void).init(allocator);
15641566
defer searched.deinit();
15651567

1566-
// Tier 0: word index direct lookup — O(1) hash lookup, no content scan.
1567-
// Issue #363a: a per-file cap forces diversity so a single hot doc
1568-
// file (CHANGELOG.md, architecture.md, etc.) can't saturate the quota
1569-
// and crowd out source-file matches that come later in the posting
1570-
// list. Cap = max(1, max_results / 5).
1571-
// Issue #430: process code-language hits FIRST, then doc-language
1572-
// hits. With max_results=50 the per-file cap is 10, so 5 markdown
1573-
// files with 10+ mentions each can fill result_list before the
1574-
// canonical source file's posting-list entries are reached.
1568+
// Tier 0: word index direct lookup — O(1) hash lookup plus bounded
1569+
// content extraction. A per-file cap forces diversity so a single hot
1570+
// file cannot saturate the quota. Code files are considered before
1571+
// docs, and files with more exact word hits are considered first so
1572+
// popular identifiers and skip-trigram canonical files are not hidden
1573+
// behind earlier low-signal posting-list entries.
15751574
const word_hits = self.word_index.search(query);
1576-
if (word_hits.len > 0 and word_hits.len <= max_results * 2) {
1577-
const tier0_per_file_cap: usize = @max(1, max_results / 5);
1578-
var tier0_per_file = std.StringHashMap(usize).init(allocator);
1579-
defer tier0_per_file.deinit();
1580-
const passes = [_]bool{ false, true }; // pass 0 = code, pass 1 = doc
1581-
for (passes) |is_doc_pass| {
1582-
if (result_list.items.len >= max_results) break;
1583-
for (word_hits) |hit| {
1584-
const hit_path = self.word_index.hitPath(hit);
1585-
if (hit_path.len == 0) continue;
1586-
if (isDocLanguage(detectLanguage(hit_path)) != is_doc_pass) continue;
1587-
const gop = tier0_per_file.getOrPut(hit_path) catch continue;
1588-
if (!gop.found_existing) gop.value_ptr.* = 0;
1589-
if (gop.value_ptr.* >= tier0_per_file_cap) continue;
1590-
const ref = self.readContentForSearch(hit_path, allocator) orelse continue;
1591-
defer ref.deinit();
1592-
const line_text = extractLineByNumber(ref.data, hit.line_num) orelse continue;
1593-
if (indexOfCaseInsensitive(line_text, query) == null) continue;
1594-
const duped_text = try allocator.dupe(u8, line_text);
1595-
errdefer allocator.free(duped_text);
1596-
const duped_path = try allocator.dupe(u8, hit_path);
1597-
errdefer allocator.free(duped_path);
1598-
try result_list.append(allocator, .{
1599-
.path = duped_path,
1600-
.line_num = hit.line_num,
1601-
.line_text = duped_text,
1602-
});
1603-
gop.value_ptr.* += 1;
1604-
searched.put(hit_path, {}) catch {};
1605-
if (result_list.items.len >= max_results) return self.rerankAndFinalize(&result_list, query, allocator);
1575+
if (word_hits.len > 0) {
1576+
const Tier0File = struct {
1577+
path: []const u8,
1578+
count: u32,
1579+
first_seen: usize,
1580+
};
1581+
1582+
var tier0_files_by_path = std.StringHashMap(Tier0File).init(allocator);
1583+
defer tier0_files_by_path.deinit();
1584+
1585+
for (word_hits, 0..) |hit, ordinal| {
1586+
const hit_path = self.word_index.hitPath(hit);
1587+
if (hit_path.len == 0) continue;
1588+
const gop = tier0_files_by_path.getOrPut(hit_path) catch continue;
1589+
if (!gop.found_existing) {
1590+
gop.value_ptr.* = .{
1591+
.path = hit_path,
1592+
.count = 0,
1593+
.first_seen = ordinal,
1594+
};
16061595
}
1596+
gop.value_ptr.count +|= 1;
1597+
}
1598+
1599+
var tier0_files: std.ArrayList(Tier0File) = .empty;
1600+
defer tier0_files.deinit(allocator);
1601+
try tier0_files.ensureTotalCapacity(allocator, tier0_files_by_path.count());
1602+
var tier0_iter = tier0_files_by_path.valueIterator();
1603+
while (tier0_iter.next()) |stats| {
1604+
tier0_files.appendAssumeCapacity(stats.*);
1605+
}
1606+
1607+
if (tier0_files.items.len > 1) {
1608+
std.sort.block(Tier0File, tier0_files.items, {}, struct {
1609+
pub fn lessThan(_: void, a: Tier0File, b: Tier0File) bool {
1610+
const a_doc = isDocLanguage(detectLanguage(a.path));
1611+
const b_doc = isDocLanguage(detectLanguage(b.path));
1612+
if (a_doc != b_doc) return !a_doc;
1613+
if (a.count != b.count) return a.count > b.count;
1614+
if (a.first_seen != b.first_seen) return a.first_seen < b.first_seen;
1615+
return std.mem.lessThan(u8, a.path, b.path);
1616+
}
1617+
}.lessThan);
1618+
}
1619+
1620+
const tier0_per_file_cap: usize = if (tier0_files.items.len <= 1) max_results else @max(1, max_results / 5);
1621+
for (tier0_files.items) |stats| {
1622+
if (result_list.items.len >= max_results) break;
1623+
const ref = self.readContentForSearch(stats.path, allocator) orelse continue;
1624+
defer ref.deinit();
1625+
searched.put(stats.path, {}) catch {};
1626+
try searchInContent(stats.path, ref.data, query, allocator, tier0_per_file_cap, max_results, &result_list);
16071627
}
16081628
if (result_list.items.len >= max_results)
16091629
return self.rerankAndFinalize(&result_list, query, allocator);
@@ -1634,6 +1654,8 @@ pub const Explorer = struct {
16341654
searched.put(hit_path, {}) catch {};
16351655
if (result_list.items.len >= max_results) break;
16361656
}
1657+
if (result_list.items.len >= max_results)
1658+
return self.rerankAndFinalize(&result_list, query, allocator);
16371659
}
16381660

16391661
const candidate_paths = self.trigram_index.candidates(query, allocator);
@@ -1786,7 +1808,7 @@ pub const Explorer = struct {
17861808

17871809
if (self.outlines.get(r.path)) |outline| {
17881810
for (outline.symbols.items) |sym| {
1789-
if (sym.line_start == r.line_num and std.mem.eql(u8, sym.name, query)) {
1811+
if (sym.line_start == r.line_num and asciiEqlIgnoreCase(sym.name, query)) {
17901812
score += 5.0;
17911813
break;
17921814
}
@@ -1796,17 +1818,20 @@ pub const Explorer = struct {
17961818
const basename = std.fs.path.basename(r.path);
17971819
const stem_end = std.mem.indexOfScalar(u8, basename, '.') orelse basename.len;
17981820
const stem = basename[0..stem_end];
1821+
const stem_contains_query = asciiContainsIgnoreCase(stem, query);
1822+
const query_contains_stem = asciiContainsIgnoreCase(query, stem);
1823+
const stem_related_to_query = stem_contains_query or query_contains_stem;
17991824
if (asciiEqlIgnoreCase(stem, query)) {
18001825
score += 15.0;
1801-
} else if (asciiContainsIgnoreCase(stem, query)) {
1826+
} else if (stem_related_to_query) {
18021827
score += 8.0;
18031828
}
18041829
// Path-segment match boost: query matches a directory segment in
18051830
// the path (e.g. query="parser" boosts src/parser/foo.zig). Weaker
18061831
// than basename match because the file's own name is a stronger
18071832
// intent signal than the directory it lives in. Skip when basename
18081833
// already matched to avoid double-counting.
1809-
if (!asciiContainsIgnoreCase(stem, query) and pathHasSegmentIgnoreCase(r.path, query)) {
1834+
if (!stem_related_to_query and pathHasSegmentIgnoreCase(r.path, query)) {
18101835
score += 6.0;
18111836
}
18121837

@@ -3849,8 +3874,14 @@ pub const Explorer = struct {
38493874

38503875
/// Search content and annotate results with the enclosing symbol scope.
38513876
pub fn searchContentWithScope(self: *Explorer, query: []const u8, allocator: std.mem.Allocator, max_results: usize) ![]const ScopedSearchResult {
3852-
self.mu.lockShared();
3853-
defer self.mu.unlockShared();
3877+
const plain_results = try self.searchContent(query, allocator, max_results);
3878+
defer {
3879+
for (plain_results) |r| {
3880+
allocator.free(r.line_text);
3881+
allocator.free(r.path);
3882+
}
3883+
allocator.free(plain_results);
3884+
}
38543885

38553886
var result_list: std.ArrayList(ScopedSearchResult) = .empty;
38563887
errdefer {
@@ -3861,69 +3892,30 @@ pub const Explorer = struct {
38613892
}
38623893
result_list.deinit(allocator);
38633894
}
3895+
try result_list.ensureTotalCapacity(allocator, plain_results.len);
38643896

3865-
const sparse_paths = self.sparse_ngram_index.candidates(query, allocator);
3866-
defer if (sparse_paths) |sp| allocator.free(sp);
3867-
const candidate_paths = self.trigram_index.candidates(query, allocator);
3868-
defer if (candidate_paths) |cp| allocator.free(cp);
3897+
self.mu.lockShared();
3898+
defer self.mu.unlockShared();
38693899

3870-
var searched = std.StringHashMap(void).init(allocator);
3871-
defer searched.deinit();
3900+
for (plain_results) |r| {
3901+
const line_text = try allocator.dupe(u8, r.line_text);
3902+
errdefer allocator.free(line_text);
3903+
const path_copy = try allocator.dupe(u8, r.path);
3904+
errdefer allocator.free(path_copy);
38723905

3873-
if (sparse_paths != null and sparse_paths.?.len > 0) {
3874-
if (candidate_paths != null and candidate_paths.?.len > 0) {
3875-
var sparse_set = std.StringHashMap(void).init(allocator);
3876-
defer sparse_set.deinit();
3877-
for (sparse_paths.?) |p| try sparse_set.put(p, {});
3878-
for (candidate_paths.?) |path| {
3879-
if (!sparse_set.contains(path)) continue;
3880-
const ref = self.readContentForSearch(path, allocator) orelse continue;
3881-
defer ref.deinit();
3882-
try searched.put(path, {});
3883-
try self.searchInContentWithScope(path, ref.data, query, allocator, max_results, &result_list);
3884-
if (result_list.items.len >= max_results) break;
3885-
}
3886-
} else {
3887-
for (sparse_paths.?) |path| {
3888-
const ref = self.readContentForSearch(path, allocator) orelse continue;
3889-
defer ref.deinit();
3890-
try searched.put(path, {});
3891-
try self.searchInContentWithScope(path, ref.data, query, allocator, max_results, &result_list);
3892-
if (result_list.items.len >= max_results) break;
3893-
}
3894-
}
3895-
} else {
3896-
const use_trigram = candidate_paths != null and candidate_paths.?.len > 0;
3897-
if (use_trigram) {
3898-
for (candidate_paths.?) |path| {
3899-
const ref = self.readContentForSearch(path, allocator) orelse continue;
3900-
defer ref.deinit();
3901-
try searched.put(path, {});
3902-
try self.searchInContentWithScope(path, ref.data, query, allocator, max_results, &result_list);
3903-
if (result_list.items.len >= max_results) break;
3904-
}
3905-
} else {
3906-
var iter = self.outlines.keyIterator();
3907-
while (iter.next()) |key_ptr| {
3908-
const ref = self.readContentForSearch(key_ptr.*, allocator) orelse continue;
3909-
defer ref.deinit();
3910-
try self.searchInContentWithScope(key_ptr.*, ref.data, query, allocator, max_results, &result_list);
3911-
if (result_list.items.len >= max_results) break;
3912-
}
3913-
return result_list.toOwnedSlice(allocator);
3914-
}
3915-
}
3906+
const scope = self.findEnclosingSymbolLocked(r.path, r.line_num);
3907+
const scope_name = if (scope) |s| try allocator.dupe(u8, s.name) else null;
3908+
errdefer if (scope_name) |n| allocator.free(n);
39163909

3917-
if (result_list.items.len < max_results) {
3918-
var iter = self.outlines.keyIterator();
3919-
while (iter.next()) |key_ptr| {
3920-
if (searched.contains(key_ptr.*)) continue;
3921-
if (self.trigram_index.containsFile(key_ptr.*)) continue;
3922-
const ref = self.readContentForSearch(key_ptr.*, allocator) orelse continue;
3923-
defer ref.deinit();
3924-
try self.searchInContentWithScope(key_ptr.*, ref.data, query, allocator, max_results, &result_list);
3925-
if (result_list.items.len >= max_results) break;
3926-
}
3910+
result_list.appendAssumeCapacity(.{
3911+
.path = path_copy,
3912+
.line_num = r.line_num,
3913+
.line_text = line_text,
3914+
.scope_name = scope_name,
3915+
.scope_kind = if (scope) |s| s.kind else null,
3916+
.scope_start = if (scope) |s| s.line_start else 0,
3917+
.scope_end = if (scope) |s| s.line_end else 0,
3918+
});
39273919
}
39283920

39293921
return result_list.toOwnedSlice(allocator);
@@ -4119,7 +4111,7 @@ pub fn isCommentOrBlank(line: []const u8, language: Language) bool {
41194111
}
41204112

41214113
fn searchInContent(path: []const u8, content: []const u8, query: []const u8, allocator: std.mem.Allocator, max_per_file: usize, max_results: usize, result_list: *std.ArrayList(SearchResult)) !void {
4122-
if (query.len == 0 or content.len == 0) return;
4114+
if (query.len == 0 or content.len == 0 or max_per_file == 0 or max_results == 0 or result_list.items.len >= max_results) return;
41234115
// Issue #431: bail when the query is longer than the file. Without this
41244116
// guard, `content.len - query.len + 1` below underflows usize → integer
41254117
// overflow panic in Debug, SIGBUS in ReleaseFast.
@@ -4141,7 +4133,7 @@ fn searchInContent(path: []const u8, content: []const u8, query: []const u8, all
41414133
const splat_lo: Vec = @splat(first_lower);
41424134
const splat_hi: Vec = @splat(first_upper);
41434135

4144-
while (pos < end) {
4136+
scan: while (pos < end) {
41454137
// ── SIMD path: process full 16-byte chunks ──
41464138
if (pos + VW <= end) {
41474139
const chunk: Vec = content[pos..][0..VW].*;
@@ -4155,7 +4147,6 @@ fn searchInContent(path: []const u8, content: []const u8, query: []const u8, all
41554147
}
41564148

41574149
// Process ALL first-byte candidates in this chunk without reloading.
4158-
var found_match = false;
41594150
while (mask != 0) {
41604151
const offset: usize = @ctz(mask);
41614152
const cand = pos + offset;
@@ -4185,12 +4176,12 @@ fn searchInContent(path: []const u8, content: []const u8, query: []const u8, all
41854176
current_line += 1;
41864177
current_line_start = line_end + 1;
41874178
pos = line_end + 1;
4188-
found_match = true;
4189-
break; // restart outer loop from new line
4179+
if (pos >= end) return;
4180+
continue :scan;
41904181
}
41914182
mask &= mask - 1; // clear lowest bit, try next candidate in chunk
41924183
}
4193-
if (!found_match) pos += VW; // all candidates were false positives
4184+
pos += VW; // all candidates were false positives
41944185
continue;
41954186
}
41964187

0 commit comments

Comments
 (0)