@@ -1556,54 +1556,74 @@ pub const Explorer = struct {
15561556 self .mu .lockShared ();
15571557 defer self .mu .unlockShared ();
15581558
1559+ if (max_results == 0 ) return try allocator .alloc (SearchResult , 0 );
1560+
15591561 var result_list : std .ArrayList (SearchResult ) = .empty ;
15601562 errdefer result_list .deinit (allocator );
15611563
15621564 // searched tracks which paths have been scanned — shared across all tiers.
15631565 var searched = std .StringHashMap (void ).init (allocator );
15641566 defer searched .deinit ();
15651567
1566- // Tier 0: word index direct lookup — O(1) hash lookup, no content scan.
1567- // Issue #363a: a per-file cap forces diversity so a single hot doc
1568- // file (CHANGELOG.md, architecture.md, etc.) can't saturate the quota
1569- // and crowd out source-file matches that come later in the posting
1570- // list. Cap = max(1, max_results / 5).
1571- // Issue #430: process code-language hits FIRST, then doc-language
1572- // hits. With max_results=50 the per-file cap is 10, so 5 markdown
1573- // files with 10+ mentions each can fill result_list before the
1574- // canonical source file's posting-list entries are reached.
1568+ // Tier 0: word index direct lookup — O(1) hash lookup plus bounded
1569+ // content extraction. A per-file cap forces diversity so a single hot
1570+ // file cannot saturate the quota. Code files are considered before
1571+ // docs, and files with more exact word hits are considered first so
1572+ // popular identifiers and skip-trigram canonical files are not hidden
1573+ // behind earlier low-signal posting-list entries.
15751574 const word_hits = self .word_index .search (query );
1576- if (word_hits .len > 0 and word_hits .len <= max_results * 2 ) {
1577- const tier0_per_file_cap : usize = @max (1 , max_results / 5 );
1578- var tier0_per_file = std .StringHashMap (usize ).init (allocator );
1579- defer tier0_per_file .deinit ();
1580- const passes = [_ ]bool { false , true }; // pass 0 = code, pass 1 = doc
1581- for (passes ) | is_doc_pass | {
1582- if (result_list .items .len >= max_results ) break ;
1583- for (word_hits ) | hit | {
1584- const hit_path = self .word_index .hitPath (hit );
1585- if (hit_path .len == 0 ) continue ;
1586- if (isDocLanguage (detectLanguage (hit_path )) != is_doc_pass ) continue ;
1587- const gop = tier0_per_file .getOrPut (hit_path ) catch continue ;
1588- if (! gop .found_existing ) gop .value_ptr .* = 0 ;
1589- if (gop .value_ptr .* >= tier0_per_file_cap ) continue ;
1590- const ref = self .readContentForSearch (hit_path , allocator ) orelse continue ;
1591- defer ref .deinit ();
1592- const line_text = extractLineByNumber (ref .data , hit .line_num ) orelse continue ;
1593- if (indexOfCaseInsensitive (line_text , query ) == null ) continue ;
1594- const duped_text = try allocator .dupe (u8 , line_text );
1595- errdefer allocator .free (duped_text );
1596- const duped_path = try allocator .dupe (u8 , hit_path );
1597- errdefer allocator .free (duped_path );
1598- try result_list .append (allocator , .{
1599- .path = duped_path ,
1600- .line_num = hit .line_num ,
1601- .line_text = duped_text ,
1602- });
1603- gop .value_ptr .* += 1 ;
1604- searched .put (hit_path , {}) catch {};
1605- if (result_list .items .len >= max_results ) return self .rerankAndFinalize (& result_list , query , allocator );
1575+ if (word_hits .len > 0 ) {
1576+ const Tier0File = struct {
1577+ path : []const u8 ,
1578+ count : u32 ,
1579+ first_seen : usize ,
1580+ };
1581+
1582+ var tier0_files_by_path = std .StringHashMap (Tier0File ).init (allocator );
1583+ defer tier0_files_by_path .deinit ();
1584+
1585+ for (word_hits , 0.. ) | hit , ordinal | {
1586+ const hit_path = self .word_index .hitPath (hit );
1587+ if (hit_path .len == 0 ) continue ;
1588+ const gop = tier0_files_by_path .getOrPut (hit_path ) catch continue ;
1589+ if (! gop .found_existing ) {
1590+ gop .value_ptr .* = .{
1591+ .path = hit_path ,
1592+ .count = 0 ,
1593+ .first_seen = ordinal ,
1594+ };
16061595 }
1596+ gop .value_ptr .count + |= 1 ;
1597+ }
1598+
1599+ var tier0_files : std .ArrayList (Tier0File ) = .empty ;
1600+ defer tier0_files .deinit (allocator );
1601+ try tier0_files .ensureTotalCapacity (allocator , tier0_files_by_path .count ());
1602+ var tier0_iter = tier0_files_by_path .valueIterator ();
1603+ while (tier0_iter .next ()) | stats | {
1604+ tier0_files .appendAssumeCapacity (stats .* );
1605+ }
1606+
1607+ if (tier0_files .items .len > 1 ) {
1608+ std .sort .block (Tier0File , tier0_files .items , {}, struct {
1609+ pub fn lessThan (_ : void , a : Tier0File , b : Tier0File ) bool {
1610+ const a_doc = isDocLanguage (detectLanguage (a .path ));
1611+ const b_doc = isDocLanguage (detectLanguage (b .path ));
1612+ if (a_doc != b_doc ) return ! a_doc ;
1613+ if (a .count != b .count ) return a .count > b .count ;
1614+ if (a .first_seen != b .first_seen ) return a .first_seen < b .first_seen ;
1615+ return std .mem .lessThan (u8 , a .path , b .path );
1616+ }
1617+ }.lessThan );
1618+ }
1619+
1620+ const tier0_per_file_cap : usize = if (tier0_files .items .len <= 1 ) max_results else @max (1 , max_results / 5 );
1621+ for (tier0_files .items ) | stats | {
1622+ if (result_list .items .len >= max_results ) break ;
1623+ const ref = self .readContentForSearch (stats .path , allocator ) orelse continue ;
1624+ defer ref .deinit ();
1625+ searched .put (stats .path , {}) catch {};
1626+ try searchInContent (stats .path , ref .data , query , allocator , tier0_per_file_cap , max_results , & result_list );
16071627 }
16081628 if (result_list .items .len >= max_results )
16091629 return self .rerankAndFinalize (& result_list , query , allocator );
@@ -1634,6 +1654,8 @@ pub const Explorer = struct {
16341654 searched .put (hit_path , {}) catch {};
16351655 if (result_list .items .len >= max_results ) break ;
16361656 }
1657+ if (result_list .items .len >= max_results )
1658+ return self .rerankAndFinalize (& result_list , query , allocator );
16371659 }
16381660
16391661 const candidate_paths = self .trigram_index .candidates (query , allocator );
@@ -1786,7 +1808,7 @@ pub const Explorer = struct {
17861808
17871809 if (self .outlines .get (r .path )) | outline | {
17881810 for (outline .symbols .items ) | sym | {
1789- if (sym .line_start == r .line_num and std . mem . eql ( u8 , sym .name , query )) {
1811+ if (sym .line_start == r .line_num and asciiEqlIgnoreCase ( sym .name , query )) {
17901812 score += 5.0 ;
17911813 break ;
17921814 }
@@ -1796,17 +1818,20 @@ pub const Explorer = struct {
17961818 const basename = std .fs .path .basename (r .path );
17971819 const stem_end = std .mem .indexOfScalar (u8 , basename , '.' ) orelse basename .len ;
17981820 const stem = basename [0.. stem_end ];
1821+ const stem_contains_query = asciiContainsIgnoreCase (stem , query );
1822+ const query_contains_stem = asciiContainsIgnoreCase (query , stem );
1823+ const stem_related_to_query = stem_contains_query or query_contains_stem ;
17991824 if (asciiEqlIgnoreCase (stem , query )) {
18001825 score += 15.0 ;
1801- } else if (asciiContainsIgnoreCase ( stem , query ) ) {
1826+ } else if (stem_related_to_query ) {
18021827 score += 8.0 ;
18031828 }
18041829 // Path-segment match boost: query matches a directory segment in
18051830 // the path (e.g. query="parser" boosts src/parser/foo.zig). Weaker
18061831 // than basename match because the file's own name is a stronger
18071832 // intent signal than the directory it lives in. Skip when basename
18081833 // already matched to avoid double-counting.
1809- if (! asciiContainsIgnoreCase ( stem , query ) and pathHasSegmentIgnoreCase (r .path , query )) {
1834+ if (! stem_related_to_query and pathHasSegmentIgnoreCase (r .path , query )) {
18101835 score += 6.0 ;
18111836 }
18121837
@@ -3849,8 +3874,14 @@ pub const Explorer = struct {
38493874
38503875 /// Search content and annotate results with the enclosing symbol scope.
38513876 pub fn searchContentWithScope (self : * Explorer , query : []const u8 , allocator : std.mem.Allocator , max_results : usize ) ! []const ScopedSearchResult {
3852- self .mu .lockShared ();
3853- defer self .mu .unlockShared ();
3877+ const plain_results = try self .searchContent (query , allocator , max_results );
3878+ defer {
3879+ for (plain_results ) | r | {
3880+ allocator .free (r .line_text );
3881+ allocator .free (r .path );
3882+ }
3883+ allocator .free (plain_results );
3884+ }
38543885
38553886 var result_list : std .ArrayList (ScopedSearchResult ) = .empty ;
38563887 errdefer {
@@ -3861,69 +3892,30 @@ pub const Explorer = struct {
38613892 }
38623893 result_list .deinit (allocator );
38633894 }
3895+ try result_list .ensureTotalCapacity (allocator , plain_results .len );
38643896
3865- const sparse_paths = self .sparse_ngram_index .candidates (query , allocator );
3866- defer if (sparse_paths ) | sp | allocator .free (sp );
3867- const candidate_paths = self .trigram_index .candidates (query , allocator );
3868- defer if (candidate_paths ) | cp | allocator .free (cp );
3897+ self .mu .lockShared ();
3898+ defer self .mu .unlockShared ();
38693899
3870- var searched = std .StringHashMap (void ).init (allocator );
3871- defer searched .deinit ();
3900+ for (plain_results ) | r | {
3901+ const line_text = try allocator .dupe (u8 , r .line_text );
3902+ errdefer allocator .free (line_text );
3903+ const path_copy = try allocator .dupe (u8 , r .path );
3904+ errdefer allocator .free (path_copy );
38723905
3873- if (sparse_paths != null and sparse_paths .? .len > 0 ) {
3874- if (candidate_paths != null and candidate_paths .? .len > 0 ) {
3875- var sparse_set = std .StringHashMap (void ).init (allocator );
3876- defer sparse_set .deinit ();
3877- for (sparse_paths .? ) | p | try sparse_set .put (p , {});
3878- for (candidate_paths .? ) | path | {
3879- if (! sparse_set .contains (path )) continue ;
3880- const ref = self .readContentForSearch (path , allocator ) orelse continue ;
3881- defer ref .deinit ();
3882- try searched .put (path , {});
3883- try self .searchInContentWithScope (path , ref .data , query , allocator , max_results , & result_list );
3884- if (result_list .items .len >= max_results ) break ;
3885- }
3886- } else {
3887- for (sparse_paths .? ) | path | {
3888- const ref = self .readContentForSearch (path , allocator ) orelse continue ;
3889- defer ref .deinit ();
3890- try searched .put (path , {});
3891- try self .searchInContentWithScope (path , ref .data , query , allocator , max_results , & result_list );
3892- if (result_list .items .len >= max_results ) break ;
3893- }
3894- }
3895- } else {
3896- const use_trigram = candidate_paths != null and candidate_paths .? .len > 0 ;
3897- if (use_trigram ) {
3898- for (candidate_paths .? ) | path | {
3899- const ref = self .readContentForSearch (path , allocator ) orelse continue ;
3900- defer ref .deinit ();
3901- try searched .put (path , {});
3902- try self .searchInContentWithScope (path , ref .data , query , allocator , max_results , & result_list );
3903- if (result_list .items .len >= max_results ) break ;
3904- }
3905- } else {
3906- var iter = self .outlines .keyIterator ();
3907- while (iter .next ()) | key_ptr | {
3908- const ref = self .readContentForSearch (key_ptr .* , allocator ) orelse continue ;
3909- defer ref .deinit ();
3910- try self .searchInContentWithScope (key_ptr .* , ref .data , query , allocator , max_results , & result_list );
3911- if (result_list .items .len >= max_results ) break ;
3912- }
3913- return result_list .toOwnedSlice (allocator );
3914- }
3915- }
3906+ const scope = self .findEnclosingSymbolLocked (r .path , r .line_num );
3907+ const scope_name = if (scope ) | s | try allocator .dupe (u8 , s .name ) else null ;
3908+ errdefer if (scope_name ) | n | allocator .free (n );
39163909
3917- if (result_list .items .len < max_results ) {
3918- var iter = self .outlines .keyIterator ();
3919- while (iter .next ()) | key_ptr | {
3920- if (searched .contains (key_ptr .* )) continue ;
3921- if (self .trigram_index .containsFile (key_ptr .* )) continue ;
3922- const ref = self .readContentForSearch (key_ptr .* , allocator ) orelse continue ;
3923- defer ref .deinit ();
3924- try self .searchInContentWithScope (key_ptr .* , ref .data , query , allocator , max_results , & result_list );
3925- if (result_list .items .len >= max_results ) break ;
3926- }
3910+ result_list .appendAssumeCapacity (.{
3911+ .path = path_copy ,
3912+ .line_num = r .line_num ,
3913+ .line_text = line_text ,
3914+ .scope_name = scope_name ,
3915+ .scope_kind = if (scope ) | s | s .kind else null ,
3916+ .scope_start = if (scope ) | s | s .line_start else 0 ,
3917+ .scope_end = if (scope ) | s | s .line_end else 0 ,
3918+ });
39273919 }
39283920
39293921 return result_list .toOwnedSlice (allocator );
@@ -4119,7 +4111,7 @@ pub fn isCommentOrBlank(line: []const u8, language: Language) bool {
41194111}
41204112
41214113fn searchInContent (path : []const u8 , content : []const u8 , query : []const u8 , allocator : std.mem.Allocator , max_per_file : usize , max_results : usize , result_list : * std .ArrayList (SearchResult )) ! void {
4122- if (query .len == 0 or content .len == 0 ) return ;
4114+ if (query .len == 0 or content .len == 0 or max_per_file == 0 or max_results == 0 or result_list . items . len >= max_results ) return ;
41234115 // Issue #431: bail when the query is longer than the file. Without this
41244116 // guard, `content.len - query.len + 1` below underflows usize → integer
41254117 // overflow panic in Debug, SIGBUS in ReleaseFast.
@@ -4141,7 +4133,7 @@ fn searchInContent(path: []const u8, content: []const u8, query: []const u8, all
41414133 const splat_lo : Vec = @splat (first_lower );
41424134 const splat_hi : Vec = @splat (first_upper );
41434135
4144- while (pos < end ) {
4136+ scan : while (pos < end ) {
41454137 // ── SIMD path: process full 16-byte chunks ──
41464138 if (pos + VW <= end ) {
41474139 const chunk : Vec = content [pos .. ][0.. VW ].* ;
@@ -4155,7 +4147,6 @@ fn searchInContent(path: []const u8, content: []const u8, query: []const u8, all
41554147 }
41564148
41574149 // Process ALL first-byte candidates in this chunk without reloading.
4158- var found_match = false ;
41594150 while (mask != 0 ) {
41604151 const offset : usize = @ctz (mask );
41614152 const cand = pos + offset ;
@@ -4185,12 +4176,12 @@ fn searchInContent(path: []const u8, content: []const u8, query: []const u8, all
41854176 current_line += 1 ;
41864177 current_line_start = line_end + 1 ;
41874178 pos = line_end + 1 ;
4188- found_match = true ;
4189- break ; // restart outer loop from new line
4179+ if ( pos >= end ) return ;
4180+ continue : scan ;
41904181 }
41914182 mask &= mask - 1 ; // clear lowest bit, try next candidate in chunk
41924183 }
4193- if ( ! found_match ) pos += VW ; // all candidates were false positives
4184+ pos += VW ; // all candidates were false positives
41944185 continue ;
41954186 }
41964187
0 commit comments