Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions src/explore.zig
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,9 @@ pub const Explorer = struct {
word_index: WordIndex,
trigram_index: AnyTrigramIndex,
sparse_ngram_index: SparseNgramIndex,
/// Paths indexed with skip_trigram=true (past 15k cap or excluded).
/// Used to restrict the searchContent fallback to only these files.
skip_trigram_files: std.StringHashMap(void),
allocator: std.mem.Allocator,
word_index_complete: bool = true,
word_index_can_load_from_disk: bool = false,
Expand All @@ -152,6 +155,7 @@ pub const Explorer = struct {
.word_index = WordIndex.init(allocator),
.trigram_index = .{ .heap = TrigramIndex.init(allocator) },
.sparse_ngram_index = SparseNgramIndex.init(allocator),
.skip_trigram_files = std.StringHashMap(void).init(allocator),
.allocator = allocator,
};
}
Expand Down Expand Up @@ -179,6 +183,7 @@ pub const Explorer = struct {
self.word_index.deinit();
self.trigram_index.deinit();
self.sparse_ngram_index.deinit();
self.skip_trigram_files.deinit();
if (self.root_dir) |*d| d.close();
}

Expand Down Expand Up @@ -270,15 +275,24 @@ pub const Explorer = struct {
self.word_index_can_load_from_disk = false;
}
try self.word_index.indexFile(stable_path, content);
// If trigram indexing fails below, restore word_index to its previous state
// to prevent word_index and trigram_index from diverging.
errdefer if (prior_content) |old| {
self.word_index.indexFile(stable_path, old) catch {};
} else {
self.word_index.removeFile(stable_path);
};
if (self.word_index_complete) {
self.word_index_generation +%= 1;
}
if (!skip_trigram) {
try self.trigram_index.indexFile(stable_path, content);
try self.sparse_ngram_index.indexFile(stable_path, content);
_ = self.skip_trigram_files.remove(stable_path);
} else {
self.trigram_index.removeFile(stable_path);
self.sparse_ngram_index.removeFile(stable_path);
try self.skip_trigram_files.put(stable_path, {});
Comment on lines 293 to +295
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Remove skip-trigram entries when files are deleted

The new skip-trigram tracking inserts paths into skip_trigram_files, but there is no corresponding removal on file deletion (Explorer.removeFile). That means deleted paths remain in this map permanently, and searchContent’s fallback loop keeps iterating stale entries and attempting file reads for missing files. In long-running sessions with high file churn, this regresses the intended optimization and causes unbounded growth in fallback work.

Useful? React with 👍 / 👎.

}
}

Expand Down Expand Up @@ -792,10 +806,9 @@ pub fn parseContentForIndexing(allocator: std.mem.Allocator, path: []const u8, c
}

if (result_list.items.len < max_results) {
var iter = self.outlines.keyIterator();
var iter = self.skip_trigram_files.keyIterator();
while (iter.next()) |key_ptr| {
if (searched.contains(key_ptr.*)) continue;
if (self.trigram_index.containsFile(key_ptr.*)) continue;
const ref = self.readContentForSearch(key_ptr.*, allocator) orelse continue;
defer ref.deinit();
try searchInContent(key_ptr.*, ref.data, query, allocator, max_results, &result_list);
Expand Down
72 changes: 55 additions & 17 deletions src/index.zig
Original file line number Diff line number Diff line change
Expand Up @@ -497,12 +497,17 @@ pub const PostingList = struct {
}

/// Remove the posting for `doc_id` from this list, if one is present.
///
/// The entry is located by binary search on `doc_id`, so this relies on
/// `self.items` being sorted ascending by `doc_id` with at most one posting
/// per id. NOTE(review): the insertion paths are not all visible here —
/// confirm every writer keeps the list sorted, otherwise the posting may be
/// missed and left behind.
///
/// Does nothing when no posting with `doc_id` exists.
pub fn removeDocId(self: *PostingList, doc_id: u32) void {
    const postings = self.items.items;
    var lo: usize = 0;
    var hi: usize = postings.len;
    while (lo < hi) {
        // Overflow-safe midpoint of the half-open range [lo, hi).
        const mid = lo + (hi - lo) / 2;
        const current = postings[mid].doc_id;
        if (current < doc_id) {
            lo = mid + 1;
        } else if (current > doc_id) {
            hi = mid;
        } else {
            // Ordered removal keeps the remaining postings sorted.
            // `postings` is not touched again after this point.
            _ = self.items.orderedRemove(mid);
            return;
        }
    }
}
Expand All @@ -515,8 +520,10 @@ pub const TrigramIndex = struct {
file_trigrams: std.StringHashMap(std.ArrayList(Trigram)),
/// path → doc_id mapping
path_to_id: std.StringHashMap(u32),
/// doc_id → path mapping
/// doc_id → path mapping (may contain "" sentinels for freed slots)
id_to_path: std.ArrayList([]const u8),
/// freed doc_id slots available for reuse by getOrCreateDocId
free_ids: std.ArrayList(u32),
allocator: std.mem.Allocator,
/// When true, deinit frees the path keys in file_trigrams (set by readFromDisk).
owns_paths: bool = false,
Expand All @@ -527,6 +534,7 @@ pub const TrigramIndex = struct {
.file_trigrams = std.StringHashMap(std.ArrayList(Trigram)).init(allocator),
.path_to_id = std.StringHashMap(u32).init(allocator),
.id_to_path = .{},
.free_ids = .{},
.allocator = allocator,
};
}
Expand All @@ -547,12 +555,20 @@ pub const TrigramIndex = struct {

self.path_to_id.deinit();
self.id_to_path.deinit(self.allocator);
self.free_ids.deinit(self.allocator);
}

/// Return the doc_id already mapped to `path`, or assign one.
///
/// A freed slot from `free_ids` is reused when available; otherwise a new id
/// equal to the current length of `id_to_path` is appended. `path` is stored
/// as given (the slice is not copied), so the caller must keep it alive for
/// as long as the mapping exists.
///
/// Errors: returns the allocator error if reserving the `path_to_id` entry or
/// growing `id_to_path` fails. Map capacity is reserved before anything is
/// mutated, so on failure no free slot has been consumed and `id_to_path`
/// holds no entry that lacks a `path_to_id` mapping.
fn getOrCreateDocId(self: *TrigramIndex, path: []const u8) !u32 {
    if (self.path_to_id.get(path)) |existing| return existing;

    // Do the fallible map growth first; the final insert below cannot fail.
    try self.path_to_id.ensureUnusedCapacity(1);

    const id: u32 = if (self.free_ids.pop()) |freed| blk: {
        // Reuse a previously freed slot; id_to_path does not grow.
        self.id_to_path.items[freed] = path;
        break :blk freed;
    } else blk: {
        // No free slot: the new id is the next index in id_to_path.
        const new_id: u32 = @intCast(self.id_to_path.items.len);
        try self.id_to_path.append(self.allocator, path);
        break :blk new_id;
    };

    self.path_to_id.putAssumeCapacity(path, id);
    return id;
}
Expand All @@ -564,6 +580,11 @@ pub const TrigramIndex = struct {
_ = self.file_trigrams.remove(path);
return;
};
// Always clean path_to_id first, regardless of whether file_trigrams has an entry.
_ = self.path_to_id.remove(path);
// Free the doc_id slot for reuse on next indexFile call.
self.free_ids.append(self.allocator, doc_id) catch {};
self.id_to_path.items[doc_id] = "";
const trigrams = self.file_trigrams.getPtr(path) orelse return;
for (trigrams.items) |tri| {
if (self.index.getPtr(tri)) |posting_list| {
Expand All @@ -576,13 +597,17 @@ pub const TrigramIndex = struct {
}
trigrams.deinit(self.allocator);
_ = self.file_trigrams.remove(path);
_ = self.path_to_id.remove(path);
}

pub fn indexFile(self: *TrigramIndex, path: []const u8, content: []const u8) !void {
const id_count_before = self.id_to_path.items.len;
self.removeFile(path);

const doc_id = try self.getOrCreateDocId(path);
// If id_to_path grew, this is a brand-new file (doc_id == max), so append
// maintains sorted PostingList order. If it did not grow, a freed slot was
// reused and we must use sorted insert to preserve the invariant.
const is_new_doc = self.id_to_path.items.len > id_count_before;

// Phase 1: accumulate masks locally per trigram (no global index writes)
var local = std.AutoHashMap(Trigram, PostingMask).init(self.allocator);
Expand Down Expand Up @@ -630,12 +655,19 @@ pub const TrigramIndex = struct {
if (!idx_gop.found_existing) {
idx_gop.value_ptr.* = .{ .path_to_id = &self.path_to_id };
}
// Single append (not sorted insert) since doc_id is monotonically increasing
try idx_gop.value_ptr.items.append(self.allocator, .{
.doc_id = doc_id,
.next_mask = mask.next_mask,
.loc_mask = mask.loc_mask,
});
if (is_new_doc) {
// New doc_id is always max: append maintains sorted PostingList order.
try idx_gop.value_ptr.items.append(self.allocator, .{
.doc_id = doc_id,
.next_mask = mask.next_mask,
.loc_mask = mask.loc_mask,
});
} else {
// Reused doc_id: sorted insert to maintain PostingList binary-search invariant.
const posting = try idx_gop.value_ptr.getOrAddPosting(self.allocator, doc_id);
posting.next_mask = mask.next_mask;
posting.loc_mask = mask.loc_mask;
}

try tri_list.append(self.allocator, tri);
}
Expand Down Expand Up @@ -1649,7 +1681,10 @@ pub const AnyTrigramIndex = union(enum) {
result.ensureTotalCapacity(allocator, merged.count()) catch break :blk null;
var it = merged.keyIterator();
while (it.next()) |k| result.appendAssumeCapacity(k.*);
break :blk result.toOwnedSlice(allocator) catch null;
break :blk result.toOwnedSlice(allocator) catch {
result.deinit(allocator);
break :blk null;
};
},
};
}
Expand Down Expand Up @@ -1682,7 +1717,10 @@ pub const AnyTrigramIndex = union(enum) {
result.ensureTotalCapacity(allocator, merged.count()) catch break :blk null;
var it = merged.keyIterator();
while (it.next()) |k| result.appendAssumeCapacity(k.*);
break :blk result.toOwnedSlice(allocator) catch null;
break :blk result.toOwnedSlice(allocator) catch {
result.deinit(allocator);
break :blk null;
};
},
};
}
Expand Down
1 change: 1 addition & 0 deletions src/lib.zig
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,5 @@ pub const applyEdit = @import("edit.zig").applyEdit;

pub const watcher = @import("watcher.zig");
pub const mcp = @import("mcp.zig");
pub const snapshot = @import("snapshot.zig");
pub const snapshot_json = @import("snapshot_json.zig");
61 changes: 15 additions & 46 deletions src/main.zig
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ const snapshot_mod = @import("snapshot.zig");
const telemetry = @import("telemetry.zig");
const root_policy = @import("root_policy.zig");
const nuke_mod = @import("nuke.zig");
const update_mod = @import("update.zig");
const release_info = @import("release_info.zig");

/// Thin wrapper: format + write to a File via allocator.
const Out = struct {
Expand Down Expand Up @@ -100,7 +102,7 @@ fn mainImpl() !void {

// Handle --version early (no root needed)
if (std.mem.eql(u8, cmd, "--version") or std.mem.eql(u8, cmd, "-v") or std.mem.eql(u8, cmd, "version")) {
out.p("codedb 0.2.56\n", .{});
out.p("codedb {s}\n", .{release_info.semver});
return;
}

Expand All @@ -110,48 +112,9 @@ fn mainImpl() !void {
return;
}

// Handle update command — direct binary download from GitHub releases.
// The CDN install script has issues with set -euo pipefail on macOS,
// so we download the binary directly and replace in-place.
// Handle update command early — before root resolution so it works from anywhere.
if (std.mem.eql(u8, cmd, "update")) {
out.p("updating codedb...\n", .{});
var child = std.process.Child.init(
&.{
"/bin/bash", "-c",
\\set -e
\\PLATFORM="$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m)"
\\case "$PLATFORM" in
\\ darwin-arm64) BIN="codedb-darwin-arm64" ;;
\\ darwin-x86_64) BIN="codedb-darwin-x86_64" ;;
\\ linux-x86_64) BIN="codedb-linux-x86_64" ;;
\\ linux-aarch64) BIN="codedb-linux-aarch64" ;;
\\ *) echo "unsupported platform: $PLATFORM" >&2; exit 1 ;;
\\esac
\\VERSION=$(curl -fsSL https://api.github.com/repos/justrach/codedb/releases/latest 2>/dev/null | grep -oE '"tag_name"\s*:\s*"v[^"]*"' | cut -d'"' -f4 | sed 's/^v//')
\\if [ -z "$VERSION" ]; then
\\ VERSION=$(curl -fsSL https://codedb.codegraff.com/latest.json | grep -oE '"version"\s*:\s*"[^"]*"' | cut -d'"' -f4)
\\fi
\\if [ -z "$VERSION" ]; then
\\ echo "failed to determine latest version" >&2
\\ exit 1
\\fi
\\echo " latest: v${VERSION}"
\\TMP=$(mktemp)
\\curl -fsSL "https://github.com/justrach/codedb/releases/download/v${VERSION}/${BIN}" -o "$TMP"
\\SELF=$(which codedb 2>/dev/null || echo "$HOME/bin/codedb")
\\chmod +x "$TMP"
\\mv -f "$TMP" "$SELF"
\\echo " updated: $($SELF --version)"
},
allocator,
);
child.stdin_behavior = .Inherit;
child.stdout_behavior = .Inherit;
child.stderr_behavior = .Inherit;
_ = child.spawnAndWait() catch {
out.p("update failed\n", .{});
std.process.exit(1);
};
update_mod.run(stdout, s, allocator);
return;
}

Expand Down Expand Up @@ -732,10 +695,6 @@ fn printUsage(out: Out, s: sty.Style) void {
\\ {s}find{s} {s}<name>{s} find where a symbol is defined
\\ {s}search{s} {s}<query>{s} full-text search (trigram, case-insensitive)
\\ {s}word{s} {s}<identifier>{s} exact word lookup via inverted index
\\ {s}hot{s} recently modified files
\\ {s}serve{s} HTTP daemon on :7719
\\ {s}mcp{s} JSON-RPC/MCP server over stdio
\\ {s}nuke{s} uninstall codedb, clear caches, and deregister integrations
\\
, .{
s.bold, s.reset,
Expand All @@ -750,6 +709,16 @@ fn printUsage(out: Out, s: sty.Style) void {
s.dim, s.reset,
s.cyan, s.reset,
s.dim, s.reset,
});
out.p(
\\ {s}hot{s} recently modified files
\\ {s}serve{s} HTTP daemon on :7719
\\ {s}mcp{s} JSON-RPC/MCP server over stdio
\\ {s}update{s} self-update to the latest verified release
\\ {s}nuke{s} uninstall codedb, clear caches, and deregister integrations
\\
, .{
s.cyan, s.reset,
s.cyan, s.reset,
s.cyan, s.reset,
s.cyan, s.reset,
Expand Down
9 changes: 6 additions & 3 deletions src/mcp.zig
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ const snapshot_mod = @import("snapshot.zig");
const telemetry_mod = @import("telemetry.zig");
const git_mod = @import("git.zig");
const root_policy = @import("root_policy.zig");
const release_info = @import("release_info.zig");
// ── Project cache ────────────────────────────────────────────────────────────

const ProjectCtx = struct {
Expand Down Expand Up @@ -493,9 +494,11 @@ fn handleInitialize(s: *Session, root: *const std.json.ObjectMap, id: ?std.json.
s.client_name = name;
}
}
writeResult(s.alloc, s.stdout, id,
\\{"protocolVersion":"2025-06-18","capabilities":{"tools":{"listChanged":false}},"serverInfo":{"name":"codedb","version":"0.2.56"}}
);
const init_result = std.fmt.allocPrint(s.alloc,
\\{{"protocolVersion":"2025-06-18","capabilities":{{"tools":{{"listChanged":false}}}},"serverInfo":{{"name":"codedb","version":"{s}"}}}}
, .{release_info.semver}) catch return;
defer s.alloc.free(init_result);
writeResult(s.alloc, s.stdout, id, init_result);
}

fn requestRoots(s: *Session) void {
Expand Down
Loading
Loading