Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 27 additions & 25 deletions crate/src/merge.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
use super::types::Chunk;
use super::utils::{header_is_superset_of, set_length};

/// Pool of `#` characters for building markdown header prefixes by slicing
/// (`&HASHES[..level]`), avoiding a `"#".repeat(level)` allocation per merge.
/// Six characters because markdown headers go up to level 6.
const HASHES: &str = "######";

fn should_merge(a_length: usize, b_length: usize, min_length: usize, max_length: usize) -> bool {
if a_length >= min_length && b_length >= min_length {
return false;
Expand All @@ -26,7 +28,9 @@ pub fn merge_phase2(chunks: Vec<Chunk>, min_length: usize, max_length: usize) ->
if prev.breadcrumb == chunk.breadcrumb
&& should_merge(prev.length, chunk.length, min_length, max_length)
{
prev.text = format!("{}\n\n{}", prev.text, chunk.text);
prev.text.reserve(chunk.text.len() + 2);
prev.text.push_str("\n\n");
prev.text.push_str(&chunk.text);
set_length(&mut prev);
current = Some(prev);
continue;
Expand Down Expand Up @@ -58,42 +62,40 @@ pub fn merge_phase3(chunks: Vec<Chunk>, min_length: usize, max_length: usize) ->

for level in (1..=6).rev() {
let mut merged = Vec::with_capacity(result.len());
let mut i = 0;

while i < result.len() {
if result[i].level == level && result[i].length < max_length {
let mut current = result[i].clone();
let parent_headers = current.headers.clone();
i += 1;
let mut iter = std::mem::take(&mut result).into_iter().peekable();

while i < result.len() {
let is_child = result[i].level > level
&& header_is_superset_of(&parent_headers, &result[i].headers);
while let Some(mut current) = iter.next() {
if current.level == level && current.length < max_length {
while let Some(next) = iter.peek() {
let is_child = next.level > level
&& header_is_superset_of(&current.headers, &next.headers);

if !is_child {
break;
}

if should_merge(current.length, result[i].length, min_length, max_length) {
let child = &result[i];
let header_prefix = "#".repeat(child.level as usize);
let child_header = child.header.as_deref().unwrap_or("");

current.text = format!(
"{}\n\n{} {}\n\n{}",
current.text, header_prefix, child_header, child.text
);
set_length(&mut current);
i += 1;
} else {
if !should_merge(current.length, next.length, min_length, max_length) {
break;
}

let child = iter.next().unwrap();
let header_prefix = &HASHES[..child.level.min(6) as usize];
let child_header = child.header.as_deref().unwrap_or("");
current.text.reserve(
2 + header_prefix.len() + 1 + child_header.len() + 2 + child.text.len(),
);
current.text.push_str("\n\n");
current.text.push_str(header_prefix);
current.text.push(' ');
current.text.push_str(child_header);
current.text.push_str("\n\n");
current.text.push_str(&child.text);
set_length(&mut current);
}

merged.push(current);
} else {
merged.push(result[i].clone());
i += 1;
merged.push(current);
}
}

Expand Down
36 changes: 19 additions & 17 deletions crate/src/split.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,10 @@ pub fn split_by_headers(text: &str, title: Option<&str>) -> Vec<Chunk> {
if let Some(first_match) = first_header {
let preface_content = &text_without_code[..first_match.start()];

let paragraphs: Vec<&str> = PARAGRAPH_SPLIT_REGEX
for paragraph in PARAGRAPH_SPLIT_REGEX
.split(preface_content)
.filter(|p| !p.trim().is_empty())
.collect();

for paragraph in paragraphs {
{
let restored_content = restore_code_placeholders(paragraph.trim(), &code_blocks);

let mut chunk = Chunk {
Expand Down Expand Up @@ -62,12 +60,19 @@ pub fn split_by_headers(text: &str, title: Option<&str>) -> Vec<Chunk> {
None,
];

let header_matches: Vec<_> = HEADER_REGEX.captures_iter(&text_without_code).collect();

for (i, cap) in header_matches.iter().enumerate() {
let full_match = cap.get(0).unwrap();
let header_text = cap.get(1).unwrap().as_str();
let level = header_text.chars().take_while(|&c| c == '#').count() as u32;
// Collect only the byte positions and header text slice we actually need —
// cheaper than keeping full `Captures` objects alive.
let header_matches: Vec<(usize, usize, &str)> = HEADER_REGEX
.captures_iter(&text_without_code)
.map(|cap| {
let full = cap.get(0).unwrap();
let header_text = cap.get(1).unwrap().as_str();
(full.start(), full.end(), header_text)
})
.collect();

for (i, &(_, full_end, header_text)) in header_matches.iter().enumerate() {
let level = header_text.bytes().take_while(|&b| b == b'#').count() as u32;
let header_content_raw = header_text.trim_start_matches('#').trim();
let header_content = restore_code_placeholders(header_content_raw, &code_blocks);

Expand All @@ -83,21 +88,18 @@ pub fn split_by_headers(text: &str, title: Option<&str>) -> Vec<Chunk> {

let breadcrumb = build_breadcrumb(&headers);

let content_start = full_match.end();
let content_end = if i + 1 < header_matches.len() {
header_matches[i + 1].get(0).unwrap().start()
header_matches[i + 1].0
} else {
text_without_code.len()
};

let section_content = &text_without_code[content_start..content_end];
let section_content = &text_without_code[full_end..content_end];

let paragraphs: Vec<&str> = PARAGRAPH_SPLIT_REGEX
for paragraph in PARAGRAPH_SPLIT_REGEX
.split(section_content)
.filter(|p| !p.trim().is_empty())
.collect();

for paragraph in paragraphs {
{
let restored_content = restore_code_placeholders(paragraph.trim(), &code_blocks);

let mut chunk = Chunk {
Expand Down
28 changes: 20 additions & 8 deletions crate/src/tokens.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,24 @@
use regex::Regex;
use std::sync::LazyLock;

static WHITESPACE_REGEX: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\s+").expect("BUG: invalid WHITESPACE_REGEX"));

/// Count characters in `text` after collapsing all runs of whitespace to a
/// single space and trimming leading/trailing whitespace.
///
/// Single-pass, zero-allocation implementation.
pub fn default_length_counter(text: &str) -> usize {
    let mut count = 0usize;
    // `in_ws` records a pending whitespace run seen after the first
    // non-whitespace char; `started` suppresses counting leading whitespace.
    let mut in_ws = false;
    let mut started = false;
    for ch in text.chars() {
        if ch.is_whitespace() {
            if started {
                in_ws = true;
            }
        } else {
            if in_ws {
                // The entire preceding whitespace run collapses to one space.
                count += 1;
            }
            count += 1;
            in_ws = false;
            started = true;
        }
    }
    // A trailing whitespace run leaves `in_ws == true` without ever being
    // counted, which implements the trailing trim.
    count
}
13 changes: 7 additions & 6 deletions crate/src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@ use super::types::Chunk;

/// Compute character length for a chunk. Includes the breadcrumb in the count
/// because embeddings will see "breadcrumb\n\ntext" as the full input.
///
/// Counts breadcrumb and text independently then adds 1 for the `\n\n`
/// separator (which collapses to a single space under whitespace-normalization).
/// Zero allocations.
pub fn set_length(chunk: &mut Chunk) {
    let b = default_length_counter(&chunk.breadcrumb);
    let t = default_length_counter(&chunk.text);
    // When either side normalizes to empty, its `\n\n` separator would be
    // trimmed/collapsed away too, so only add the +1 separator character
    // when both sides contribute content.
    chunk.length = if b == 0 || t == 0 { b + t } else { b + 1 + t };
}

/// Replace `\u{E000}CODE_BLOCK_N\u{E000}` placeholders back with the original code content.
Expand Down
36 changes: 20 additions & 16 deletions package/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,27 +37,31 @@ fn map_options(options: Option<ChunkOptions>) -> Option<breadchunks::ChunkOption
})
}

fn run_batch(inputs: &[String], options: &Option<breadchunks::ChunkOptions>) -> Vec<Vec<Chunk>> {
fn run_batch(
inputs: &[String],
options: &Option<breadchunks::ChunkOptions>,
) -> Result<Vec<Vec<Chunk>>> {
inputs
.iter()
.map(|text| {
breadchunks::chunk(text, options.clone())
.into_iter()
.map(|c| Chunk {
level: c.level,
header: c.header,
headers: c.headers,
breadcrumb: c.breadcrumb,
text: c.text,
length: {
assert!(
c.length <= u32::MAX as usize,
"chunk length exceeds u32::MAX; docs >4 GiB unsupported on Node binding"
);
c.length as u32 // usize→u32 narrowing for napi; docs >4 GiB unsupported on Node binding
},
.map(|c| {
if c.length > u32::MAX as usize {
return Err(Error::from_reason(
"chunk length exceeds u32::MAX; docs >4 GiB unsupported on Node binding",
));
}
Ok(Chunk {
level: c.level,
header: c.header,
headers: c.headers,
breadcrumb: c.breadcrumb,
text: c.text,
length: c.length as u32,
})
})
.collect()
.collect::<Result<Vec<Chunk>>>()
})
.collect()
}
Expand All @@ -81,7 +85,7 @@ impl Task for ChunkTask {
TaskInput::String(s) => Ok(s),
})
.collect();
Ok(run_batch(&decoded?, &self.options))
run_batch(&decoded?, &self.options)
}

fn resolve(&mut self, _env: Env, output: Self::Output) -> Result<Self::JsValue> {
Expand Down