Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 27 additions & 25 deletions crate/src/merge.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
use super::types::Chunk;
use super::utils::{header_is_superset_of, set_length};

/// Pool of `#` characters for building markdown header prefixes by slicing
/// (`&HASHES[..level]`), avoiding a `"#".repeat(level)` allocation per merge.
/// Six characters because markdown headers go up to level 6.
const HASHES: &str = "######";

fn should_merge(a_length: usize, b_length: usize, min_length: usize, max_length: usize) -> bool {
if a_length >= min_length && b_length >= min_length {
return false;
Expand All @@ -26,7 +28,9 @@ pub fn merge_phase2(chunks: Vec<Chunk>, min_length: usize, max_length: usize) ->
if prev.breadcrumb == chunk.breadcrumb
&& should_merge(prev.length, chunk.length, min_length, max_length)
{
prev.text = format!("{}\n\n{}", prev.text, chunk.text);
prev.text.reserve(chunk.text.len() + 2);
prev.text.push_str("\n\n");
prev.text.push_str(&chunk.text);
set_length(&mut prev);
current = Some(prev);
continue;
Expand Down Expand Up @@ -58,42 +62,40 @@ pub fn merge_phase3(chunks: Vec<Chunk>, min_length: usize, max_length: usize) ->

for level in (1..=6).rev() {
let mut merged = Vec::with_capacity(result.len());
let mut i = 0;

while i < result.len() {
if result[i].level == level && result[i].length < max_length {
let mut current = result[i].clone();
let parent_headers = current.headers.clone();
i += 1;
let mut iter = std::mem::take(&mut result).into_iter().peekable();

while i < result.len() {
let is_child = result[i].level > level
&& header_is_superset_of(&parent_headers, &result[i].headers);
while let Some(mut current) = iter.next() {
if current.level == level && current.length < max_length {
while let Some(next) = iter.peek() {
let is_child = next.level > level
&& header_is_superset_of(&current.headers, &next.headers);

if !is_child {
break;
}

if should_merge(current.length, result[i].length, min_length, max_length) {
let child = &result[i];
let header_prefix = "#".repeat(child.level as usize);
let child_header = child.header.as_deref().unwrap_or("");

current.text = format!(
"{}\n\n{} {}\n\n{}",
current.text, header_prefix, child_header, child.text
);
set_length(&mut current);
i += 1;
} else {
if !should_merge(current.length, next.length, min_length, max_length) {
break;
}

let child = iter.next().unwrap();
let header_prefix = &HASHES[..child.level.min(6) as usize];
let child_header = child.header.as_deref().unwrap_or("");
current.text.reserve(
2 + header_prefix.len() + 1 + child_header.len() + 2 + child.text.len(),
);
current.text.push_str("\n\n");
current.text.push_str(header_prefix);
current.text.push(' ');
current.text.push_str(child_header);
current.text.push_str("\n\n");
current.text.push_str(&child.text);
set_length(&mut current);
}

merged.push(current);
} else {
merged.push(result[i].clone());
i += 1;
merged.push(current);
}
}

Expand Down
36 changes: 19 additions & 17 deletions crate/src/split.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,10 @@ pub fn split_by_headers(text: &str, title: Option<&str>) -> Vec<Chunk> {
if let Some(first_match) = first_header {
let preface_content = &text_without_code[..first_match.start()];

let paragraphs: Vec<&str> = PARAGRAPH_SPLIT_REGEX
for paragraph in PARAGRAPH_SPLIT_REGEX
.split(preface_content)
.filter(|p| !p.trim().is_empty())
.collect();

for paragraph in paragraphs {
{
let restored_content = restore_code_placeholders(paragraph.trim(), &code_blocks);

let mut chunk = Chunk {
Expand Down Expand Up @@ -62,12 +60,19 @@ pub fn split_by_headers(text: &str, title: Option<&str>) -> Vec<Chunk> {
None,
];

let header_matches: Vec<_> = HEADER_REGEX.captures_iter(&text_without_code).collect();

for (i, cap) in header_matches.iter().enumerate() {
let full_match = cap.get(0).unwrap();
let header_text = cap.get(1).unwrap().as_str();
let level = header_text.chars().take_while(|&c| c == '#').count() as u32;
// Collect only the byte positions and header text slice we actually need —
// cheaper than keeping full `Captures` objects alive.
let header_matches: Vec<(usize, usize, &str)> = HEADER_REGEX
.captures_iter(&text_without_code)
.map(|cap| {
let full = cap.get(0).unwrap();
let header_text = cap.get(1).unwrap().as_str();
(full.start(), full.end(), header_text)
})
.collect();

for (i, &(_, full_end, header_text)) in header_matches.iter().enumerate() {
let level = header_text.bytes().take_while(|&b| b == b'#').count() as u32;
let header_content_raw = header_text.trim_start_matches('#').trim();
let header_content = restore_code_placeholders(header_content_raw, &code_blocks);

Expand All @@ -83,21 +88,18 @@ pub fn split_by_headers(text: &str, title: Option<&str>) -> Vec<Chunk> {

let breadcrumb = build_breadcrumb(&headers);

let content_start = full_match.end();
let content_end = if i + 1 < header_matches.len() {
header_matches[i + 1].get(0).unwrap().start()
header_matches[i + 1].0
} else {
text_without_code.len()
};

let section_content = &text_without_code[content_start..content_end];
let section_content = &text_without_code[full_end..content_end];

let paragraphs: Vec<&str> = PARAGRAPH_SPLIT_REGEX
for paragraph in PARAGRAPH_SPLIT_REGEX
.split(section_content)
.filter(|p| !p.trim().is_empty())
.collect();

for paragraph in paragraphs {
{
let restored_content = restore_code_placeholders(paragraph.trim(), &code_blocks);

let mut chunk = Chunk {
Expand Down
28 changes: 20 additions & 8 deletions crate/src/tokens.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,24 @@
use regex::Regex;
use std::sync::LazyLock;

static WHITESPACE_REGEX: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\s+").expect("BUG: invalid WHITESPACE_REGEX"));

/// Count characters in `text` after collapsing all runs of whitespace to a
/// single space and trimming leading/trailing whitespace.
///
/// Single-pass, zero-allocation implementation.
pub fn default_length_counter(text: &str) -> usize {
    let mut count = 0usize;
    // `in_ws` records a pending whitespace run seen after the first
    // non-whitespace char; `started` suppresses counting leading whitespace.
    let mut in_ws = false;
    let mut started = false;
    for ch in text.chars() {
        if ch.is_whitespace() {
            if started {
                in_ws = true;
            }
        } else {
            if in_ws {
                // The entire preceding whitespace run collapses to one space.
                count += 1;
            }
            count += 1;
            in_ws = false;
            started = true;
        }
    }
    // A trailing whitespace run leaves `in_ws == true` without ever being
    // counted, which implements the trailing trim.
    count
}
13 changes: 7 additions & 6 deletions crate/src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@ use super::types::Chunk;

/// Compute character length for a chunk. Includes the breadcrumb in the count
/// because embeddings will see "breadcrumb\n\ntext" as the full input.
///
/// Counts breadcrumb and text independently then adds 1 for the `\n\n`
/// separator (which collapses to a single space under whitespace-normalization).
/// Zero allocations.
pub fn set_length(chunk: &mut Chunk) {
    let b = default_length_counter(&chunk.breadcrumb);
    let t = default_length_counter(&chunk.text);
    // When either side normalizes to empty, its `\n\n` separator would be
    // trimmed/collapsed away too, so only add the +1 separator character
    // when both sides contribute content.
    chunk.length = if b == 0 || t == 0 { b + t } else { b + 1 + t };
}

/// Replace `\u{E000}CODE_BLOCK_N\u{E000}` placeholders back with the original code content.
Expand Down
36 changes: 20 additions & 16 deletions package/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,27 +37,31 @@ fn map_options(options: Option<ChunkOptions>) -> Option<breadchunks::ChunkOption
})
}

fn run_batch(inputs: &[String], options: &Option<breadchunks::ChunkOptions>) -> Vec<Vec<Chunk>> {
fn run_batch(
inputs: &[String],
options: &Option<breadchunks::ChunkOptions>,
) -> Result<Vec<Vec<Chunk>>> {
inputs
.iter()
.map(|text| {
breadchunks::chunk(text, options.clone())
.into_iter()
.map(|c| Chunk {
level: c.level,
header: c.header,
headers: c.headers,
breadcrumb: c.breadcrumb,
text: c.text,
length: {
assert!(
c.length <= u32::MAX as usize,
"chunk length exceeds u32::MAX; docs >4 GiB unsupported on Node binding"
);
c.length as u32 // usize→u32 narrowing for napi; docs >4 GiB unsupported on Node binding
},
.map(|c| {
if c.length > u32::MAX as usize {
return Err(Error::from_reason(
"chunk length exceeds u32::MAX; docs >4 GiB unsupported on Node binding",
));
}
Ok(Chunk {
level: c.level,
header: c.header,
headers: c.headers,
breadcrumb: c.breadcrumb,
text: c.text,
length: c.length as u32,
})
})
.collect()
.collect::<Result<Vec<Chunk>>>()
})
.collect()
}
Expand All @@ -81,7 +85,7 @@ impl Task for ChunkTask {
TaskInput::String(s) => Ok(s),
})
.collect();
Ok(run_batch(&decoded?, &self.options))
run_batch(&decoded?, &self.options)
}

fn resolve(&mut self, _env: Env, output: Self::Output) -> Result<Self::JsValue> {
Expand Down