kaidokert · kaidokert · Aug 3, 2025 · Aug 3, 2025
diff --git a/picojson/src/copy_on_escape.rs b/picojson/src/copy_on_escape.rs
@@ -205,6 +205,18 @@ impl<'a, 'b> CopyOnEscape<'a, 'b> {
             Ok(String::Borrowed(borrowed_str))
         }
     }
+
+    /// Returns `true` when the scratch buffer is in use, i.e. unescaped content is available.
+    pub fn has_unescaped_content(&self) -> bool {
+        self.using_scratch
+    }
+
+    /// `'b`-lived scratch slice `scratch_start..scratch_pos` for `DataSource`; errors if out of bounds.
+    pub fn get_scratch_contents(&'b self) -> Result<&'b [u8], ParseError> {
+        self.scratch
+            .get(self.scratch_start..self.scratch_pos)
+            .ok_or(ParseError::Unexpected(UnexpectedState::InvalidSliceBounds))
+    }
 }
 
 #[cfg(test)]

diff --git a/picojson/src/escape_processor.rs b/picojson/src/escape_processor.rs
@@ -3,6 +3,14 @@
 use crate::parse_error::ParseError;
 use crate::shared::{ContentRange, UnexpectedState};
 
+/// Success value returned by `process_unicode_escape_sequence`.
+///
+/// Tuple contains:
+/// - Optional UTF-8 byte array and its length
+/// - The start position of the escape sequence (`\uXXXX`)
+/// - The new pending high surrogate value, if any
+type UnicodeEscapeResult = (Option<([u8; 4], usize)>, usize, Option<u32>);
+
 /// Shared utilities for processing JSON escape sequences.
 /// This module contains pure functions for escape processing that can be used
 /// by both CopyOnEscape and StreamingBuffer components.
@@ -266,6 +274,16 @@ impl UnicodeEscapeCollector {
     pub fn has_pending_high_surrogate(&self) -> bool {
         self.pending_high_surrogate.is_some()
     }
+
+    /// Returns the stored pending high surrogate value, or `None` if there is none.
+    pub fn get_pending_high_surrogate(&self) -> Option<u32> {
+        self.pending_high_surrogate
+    }
+
+    /// Replaces the pending high surrogate value; passing `None` clears it.
+    pub fn set_pending_high_surrogate(&mut self, surrogate: Option<u32>) {
+        self.pending_high_surrogate = surrogate;
+    }
 }
 
 impl Default for UnicodeEscapeCollector {
@@ -645,51 +663,56 @@ mod tests {
 /// Shared implementation for processing a Unicode escape sequence WITH surrogate pair support.
 ///
 /// This function centralizes the logic for handling `\uXXXX` escapes, which is
-/// common to both the pull-based and stream-based parsers. It uses a generic
-/// `hex_slice_provider` to remain independent of the underlying buffer implementation
-/// (`SliceInputBuffer` vs. `StreamBuffer`).
+/// common to all parsers. It uses the generic `DataSource` trait to remain
+/// independent of the underlying buffer implementation (`SliceInputBuffer` vs. `StreamBuffer`).
 ///
 /// # Arguments
-/// * `current_pos` - The parser's current position in the input buffer, right after the 4 hex digits.
-/// * `unicode_escape_collector` - A mutable reference to the shared `UnicodeEscapeCollector`.
-/// * `hex_slice_provider` - A closure that takes a start and end position and returns the hex digit slice.
-/// * `utf8_buf` - A buffer to write the UTF-8 encoded result into.
+/// * `current_pos` - The parser's current position, right after the 4 hex digits.
+/// * `pending_high_surrogate` - The optional high surrogate from a previous escape.
+/// * `source` - A `DataSource` implementation to provide the hex digit slice.
 ///
 /// # Returns
 /// A tuple containing:
-/// - Optional UTF-8 byte slice (None if this is a high surrogate waiting for low surrogate)
-/// - The start position of the escape sequence (`\uXXXX`)
-pub(crate) fn process_unicode_escape_sequence<'a, F>(
+/// - Optional UTF-8 byte array and its length.
+/// - The start position of the escape sequence (`\uXXXX`).
+/// - The new pending high surrogate value, if any.
+pub(crate) fn process_unicode_escape_sequence<'input, 'scratch, D>(
     current_pos: usize,
-    unicode_escape_collector: &mut UnicodeEscapeCollector,
-    mut hex_slice_provider: F,
-) -> Result<(Option<([u8; 4], usize)>, usize), ParseError>
+    pending_high_surrogate: Option<u32>,
+    source: &'input D,
+) -> Result<UnicodeEscapeResult, ParseError>
 where
-    F: FnMut(usize, usize) -> Result<&'a [u8], ParseError>,
+    D: ?Sized + crate::shared::DataSource<'input, 'scratch>,
 {
     let (hex_start, hex_end, escape_start_pos) = ContentRange::unicode_escape_bounds(current_pos);
 
-    // Extract the 4 hex digits from the buffer using the provider
-    let hex_slice = hex_slice_provider(hex_start, hex_end)?;
+    // Extract the 4 hex digits from the buffer using the DataSource
+    let hex_slice = source.get_borrowed_slice(hex_start, hex_end)?;
 
     if hex_slice.len() != 4 {
         return Err(UnexpectedState::InvalidUnicodeEscape.into());
     }
 
+    // Create a temporary collector to process the hex digits
+    let mut temp_collector = UnicodeEscapeCollector::new();
+    temp_collector.set_pending_high_surrogate(pending_high_surrogate);
+
-    // Feed hex digits to the shared collector
+    // Feed hex digits to the temporary collector
     for &hex_digit in hex_slice {
-        unicode_escape_collector.add_hex_digit(hex_digit)?;
+        temp_collector.add_hex_digit(hex_digit)?;
     }
 
     // Check if we had a pending high surrogate before processing
-    let had_pending_high_surrogate = unicode_escape_collector.has_pending_high_surrogate();
+    let had_pending_high_surrogate = temp_collector.has_pending_high_surrogate();
 
     // Create a local buffer for the UTF-8 result
     let mut utf8_buf = [0u8; 4];
 
     // Process the complete sequence to UTF-8 with surrogate support
     let (utf8_bytes_opt, _surrogate_state_changed) =
-        unicode_escape_collector.process_to_utf8(&mut utf8_buf)?;
+        temp_collector.process_to_utf8(&mut utf8_buf)?;
+
+    let new_pending_high_surrogate = temp_collector.get_pending_high_surrogate();
 
     // If we have a result, copy it to a new array to return by value
     let result_by_value = utf8_bytes_opt.map(|bytes| {
@@ -708,5 +731,9 @@ where
         escape_start_pos
     };
 
-    Ok((result_by_value, final_escape_start_pos))
+    Ok((
+        result_by_value,
+        final_escape_start_pos,
+        new_pending_high_surrogate,
+    ))
 }
diff --git a/picojson/src/event_processor.rs b/picojson/src/event_processor.rs
@@ -38,10 +38,25 @@ impl<T: ujson::BitBucket, C: ujson::DepthCounter> ParserCore<T, C> {
     /// This supports StreamParser-specific byte accumulation when no events are generated.
     /// SliceParser passes a no-op closure for byte_accumulator.
     pub fn next_event_impl<'a, P, F>(
+        &mut self,
+        provider: &'a mut P,
+        escape_timing: EscapeTiming,
+        byte_accumulator: F,
+    ) -> Result<Event<'a, 'a>, ParseError>
+    where
+        P: ContentExtractor,
+        F: FnMut(&mut P, u8) -> Result<(), ParseError>,
+    {
+        self.next_event_impl_with_flags(provider, escape_timing, byte_accumulator, false)
+    }
+
+    /// Like `next_event_impl`, but can always accumulate bytes while inside an escape sequence.
+    pub fn next_event_impl_with_flags<'a, P, F>(
         &mut self,
         provider: &'a mut P,
         escape_timing: EscapeTiming,
         mut byte_accumulator: F,
+        always_accumulate_during_escapes: bool,
     ) -> Result<Event<'a, 'a>, ParseError>
     where
         P: ContentExtractor,
@@ -58,20 +73,27 @@ impl<T: ujson::BitBucket, C: ujson::DepthCounter> ParserCore<T, C> {
                             .map_err(ParseError::TokenizerError)?;
                     }
 
-                    // Call byte accumulator if no events were generated AND we are not in an escape sequence
-                    if !have_events(&self.parser_state.evts) && !self.in_escape_sequence {
+                    let should_accumulate = if always_accumulate_during_escapes {
+                        if self.in_escape_sequence {
+                            true // Always accumulate during escape sequences
+                        } else {
+                            !have_events(&self.parser_state.evts) // Normal behavior outside escapes
+                        }
+                    } else {
+                        !have_events(&self.parser_state.evts) && !self.in_escape_sequence
+                    };
+
+                    if should_accumulate {
                         byte_accumulator(provider, byte)?;
                     }
                 } else {
-                    // Handle end of stream
                     {
-                        clear_events(&mut self.parser_state.evts);
-                        let mut callback = create_tokenizer_callback(&mut self.parser_state.evts);
-                        self.tokenizer
-                            .finish(&mut callback)
-                            .map_err(ParseError::TokenizerError)?;
-                    }
+                        let mut finish_callback =
+                            create_tokenizer_callback(&mut self.parser_state.evts);
+                        let _bytes_processed = self.tokenizer.finish(&mut finish_callback)?;
+                    } // Drop the callback to release the borrow
 
+                    // If finish() generated events, process them. Otherwise, return EndDocument.
                     if !have_events(&self.parser_state.evts) {
                         return Ok(Event::EndDocument);
                     }

diff --git a/picojson/src/parse_error.rs b/picojson/src/parse_error.rs
@@ -11,10 +11,12 @@ use crate::ujson;
 pub enum ParseError {
     /// An error bubbled up from the underlying tokenizer.
     TokenizerError(ujson::Error),
-    /// The provided scratch buffer was not large enough for an operation.
+    /// The scratch buffer is full.
     ScratchBufferFull,
-    /// A string slice was not valid UTF-8.
+    /// A UTF-8 error occurred.
     InvalidUtf8(core::str::Utf8Error),
+    /// The input buffer is full.
+    InputBufferFull,
     /// A number string could not be parsed.
     InvalidNumber,
     /// The parser entered an unexpected internal state.
@@ -73,6 +75,12 @@ impl From<UnexpectedState> for ParseError {
     }
 }
 
+impl From<ujson::Error> for ParseError {
+    fn from(tokenizer_err: ujson::Error) -> Self {
+        Self::TokenizerError(tokenizer_err)
+    }
+}
+
 impl core::fmt::Display for ParseError {
     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
         match self {

diff --git a/picojson/src/shared.rs b/picojson/src/shared.rs
@@ -120,7 +120,11 @@ impl ContentRange {
         current_pos: usize,
     ) -> (usize, usize) {
         let content_end = current_pos.saturating_sub(1); // Back up to exclude closing quote
-        (content_start, content_end)
+        if content_start > content_end {
+            (content_start, content_start)
+        } else {
+            (content_start, content_end)
+        }
     }
 
     /// Calculate Unicode escape sequence boundaries
@@ -171,6 +175,100 @@ impl ContentRange {
     }
 }
 
+/// Abstracts where a parser's content bytes come from during content extraction.
+///
+/// Implementors expose two views through one interface: content borrowed from
+/// the original input data, and unescaped content held in a temporary scratch buffer.
+/// This lets different parser types share the same content extraction code.
+///
+/// # Generic Parameters
+///
+/// * `'input` - Lifetime for the input data being parsed
+/// * `'scratch` - Lifetime for the scratch buffer used for temporary storage
+pub trait DataSource<'input, 'scratch> {
+    /// Returns a slice of the raw, unprocessed input data from a specific range.
+    /// Used for zero-copy extraction of content that contains no escape sequences.
+    ///
+    /// # Arguments
+    /// * `start` - Start position in the input data
+    /// * `end` - End position in the input data (exclusive)
+    ///
+    /// # Returns
+    /// A slice of the input data with lifetime `'input`, or a `ParseError` on failure
+    fn get_borrowed_slice(
+        &'input self,
+        start: usize,
+        end: usize,
+    ) -> Result<&'input [u8], ParseError>;
+
+    /// Returns the full slice of the processed, unescaped content from the scratch buffer.
+    /// Used when escape sequences have been processed and content written to temporary buffer.
+    ///
+    /// # Returns
+    /// A slice of unescaped content with lifetime `'scratch`, or a `ParseError` on failure
+    fn get_unescaped_slice(&'scratch self) -> Result<&'scratch [u8], ParseError>;
+
+    /// Checks whether unescaped content is available in the scratch buffer.
+    ///
+    /// # Returns
+    /// `true` if unescaped content exists and should be accessed via `get_unescaped_slice()`,
+    /// `false` if content should be accessed via `get_borrowed_slice()`
+    fn has_unescaped_content(&self) -> bool;
+}
+
+/// A raw byte slice of content taken from either the input buffer or the scratch buffer.
+/// Keeping the two sources as separate variants avoids coupling the `DataSource`
+/// trait to high-level JSON types.
+#[derive(Debug, PartialEq)]
+pub enum ContentPiece<'input, 'scratch> {
+    /// Bytes borrowed directly from the input buffer (zero-copy).
+    Input(&'input [u8]),
+    /// Bytes processed and stored in the scratch buffer (unescaped).
+    Scratch(&'scratch [u8]),
+}
+
+impl<'input, 'scratch> ContentPiece<'input, 'scratch>
+where
+    'input: 'scratch,
+{
+    /// Validates the bytes as UTF-8 and wraps them in the matching `String` variant.
+    ///
+    /// * `ContentPiece::Input` becomes `String::Borrowed`.
+    /// * `ContentPiece::Scratch` becomes `String::Unescaped`.
+    ///
+    /// # Errors
+    /// Propagates the error from `from_utf8` when the bytes are not valid UTF-8.
+    pub fn into_string(self) -> Result<String<'input, 'scratch>, ParseError> {
+        match self {
+            ContentPiece::Input(raw) => from_utf8(raw).map(String::Borrowed),
+            ContentPiece::Scratch(raw) => from_utf8(raw).map(String::Unescaped),
+        }
+    }
+}
+
 pub fn from_utf8(v: &[u8]) -> Result<&str, ParseError> {
     core::str::from_utf8(v).map_err(Into::into)
 }
+
+/// Resolves which buffer holds the current content and returns it as a `ContentPiece`.
+///
+/// If `source` reports unescaped content, the scratch slice is returned as
+/// `ContentPiece::Scratch`; otherwise the input range computed by
+/// `ContentRange::string_content_bounds_from_content_start` is borrowed and
+/// returned as `ContentPiece::Input`. Errors from the source are propagated.
+pub fn get_content_piece<'input, 'scratch, D>(
+    source: &'input D,
+    start_pos: usize,
+    current_pos: usize,
+) -> Result<ContentPiece<'input, 'scratch>, ParseError>
+where
+    'input: 'scratch,
+    D: ?Sized + DataSource<'input, 'scratch>,
+{
+    if !source.has_unescaped_content() {
+        let (begin, end) =
+            ContentRange::string_content_bounds_from_content_start(start_pos, current_pos);
+        return Ok(ContentPiece::Input(source.get_borrowed_slice(begin, end)?));
+    }
+    Ok(ContentPiece::Scratch(source.get_unescaped_slice()?))
+}