diff --git a/picojson/src/copy_on_escape.rs b/picojson/src/copy_on_escape.rs
index 24cb183..c568e6c 100644
--- a/picojson/src/copy_on_escape.rs
+++ b/picojson/src/copy_on_escape.rs
@@ -205,6 +205,18 @@ impl<'a, 'b> CopyOnEscape<'a, 'b> {
             Ok(String::Borrowed(borrowed_str))
         }
     }
+
+    /// DataSource support: check whether unescaped content is available
+    pub fn has_unescaped_content(&self) -> bool {
+        self.using_scratch
+    }
+
+    /// Direct access to the scratch buffer with the proper lifetime for the DataSource implementation
+    pub fn get_scratch_contents(&'b self) -> Result<&'b [u8], ParseError> {
+        self.scratch
+            .get(self.scratch_start..self.scratch_pos)
+            .ok_or(ParseError::Unexpected(UnexpectedState::InvalidSliceBounds))
+    }
 }
 
 #[cfg(test)]
diff --git a/picojson/src/escape_processor.rs b/picojson/src/escape_processor.rs
index f8e5cb6..f50a705 100644
--- a/picojson/src/escape_processor.rs
+++ b/picojson/src/escape_processor.rs
@@ -3,6 +3,14 @@
 use crate::parse_error::ParseError;
 use crate::shared::{ContentRange, UnexpectedState};
 
+/// Result type for Unicode escape sequence processing.
+///
+/// Tuple contains:
+/// - Optional UTF-8 byte array and its length
+/// - The start position of the escape sequence (\uXXXX)
+/// - The new pending high surrogate value, if any
+type UnicodeEscapeResult = (Option<([u8; 4], usize)>, usize, Option<u16>);
+
 /// Shared utilities for processing JSON escape sequences.
 /// This module contains pure functions for escape processing that can be used
 /// by both CopyOnEscape and StreamingBuffer components.
@@ -266,6 +274,16 @@ impl UnicodeEscapeCollector {
     pub fn has_pending_high_surrogate(&self) -> bool {
         self.pending_high_surrogate.is_some()
     }
+
+    /// Get the pending high surrogate value
+    pub fn get_pending_high_surrogate(&self) -> Option<u16> {
+        self.pending_high_surrogate
+    }
+
+    /// Set the pending high surrogate value
+    pub fn set_pending_high_surrogate(&mut self, surrogate: Option<u16>) {
+        self.pending_high_surrogate = surrogate;
+    }
 }
 
 impl Default for UnicodeEscapeCollector {
@@ -645,51 +663,56 @@ mod tests {
 /// Shared implementation for processing a Unicode escape sequence WITH surrogate pair support.
 ///
 /// This function centralizes the logic for handling `\uXXXX` escapes, which is
-/// common to both the pull-based and stream-based parsers. It uses a generic
-/// `hex_slice_provider` to remain independent of the underlying buffer implementation
-/// (`SliceInputBuffer` vs. `StreamBuffer`).
+/// common to all parsers. It uses the generic `DataSource` trait to remain
+/// independent of the underlying buffer implementation (`SliceInputBuffer` vs. `StreamBuffer`).
 ///
 /// # Arguments
-/// * `current_pos` - The parser's current position in the input buffer, right after the 4 hex digits.
-/// * `unicode_escape_collector` - A mutable reference to the shared `UnicodeEscapeCollector`.
-/// * `hex_slice_provider` - A closure that takes a start and end position and returns the hex digit slice.
-/// * `utf8_buf` - A buffer to write the UTF-8 encoded result into.
+/// * `current_pos` - The parser's current position, right after the 4 hex digits.
+/// * `pending_high_surrogate` - The optional high surrogate from a previous escape.
+/// * `source` - A `DataSource` implementation that provides the hex digit slice.
 ///
 /// # Returns
 /// A tuple containing:
-/// - Optional UTF-8 byte slice (None if this is a high surrogate waiting for low surrogate)
-/// - The start position of the escape sequence (`\uXXXX`)
-pub(crate) fn process_unicode_escape_sequence<'a, F>(
+/// - Optional UTF-8 byte array and its length.
+/// - The start position of the escape sequence (`\uXXXX`).
+/// - The new pending high surrogate value, if any.
+pub(crate) fn process_unicode_escape_sequence<'input, 'scratch, D>(
     current_pos: usize,
-    unicode_escape_collector: &mut UnicodeEscapeCollector,
-    mut hex_slice_provider: F,
-) -> Result<(Option<([u8; 4], usize)>, usize), ParseError>
+    pending_high_surrogate: Option<u16>,
+    source: &'input D,
+) -> Result<UnicodeEscapeResult, ParseError>
 where
-    F: FnMut(usize, usize) -> Result<&'a [u8], ParseError>,
+    D: ?Sized + crate::shared::DataSource<'input, 'scratch>,
 {
     let (hex_start, hex_end, escape_start_pos) = ContentRange::unicode_escape_bounds(current_pos);
 
-    // Extract the 4 hex digits from the buffer using the provider
-    let hex_slice = hex_slice_provider(hex_start, hex_end)?;
+    // Extract the 4 hex digits from the buffer using the DataSource
+    let hex_slice = source.get_borrowed_slice(hex_start, hex_end)?;
     if hex_slice.len() != 4 {
         return Err(UnexpectedState::InvalidUnicodeEscape.into());
     }
 
+    // Create a temporary collector to process the hex digits
+    let mut temp_collector = UnicodeEscapeCollector::new();
+    temp_collector.set_pending_high_surrogate(pending_high_surrogate);
+
     // Feed hex digits to the shared collector
     for &hex_digit in hex_slice {
-        unicode_escape_collector.add_hex_digit(hex_digit)?;
+        temp_collector.add_hex_digit(hex_digit)?;
     }
 
     // Check if we had a pending high surrogate before processing
-    let had_pending_high_surrogate = unicode_escape_collector.has_pending_high_surrogate();
+    let had_pending_high_surrogate = temp_collector.has_pending_high_surrogate();
 
     // Create a local buffer for the UTF-8 result
     let mut utf8_buf = [0u8; 4];
 
     // Process the complete sequence to UTF-8 with surrogate support
     let (utf8_bytes_opt, _surrogate_state_changed) =
-        unicode_escape_collector.process_to_utf8(&mut utf8_buf)?;
+        temp_collector.process_to_utf8(&mut utf8_buf)?;
+
+    let new_pending_high_surrogate = temp_collector.get_pending_high_surrogate();
 
     // If we have a result, copy it to a new array to return by value
     let result_by_value = utf8_bytes_opt.map(|bytes| {
@@ -708,5 +731,9 @@ where
         escape_start_pos
     };
 
-    Ok((result_by_value, final_escape_start_pos))
+    Ok((
+        result_by_value,
+        final_escape_start_pos,
+        new_pending_high_surrogate,
+    ))
 }
diff --git a/picojson/src/event_processor.rs b/picojson/src/event_processor.rs
index b49c5fd..fc0d452 100644
--- a/picojson/src/event_processor.rs
+++ b/picojson/src/event_processor.rs
@@ -38,10 +38,25 @@ impl ParserCore {
     /// This supports StreamParser-specific byte accumulation when no events are generated.
     /// SliceParser passes a no-op closure for byte_accumulator.
     pub fn next_event_impl<'a, P, F>(
+        &mut self,
+        provider: &'a mut P,
+        escape_timing: EscapeTiming,
+        byte_accumulator: F,
+    ) -> Result<Event<'a, 'a>, ParseError>
+    where
+        P: ContentExtractor,
+        F: FnMut(&mut P, u8) -> Result<(), ParseError>,
+    {
+        self.next_event_impl_with_flags(provider, escape_timing, byte_accumulator, false)
+    }
+
+    /// Extended version with flags for specialized behavior
+    pub fn next_event_impl_with_flags<'a, P, F>(
         &mut self,
         provider: &'a mut P,
         escape_timing: EscapeTiming,
         mut byte_accumulator: F,
+        always_accumulate_during_escapes: bool,
     ) -> Result<Event<'a, 'a>, ParseError>
     where
         P: ContentExtractor,
@@ -58,20 +73,27 @@ impl ParserCore {
                     .map_err(ParseError::TokenizerError)?;
             }
 
-            // Call byte accumulator if no events were generated AND we are not in an escape sequence
-            if !have_events(&self.parser_state.evts) && !self.in_escape_sequence {
+            let should_accumulate = if always_accumulate_during_escapes {
+                if self.in_escape_sequence {
+                    true // Always accumulate during escape sequences
+                } else {
+                    !have_events(&self.parser_state.evts) // Normal behavior outside escapes
+                }
+            } else {
+                !have_events(&self.parser_state.evts) && !self.in_escape_sequence
+            };
+
+            if should_accumulate {
                 byte_accumulator(provider, byte)?;
             }
         } else {
-            // Handle end of stream
             {
-                clear_events(&mut self.parser_state.evts);
-                let mut callback = create_tokenizer_callback(&mut self.parser_state.evts);
-                self.tokenizer
-                    .finish(&mut callback)
-                    .map_err(ParseError::TokenizerError)?;
-            }
+                let mut finish_callback =
+                    create_tokenizer_callback(&mut self.parser_state.evts);
+                let _bytes_processed = self.tokenizer.finish(&mut finish_callback)?;
+            } // Drop the callback to release the borrow
+
+            // If finish() generated events, process them. Otherwise, return EndDocument.
             if !have_events(&self.parser_state.evts) {
                 return Ok(Event::EndDocument);
             }
diff --git a/picojson/src/parse_error.rs b/picojson/src/parse_error.rs
index 15801e5..bdc3261 100644
--- a/picojson/src/parse_error.rs
+++ b/picojson/src/parse_error.rs
@@ -11,10 +11,12 @@ use crate::ujson;
 pub enum ParseError {
     /// An error bubbled up from the underlying tokenizer.
     TokenizerError(ujson::Error),
-    /// The provided scratch buffer was not large enough for an operation.
+    /// The scratch buffer is full.
    ScratchBufferFull,
-    /// A string slice was not valid UTF-8.
+    /// A byte sequence was not valid UTF-8.
     InvalidUtf8(core::str::Utf8Error),
+    /// The input buffer is full.
+    InputBufferFull,
     /// A number string could not be parsed.
     InvalidNumber,
     /// The parser entered an unexpected internal state.
@@ -73,6 +75,12 @@ impl From<core::str::Utf8Error> for ParseError {
     }
 }
 
+impl From<ujson::Error> for ParseError {
+    fn from(err: ujson::Error) -> Self {
+        ParseError::TokenizerError(err)
+    }
+}
+
 impl core::fmt::Display for ParseError {
     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
         match self {
diff --git a/picojson/src/shared.rs b/picojson/src/shared.rs
index 16046a9..3829bb0 100644
--- a/picojson/src/shared.rs
+++ b/picojson/src/shared.rs
@@ -120,7 +120,11 @@ impl ContentRange {
         current_pos: usize,
     ) -> (usize, usize) {
         let content_end = current_pos.saturating_sub(1); // Back up to exclude closing quote
-        (content_start, content_end)
+        if content_start > content_end {
+            (content_start, content_start)
+        } else {
+            (content_start, content_end)
+        }
     }
 
     /// Calculate Unicode escape sequence boundaries
@@ -171,6 +175,100 @@ impl ContentRange {
     }
 }
 
+/// A trait that abstracts the source of JSON data for content extraction.
+///
+/// This trait provides a unified interface for accessing both borrowed content from
+/// the original input data and unescaped content from temporary scratch buffers.
+/// It enables consistent content extraction patterns across different parser types.
+///
+/// # Generic Parameters
+///
+/// * `'input` - Lifetime for the input data being parsed
+/// * `'scratch` - Lifetime for the scratch buffer used for temporary storage
+pub trait DataSource<'input, 'scratch> {
+    /// Returns a slice of the raw, unprocessed input data from a specific range.
+    /// Used for zero-copy extraction of content that contains no escape sequences.
+    ///
+    /// # Arguments
+    /// * `start` - Start position in the input data
+    /// * `end` - End position in the input data (exclusive)
+    ///
+    /// # Returns
+    /// A slice of the input data with lifetime `'input`
+    fn get_borrowed_slice(
+        &'input self,
+        start: usize,
+        end: usize,
+    ) -> Result<&'input [u8], ParseError>;
+
+    /// Returns the full slice of the processed, unescaped content from the scratch buffer.
+    /// Used when escape sequences have been processed and content written to the temporary buffer.
+    ///
+    /// # Returns
+    /// A slice of unescaped content with lifetime `'scratch`
+    fn get_unescaped_slice(&'scratch self) -> Result<&'scratch [u8], ParseError>;
+
+    /// Check if unescaped content is available in the scratch buffer.
+    ///
+    /// # Returns
+    /// `true` if unescaped content exists and should be accessed via `get_unescaped_slice()`,
+    /// `false` if content should be accessed via `get_borrowed_slice()`
+    fn has_unescaped_content(&self) -> bool;
+}
+
+/// Raw content piece from either the input buffer or the scratch buffer.
+/// This enum cleanly separates the two different content sources without
+/// coupling the DataSource trait to high-level JSON types.
+#[derive(Debug, PartialEq)]
+pub enum ContentPiece<'input, 'scratch> {
+    /// Content borrowed directly from the input buffer (zero-copy)
+    Input(&'input [u8]),
+    /// Content processed and stored in the scratch buffer (unescaped)
+    Scratch(&'scratch [u8]),
+}
+
+impl<'input, 'scratch> ContentPiece<'input, 'scratch>
+where
+    'input: 'scratch,
+{
+    /// Convert the content piece to a String enum
+    pub fn into_string(self) -> Result<String<'input, 'scratch>, ParseError> {
+        match self {
+            ContentPiece::Input(bytes) => {
+                let content_str = from_utf8(bytes)?;
+                Ok(String::Borrowed(content_str))
+            }
+            ContentPiece::Scratch(bytes) => {
+                let content_str = from_utf8(bytes)?;
+                Ok(String::Unescaped(content_str))
+            }
+        }
+    }
+}
+
 pub fn from_utf8(v: &[u8]) -> Result<&str, ParseError> {
     core::str::from_utf8(v).map_err(Into::into)
 }
+
+/// A generic helper function that uses the DataSource trait to extract the correct
+/// content piece (either borrowed or from scratch). This consolidates the core
+/// extraction logic for all parsers.
+pub fn get_content_piece<'input, 'scratch, D>(
+    source: &'input D,
+    start_pos: usize,
+    current_pos: usize,
+) -> Result<ContentPiece<'input, 'scratch>, ParseError>
+where
+    'input: 'scratch,
+    D: ?Sized + DataSource<'input, 'scratch>,
+{
+    if source.has_unescaped_content() {
+        source.get_unescaped_slice().map(ContentPiece::Scratch)
+    } else {
+        let (content_start, content_end) =
+            ContentRange::string_content_bounds_from_content_start(start_pos, current_pos);
+        source
+            .get_borrowed_slice(content_start, content_end)
+            .map(ContentPiece::Input)
+    }
+}
diff --git a/picojson/src/slice_content_builder.rs b/picojson/src/slice_content_builder.rs
index d76739c..d5c463f 100644
--- a/picojson/src/slice_content_builder.rs
+++ b/picojson/src/slice_content_builder.rs
@@ -5,7 +5,7 @@
 use crate::copy_on_escape::CopyOnEscape;
 use crate::escape_processor::{self, UnicodeEscapeCollector};
 use crate::event_processor::ContentExtractor;
-use crate::shared::{ContentRange, State};
+use crate::shared::{ContentRange, DataSource, State};
 use crate::slice_input_buffer::{InputBuffer, SliceInputBuffer};
 use crate::{Event, JsonNumber, ParseError};
@@ -68,42 +68,45 @@ impl ContentExtractor for SliceContentBuilder<'_, '_> {
         &mut self.unicode_escape_collector
     }
 
-    fn extract_string_content(&mut self, _start_pos: usize) -> Result<Event<'_, '_>, ParseError> {
-        let end_pos = ContentRange::end_position_excluding_delimiter(self.buffer.current_pos());
-        let value_result = self.copy_on_escape.end_string(end_pos)?;
-        Ok(Event::String(value_result))
+    fn extract_string_content(&mut self, start_pos: usize) -> Result<Event<'_, '_>, ParseError> {
+        // SliceParser-specific: complete CopyOnEscape processing for unescaped content
+        let current_pos = self.current_position();
+        if self.has_unescaped_content() {
+            let end_pos = ContentRange::end_position_excluding_delimiter(current_pos);
+            self.copy_on_escape.end_string(end_pos)?; // Complete the CopyOnEscape processing
+        }
+
+        // Use the unified helper function to get the content
+        let content_piece = crate::shared::get_content_piece(self, start_pos, current_pos)?;
+        Ok(Event::String(content_piece.into_string()?))
     }
 
-    fn extract_key_content(&mut self, _start_pos: usize) -> Result<Event<'_, '_>, ParseError> {
-        let end_pos = ContentRange::end_position_excluding_delimiter(self.buffer.current_pos());
-        let key_result = self.copy_on_escape.end_string(end_pos)?;
-        Ok(Event::Key(key_result))
+    fn extract_key_content(&mut self, start_pos: usize) -> Result<Event<'_, '_>, ParseError> {
+        // SliceParser-specific: complete CopyOnEscape processing for unescaped content
+        let current_pos = self.current_position();
+        if self.has_unescaped_content() {
+            let end_pos = ContentRange::end_position_excluding_delimiter(current_pos);
+            self.copy_on_escape.end_string(end_pos)?; // Complete the CopyOnEscape processing
+        }
+
+        // Use the unified helper function to get the content
+        let content_piece = crate::shared::get_content_piece(self, start_pos, current_pos)?;
+        Ok(Event::Key(content_piece.into_string()?))
     }
 
     fn extract_number(
         &mut self,
         start_pos: usize,
-        from_container_end: bool,
+        _from_container_end: bool,
         _finished: bool,
     ) -> Result<Event<'_, '_>, ParseError> {
-        // For SliceParser, use buffer-based document end detection
-        // The finished parameter should always be true for complete slices, but we don't rely on it
-        let at_document_end = self.buffer.current_pos() >= self.buffer.data_len();
-        let current_pos = self.buffer.current_pos();
-        let use_full_span = !from_container_end && at_document_end;
-
-        let end_pos = if use_full_span {
-            // Standalone number: clamp to buffer length to prevent slice bounds errors
-            core::cmp::min(current_pos, self.buffer.data_len())
-        } else {
-            // Container number: exclude delimiter
-            current_pos.saturating_sub(1)
-        };
-
-        let number_bytes = self
-            .buffer
-            .slice(start_pos, end_pos)
-            .map_err(|_| ParseError::InvalidNumber)?;
+        // The delimiter has already been consumed by the time this is called,
+        // so current_position is one byte past the end of the number.
+        let end_pos = ContentRange::end_position_excluding_delimiter(self.current_position());
+
+        // Use the DataSource trait method to get the number bytes
+        let number_bytes = self.get_borrowed_slice(start_pos, end_pos)?;
+
         let json_number = JsonNumber::from_slice(number_bytes)?;
         Ok(Event::Number(json_number))
     }
@@ -118,29 +121,37 @@ impl ContentExtractor for SliceContentBuilder<'_, '_> {
     fn process_unicode_escape_with_collector(&mut self) -> Result<(), ParseError> {
         let current_pos = self.buffer.current_pos();
-        let hex_slice_provider = |start, end| self.buffer.slice(start, end).map_err(Into::into);
 
         // Shared Unicode escape processing pattern
         let had_pending_high_surrogate =
             self.unicode_escape_collector.has_pending_high_surrogate();
+        let pending_surrogate = self.unicode_escape_collector.get_pending_high_surrogate();
 
-        let (utf8_bytes_result, escape_start_pos) =
+        let (utf8_bytes_result, escape_start_pos, new_pending_surrogate) =
             escape_processor::process_unicode_escape_sequence(
                 current_pos,
-                &mut self.unicode_escape_collector,
-                hex_slice_provider,
+                pending_surrogate,
+                self, // Pass self as the DataSource
             )?;
 
+        self.unicode_escape_collector
+            .set_pending_high_surrogate(new_pending_surrogate);
+
         // Handle UTF-8 bytes if we have them (not a high surrogate waiting for low surrogate)
         if let Some((utf8_bytes, len)) = utf8_bytes_result {
-            let utf8_slice = &utf8_bytes[..len];
+            // Ensure we don't exceed array bounds
+            let safe_len = len.min(utf8_bytes.len()).min(4);
+            let utf8_slice = &utf8_bytes[..safe_len];
             if had_pending_high_surrogate {
                 // This is completing a surrogate pair - need to consume both escapes
                 // First call: consume the high surrogate (6 bytes earlier)
                 self.copy_on_escape
                     .handle_unicode_escape(escape_start_pos, &[])?;
                 // Second call: consume the low surrogate and write UTF-8
+                let low_surrogate_pos = escape_start_pos
+                    .checked_add(6)
+                    .ok_or(ParseError::NumericOverflow)?;
                 self.copy_on_escape
-                    .handle_unicode_escape(escape_start_pos + 6, utf8_slice)?;
+                    .handle_unicode_escape(low_surrogate_pos, utf8_slice)?;
             } else {
                 // Single Unicode escape - normal processing
                 self.copy_on_escape
@@ -162,3 +173,28 @@ impl ContentExtractor for SliceContentBuilder<'_, '_> {
         Ok(())
     }
 }
+
+/// DataSource implementation for SliceContentBuilder
+///
+/// This implementation provides access to both borrowed content from the original
+/// input slice and unescaped content from the CopyOnEscape scratch buffer.
+impl<'a, 'b> DataSource<'a, 'b> for SliceContentBuilder<'a, 'b> {
+    fn get_borrowed_slice(&'a self, start: usize, end: usize) -> Result<&'a [u8], ParseError> {
+        self.buffer.slice(start, end).map_err(Into::into)
+    }
+
+    fn get_unescaped_slice(&'b self) -> Result<&'b [u8], ParseError> {
+        // Access the scratch buffer directly with the correct lifetime
+        if !self.copy_on_escape.has_unescaped_content() {
+            return Err(ParseError::Unexpected(
+                crate::shared::UnexpectedState::StateMismatch,
+            ));
+        }
+
+        self.copy_on_escape.get_scratch_contents()
+    }
+
+    fn has_unescaped_content(&self) -> bool {
+        self.copy_on_escape.has_unescaped_content()
+    }
+}
diff --git a/picojson/src/slice_input_buffer.rs b/picojson/src/slice_input_buffer.rs
index 781c075..e269d48 100644
--- a/picojson/src/slice_input_buffer.rs
+++ b/picojson/src/slice_input_buffer.rs
@@ -52,11 +52,6 @@ impl<'a> SliceInputBuffer<'a> {
     pub fn slice(&self, start: usize, end: usize) -> Result<&'a [u8], Error> {
         self.data.get(start..end).ok_or(Error::InvalidSliceBounds)
     }
-
-    /// Gets the length of the underlying data for bounds checking.
-    pub fn data_len(&self) -> usize {
-        self.data.len()
-    }
 }
 
 #[cfg(test)]
diff --git a/picojson/src/stream_content_builder.rs b/picojson/src/stream_content_builder.rs
index b4c79c4..28dbf09 100644
--- a/picojson/src/stream_content_builder.rs
+++ b/picojson/src/stream_content_builder.rs
@@ -4,10 +4,10 @@
 use crate::escape_processor::UnicodeEscapeCollector;
 use crate::event_processor::ContentExtractor;
-use crate::shared::{ContentRange, State};
+use crate::shared::{ContentRange, DataSource, State};
 use crate::stream_buffer::StreamBuffer;
 use crate::stream_parser::Reader;
-use crate::{Event, JsonNumber, ParseError, String};
+use crate::{Event, JsonNumber, ParseError};
 
 /// ContentBuilder implementation for StreamParser that uses StreamBuffer for streaming and escape processing
 pub struct StreamContentBuilder<'b, R: Reader> {
@@ -56,8 +56,8 @@ impl<'b, R: Reader> StreamContentBuilder<'b, R> {
             .map_err(ParseError::from)?;
 
         if compaction_offset == 0 {
-            // SOL: Buffer too small for current token
-            return Err(ParseError::ScratchBufferFull);
+            // Buffer too small for current token - this is an input buffer size issue
+            return Err(ParseError::InputBufferFull);
         }
 
         // Update parser state positions after compaction (original logic)
@@ -118,30 +118,6 @@ impl<'b, R: Reader> StreamContentBuilder<'b, R> {
         self.unescaped_reset_queued = true;
     }
 
-    /// Helper to create an unescaped string from StreamBuffer
-    fn create_unescaped_string(&mut self) -> Result<String<'_, '_>, ParseError> {
-        self.queue_unescaped_reset();
-        let unescaped_slice = self.stream_buffer.get_unescaped_slice()?;
-        let str_content = crate::shared::from_utf8(unescaped_slice)?;
-        Ok(String::Unescaped(str_content))
-    }
-
-    /// Helper to create a borrowed string from StreamBuffer
-    fn create_borrowed_string(
-        &mut self,
-        content_start: usize,
-    ) -> Result<String<'_, '_>, ParseError> {
-        let current_pos = self.stream_buffer.current_position();
-        let (content_start, content_end) =
-            ContentRange::string_content_bounds_from_content_start(content_start, current_pos);
-
-        let bytes = self
-            .stream_buffer
-            .get_string_slice(content_start, content_end)?;
-        let str_content = crate::shared::from_utf8(bytes)?;
-        Ok(String::Borrowed(str_content))
-    }
-
     /// Start escape processing using StreamBuffer
     fn start_escape_processing(&mut self) -> Result<(), ParseError> {
         // Initialize escape processing with StreamBuffer if not already started
@@ -220,21 +196,23 @@ impl<R: Reader> ContentExtractor for StreamContentBuilder<'_, R> {
     }
 
     fn extract_string_content(&mut self, start_pos: usize) -> Result<Event<'_, '_>, ParseError> {
-        let string = if self.stream_buffer.has_unescaped_content() {
-            self.create_unescaped_string()?
-        } else {
-            self.create_borrowed_string(start_pos)?
-        };
-        Ok(Event::String(string))
+        // StreamParser-specific: queue reset to prevent content contamination
+        if self.has_unescaped_content() {
+            self.queue_unescaped_reset();
+        }
+        let current_pos = self.current_position();
+        let content_piece = crate::shared::get_content_piece(self, start_pos, current_pos)?;
+        Ok(Event::String(content_piece.into_string()?))
     }
 
     fn extract_key_content(&mut self, start_pos: usize) -> Result<Event<'_, '_>, ParseError> {
-        let key = if self.stream_buffer.has_unescaped_content() {
-            self.create_unescaped_string()?
-        } else {
-            self.create_borrowed_string(start_pos)?
-        };
-        Ok(Event::Key(key))
+        // StreamParser-specific: queue reset to prevent content contamination
+        if self.has_unescaped_content() {
+            self.queue_unescaped_reset();
+        }
+        let current_pos = self.current_position();
+        let content_piece = crate::shared::get_content_piece(self, start_pos, current_pos)?;
+        Ok(Event::Key(content_piece.into_string()?))
     }
 
     fn extract_number(
@@ -243,18 +221,15 @@ impl<R: Reader> ContentExtractor for StreamContentBuilder<'_, R> {
         from_container_end: bool,
         finished: bool,
     ) -> Result<Event<'_, '_>, ParseError> {
-        // Use shared number parsing with StreamParser-specific document end detection
-        // StreamParser uses state-based detection: finished flag indicates true document end
-        let current_pos = self.stream_buffer.current_position();
+        let current_pos = self.current_position();
 
         // A standalone number at the end of the document has no trailing delimiter, so we use the full span.
         let use_full_span = !from_container_end && finished;
         let end_pos = ContentRange::number_end_position(current_pos, use_full_span);
 
-        let number_bytes = self
-            .stream_buffer
-            .get_string_slice(start_pos, end_pos)
-            .map_err(ParseError::from)?;
+        // Use the DataSource trait method to get the number bytes
+        let number_bytes = self.get_borrowed_slice(start_pos, end_pos)?;
+
         let json_number = JsonNumber::from_slice(number_bytes)?;
         Ok(Event::Number(json_number))
     }
@@ -282,19 +257,20 @@ impl<R: Reader> ContentExtractor for StreamContentBuilder<'_, R> {
     }
 
     fn process_unicode_escape_with_collector(&mut self) -> Result<(), ParseError> {
-        // Define the provider for getting hex digits from the stream buffer
-        let hex_slice_provider = |start, end| {
-            self.stream_buffer
-                .get_string_slice(start, end)
-                .map_err(Into::into)
-        };
+        // Get the pending surrogate before borrowing self
+        let pending_surrogate = self.unicode_escape_collector.get_pending_high_surrogate();
 
         // Call the shared processor, which now returns the result by value
-        let (utf8_bytes_result, _) = crate::escape_processor::process_unicode_escape_sequence(
-            self.stream_buffer.current_position(),
-            &mut self.unicode_escape_collector,
-            hex_slice_provider,
-        )?;
+        let (utf8_bytes_result, _, new_pending_surrogate) =
+            crate::escape_processor::process_unicode_escape_sequence(
+                self.stream_buffer.current_position(),
+                pending_surrogate,
+                self, // Pass self as the DataSource
+            )?;
+
+        // Update the collector's state
+        self.unicode_escape_collector
+            .set_pending_high_surrogate(new_pending_surrogate);
 
         // Handle the UTF-8 bytes if we have them
         if let Some((utf8_bytes, len)) = utf8_bytes_result {
@@ -340,3 +316,25 @@ impl<R: Reader> StreamContentBuilder<'_, R> {
         Ok(())
     }
 }
+
+/// DataSource implementation for StreamContentBuilder
+///
+/// This implementation provides access to both borrowed content from the StreamBuffer's
+/// internal buffer and unescaped content from the StreamBuffer's scratch space.
+/// Note: StreamParser doesn't have a distinct 'input lifetime since it reads from a stream,
+/// so we use the buffer lifetime 'b for both borrowed and unescaped content.
+impl<'b, R: Reader> DataSource<'b, 'b> for StreamContentBuilder<'b, R> {
+    fn get_borrowed_slice(&'b self, start: usize, end: usize) -> Result<&'b [u8], ParseError> {
+        self.stream_buffer
+            .get_string_slice(start, end)
+            .map_err(Into::into)
+    }
+
+    fn get_unescaped_slice(&'b self) -> Result<&'b [u8], ParseError> {
+        self.stream_buffer.get_unescaped_slice().map_err(Into::into)
+    }
+
+    fn has_unescaped_content(&self) -> bool {
+        self.stream_buffer.has_unescaped_content()
+    }
+}
diff --git a/picojson/src/stream_parser.rs b/picojson/src/stream_parser.rs
index d0b739a..47968f9 100644
--- a/picojson/src/stream_parser.rs
+++ b/picojson/src/stream_parser.rs
@@ -1078,10 +1078,10 @@ mod tests {
     #[test]
     fn test_minimal_buffer_simple_escape_1() {
-        // Buffer size 4 - clearly not enough
+        // Buffer size 4 - token "hello\\" (8 bytes) too large for buffer (4 bytes)
         assert!(matches!(
             test_simple_escape_with_buffer_size(4),
-            Err(ParseError::ScratchBufferFull)
+            Err(ParseError::InputBufferFull)
        ));
    }
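
Example (illustration only; not part of the patch). The following standalone sketch shows the dispatch that get_content_piece performs over the new DataSource trait: return the scratch buffer when escapes were processed, otherwise borrow a range of the raw input. The names SliceSource, Error, and content are hypothetical stand-ins for the crate's builders, ParseError, and get_content_piece, and the bounds calculation is simplified to plain start/end byte indices.

// Self-contained sketch of the DataSource dispatch used by get_content_piece.
#[derive(Debug)]
enum Error {
    OutOfBounds,
}

trait DataSource<'input, 'scratch> {
    fn get_borrowed_slice(&'input self, start: usize, end: usize) -> Result<&'input [u8], Error>;
    fn get_unescaped_slice(&'scratch self) -> Result<&'scratch [u8], Error>;
    fn has_unescaped_content(&self) -> bool;
}

// Toy source: `input` plays the role of the original JSON bytes, `scratch`
// holds already-unescaped bytes when an escape sequence was processed.
struct SliceSource<'a> {
    input: &'a [u8],
    scratch: Vec<u8>,
    used_scratch: bool,
}

impl<'a> DataSource<'a, 'a> for SliceSource<'a> {
    fn get_borrowed_slice(&'a self, start: usize, end: usize) -> Result<&'a [u8], Error> {
        self.input.get(start..end).ok_or(Error::OutOfBounds)
    }

    fn get_unescaped_slice(&'a self) -> Result<&'a [u8], Error> {
        Ok(&self.scratch)
    }

    fn has_unescaped_content(&self) -> bool {
        self.used_scratch
    }
}

// Mirrors get_content_piece: prefer the scratch buffer when escapes were
// processed, otherwise borrow the raw input range (zero-copy path).
fn content<'a, D: DataSource<'a, 'a>>(
    source: &'a D,
    start: usize,
    end: usize,
) -> Result<&'a [u8], Error> {
    if source.has_unescaped_content() {
        source.get_unescaped_slice()
    } else {
        source.get_borrowed_slice(start, end)
    }
}

fn main() -> Result<(), Error> {
    // No escapes: the string body is borrowed straight from the input.
    let plain = SliceSource {
        input: b"\"hello\"",
        scratch: Vec::new(),
        used_scratch: false,
    };
    assert_eq!(content(&plain, 1, 6)?, b"hello");

    // With an escape: the unescaped bytes come from the scratch buffer instead.
    let escaped = SliceSource {
        input: b"\"a\\nb\"",
        scratch: b"a\nb".to_vec(),
        used_scratch: true,
    };
    assert_eq!(content(&escaped, 1, 5)?, b"a\nb");
    Ok(())
}

Both content builders in the patch implement this same shape: SliceContentBuilder hands out the original input slice or the CopyOnEscape scratch buffer, while StreamContentBuilder uses the StreamBuffer for both roles, which is why its impl collapses the two lifetimes into 'b.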