diff --git a/picojson/src/escape_processor.rs b/picojson/src/escape_processor.rs index 2b9e513..c2077c7 100644 --- a/picojson/src/escape_processor.rs +++ b/picojson/src/escape_processor.rs @@ -271,6 +271,7 @@ impl UnicodeEscapeCollector { #[cfg(test)] mod tests { use super::*; + use crate::ujson::EventToken; #[test] fn test_simple_escapes() { @@ -375,8 +376,6 @@ mod tests { #[test] fn test_token_to_escape_char() { - use crate::ujson::EventToken; - // Test all valid escape tokens assert_eq!( EscapeProcessor::token_to_escape_char(&EventToken::EscapeQuote).unwrap(), @@ -420,8 +419,6 @@ mod tests { #[test] fn test_process_escape_token() { - use crate::ujson::EventToken; - // Test valid escape tokens that produce correct unescaped bytes assert_eq!( EscapeProcessor::process_escape_token(&EventToken::EscapeQuote).unwrap(), diff --git a/picojson/src/event_processor.rs b/picojson/src/event_processor.rs index 39479ed..70e5083 100644 --- a/picojson/src/event_processor.rs +++ b/picojson/src/event_processor.rs @@ -5,6 +5,7 @@ //! This module extracts the common event handling patterns to reduce code duplication //! while preserving the performance characteristics of each parser type. +use crate::shared::{ContentRange, State}; use crate::ujson::EventToken; use crate::{Event, ParseError}; @@ -25,9 +26,9 @@ pub trait EscapeHandler { Ok(()) } - /// Append a single literal byte (for per-byte accumulation patterns) - /// Default implementation is no-op - suitable for parsers that don't need per-byte processing - fn append_literal_byte(&mut self, _byte: u8) -> Result<(), crate::ParseError> { + /// Begin unicode escape sequence processing + /// Default implementation is no-op - suitable for parsers that don't need special handling + fn begin_unicode_escape(&mut self) -> Result<(), crate::ParseError> { Ok(()) } } @@ -53,8 +54,6 @@ pub fn process_begin_events( event: &crate::ujson::Event, content_extractor: &mut C, ) -> Option> { - use crate::shared::{ContentRange, State}; - match event { // String/Key Begin events - nearly identical patterns crate::ujson::Event::Begin(EventToken::Key) => { @@ -132,6 +131,19 @@ pub trait ContentExtractor: EscapeHandler { from_container_end: bool, ) -> Result, crate::ParseError>; + /// Extract a completed number using shared number parsing logic + /// + /// # Arguments + /// * `start_pos` - Position where the number started + /// * `from_container_end` - True if number was terminated by container delimiter + /// * `finished` - True if the parser has finished processing input (StreamParser-specific) + fn extract_number( + &mut self, + start_pos: usize, + from_container_end: bool, + finished: bool, + ) -> Result, crate::ParseError>; + /// Shared validation and extraction for string content fn validate_and_extract_string(&mut self) -> Result, crate::ParseError> { let start_pos = match *self.parser_state() { @@ -189,9 +201,9 @@ pub trait ContentExtractor: EscapeHandler { /// /// This callback stores tokenizer events in the parser's event array, filling the first /// available slot. This pattern is identical across both SliceParser and StreamParser. -pub fn create_tokenizer_callback<'a>( - event_storage: &'a mut [Option; 2], -) -> impl FnMut(crate::ujson::Event, usize) + 'a { +pub fn create_tokenizer_callback( + event_storage: &mut [Option; 2], +) -> impl FnMut(crate::ujson::Event, usize) + '_ { |event, _len| { for evt in event_storage.iter_mut() { if evt.is_none() { @@ -264,6 +276,7 @@ pub fn process_unicode_escape_events( match content_extractor.parser_state() { crate::shared::State::String(_) | crate::shared::State::Key(_) => { content_extractor.unicode_escape_collector_mut().reset(); + content_extractor.begin_unicode_escape()?; } _ => {} // Ignore if not in string/key context } @@ -471,6 +484,15 @@ mod tests { ) -> Result, crate::ParseError> { unimplemented!("Mock doesn't need extraction") } + + fn extract_number( + &mut self, + _start_pos: usize, + _from_container_end: bool, + _finished: bool, + ) -> Result, crate::ParseError> { + unimplemented!("Mock doesn't need extraction") + } } #[test] diff --git a/picojson/src/lib.rs b/picojson/src/lib.rs index 02fa922..f149eac 100644 --- a/picojson/src/lib.rs +++ b/picojson/src/lib.rs @@ -63,10 +63,16 @@ mod copy_on_escape; mod escape_processor; +mod parser_core; + mod stream_buffer; +mod stream_content_builder; + mod stream_parser; +mod slice_content_builder; + mod slice_parser; mod parse_error; diff --git a/picojson/src/parser_core.rs b/picojson/src/parser_core.rs new file mode 100644 index 0000000..8e9442c --- /dev/null +++ b/picojson/src/parser_core.rs @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Unified parser core that handles the common event processing loop. +//! +//! This module provides the `ParserCore` struct that consolidates the shared +//! event processing logic between SliceParser and StreamParser, eliminating +//! the duplication in their `next_event_impl` methods. + +use crate::event_processor::{ + finish_tokenizer, have_events, process_begin_escape_sequence_event, process_begin_events, + process_byte_through_tokenizer, process_simple_escape_event, process_simple_events, + process_unicode_escape_events, take_first_event, ContentExtractor, EventResult, +}; +use crate::shared::{ByteProvider, Event, ParserState, UnexpectedState}; +use crate::ujson::{EventToken, Tokenizer}; +use crate::{ujson, ParseError}; + +/// Combined trait for parsers that provide both byte access and content extraction +pub trait ParserProvider: ByteProvider + ContentExtractor {} +impl ParserProvider for T {} + +/// The core parser logic that handles the unified event processing loop. +/// +/// This struct contains all the shared state and logic that was previously +/// duplicated between SliceParser and StreamParser. It uses trait abstractions +/// to handle the differences in content building and byte providing. +pub struct ParserCore { + /// The tokenizer that processes JSON tokens + pub tokenizer: Tokenizer, + /// Parser state and event storage + pub parser_state: ParserState, +} + +impl ParserCore { + /// Create a new ParserCore + pub fn new() -> Self { + Self { + tokenizer: Tokenizer::new(), + parser_state: ParserState::new(), + } + } + + /// Unified implementation that works with a single combined provider. + /// This avoids borrowing conflicts by using a single object that implements both traits. + pub fn next_event_impl_unified<'a, P>( + &mut self, + provider: &'a mut P, + escape_timing: EscapeTiming, + ) -> Result, ParseError> + where + P: ParserProvider, + { + self.next_event_impl_unified_with_accumulator(provider, escape_timing, |_, _| Ok(())) + } + + /// Unified implementation with optional byte accumulation callback. + /// This supports StreamParser-specific byte accumulation when no events are generated. + pub fn next_event_impl_unified_with_accumulator<'a, P, F>( + &mut self, + provider: &'a mut P, + escape_timing: EscapeTiming, + mut byte_accumulator: F, + ) -> Result, ParseError> + where + P: ParserProvider, + F: FnMut(&mut P, u8) -> Result<(), ParseError>, + { + loop { + while !have_events(&self.parser_state.evts) { + if let Some(byte) = provider.next_byte()? { + process_byte_through_tokenizer( + byte, + &mut self.tokenizer, + &mut self.parser_state.evts, + )?; + + // Call byte accumulator if no events were generated (StreamParser-specific) + if !have_events(&self.parser_state.evts) { + byte_accumulator(provider, byte)?; + } + } else { + // Handle end of stream - let the provider handle any cleanup + // For StreamParser, this is where finished flag gets set + finish_tokenizer(&mut self.tokenizer, &mut self.parser_state.evts)?; + + if !have_events(&self.parser_state.evts) { + return Ok(Event::EndDocument); + } + } + } + + let taken_event = take_first_event(&mut self.parser_state.evts); + let Some(taken) = taken_event else { + return Err(UnexpectedState::StateMismatch.into()); + }; + + // Try shared event processors first + if let Some(result) = + process_simple_events(&taken).or_else(|| process_begin_events(&taken, provider)) + { + match result { + EventResult::Complete(event) => return Ok(event), + EventResult::ExtractString => return provider.validate_and_extract_string(), + EventResult::ExtractKey => return provider.validate_and_extract_key(), + EventResult::ExtractNumber(from_container_end) => { + return provider.validate_and_extract_number(from_container_end) + } + EventResult::Continue => continue, + } + } + + // Handle parser-specific events based on escape timing + match taken { + ujson::Event::Begin(EventToken::EscapeSequence) => { + process_begin_escape_sequence_event(provider)?; + } + _ if process_unicode_escape_events(&taken, provider)? => { + // Unicode escape events handled by shared function + } + ujson::Event::Begin( + escape_token @ (EventToken::EscapeQuote + | EventToken::EscapeBackslash + | EventToken::EscapeSlash + | EventToken::EscapeBackspace + | EventToken::EscapeFormFeed + | EventToken::EscapeNewline + | EventToken::EscapeCarriageReturn + | EventToken::EscapeTab), + ) if escape_timing == EscapeTiming::OnBegin => { + // SliceParser-specific: Handle simple escape sequences on Begin events + // because CopyOnEscape requires starting unescaping immediately when + // the escape token begins to maintain zero-copy optimization + process_simple_escape_event(&escape_token, provider)?; + } + ujson::Event::End( + escape_token @ (EventToken::EscapeQuote + | EventToken::EscapeBackslash + | EventToken::EscapeSlash + | EventToken::EscapeBackspace + | EventToken::EscapeFormFeed + | EventToken::EscapeNewline + | EventToken::EscapeCarriageReturn + | EventToken::EscapeTab), + ) if escape_timing == EscapeTiming::OnEnd => { + // StreamParser-specific: Handle simple escape sequences on End events + // because StreamBuffer must wait until the token ends to accumulate + // all bytes before processing the complete escape sequence + process_simple_escape_event(&escape_token, provider)?; + } + _ => { + // All other events continue to next iteration + } + } + } + } +} + +impl Default for ParserCore { + fn default() -> Self { + Self::new() + } +} + +/// Enum to specify when escape sequences should be processed +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum EscapeTiming { + /// Process simple escape sequences on Begin events (SliceParser) + OnBegin, + /// Process simple escape sequences on End events (StreamParser) + OnEnd, +} diff --git a/picojson/src/shared.rs b/picojson/src/shared.rs index 58befe2..74de2d1 100644 --- a/picojson/src/shared.rs +++ b/picojson/src/shared.rs @@ -55,14 +55,12 @@ pub enum State { /// Parser state and event storage pub(super) struct ParserState { - pub state: State, pub evts: [Option; 2], } impl ParserState { pub fn new() -> Self { Self { - state: State::None, evts: core::array::from_fn(|_| None), } } diff --git a/picojson/src/slice_content_builder.rs b/picojson/src/slice_content_builder.rs new file mode 100644 index 0000000..1a21afd --- /dev/null +++ b/picojson/src/slice_content_builder.rs @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! ContentBuilder implementation for SliceParser using CopyOnEscape optimization. + +use crate::copy_on_escape::CopyOnEscape; +use crate::escape_processor::UnicodeEscapeCollector; +use crate::event_processor::{ContentExtractor, EscapeHandler}; +use crate::number_parser::NumberExtractor; +use crate::shared::{ContentRange, State}; +use crate::slice_input_buffer::{InputBuffer, SliceInputBuffer}; +use crate::ParseError; + +/// ContentBuilder implementation for SliceParser that uses CopyOnEscape for zero-copy optimization +pub struct SliceContentBuilder<'a, 'b> { + /// The input buffer for slice-based parsing + buffer: SliceInputBuffer<'a>, + /// Copy-on-escape handler for zero-copy string optimization + copy_on_escape: CopyOnEscape<'a, 'b>, + /// Parser state tracking + parser_state: State, + /// Unicode escape collector for \uXXXX sequences + unicode_escape_collector: UnicodeEscapeCollector, + /// Flag to track when we're inside ANY escape sequence (like stream implementation) + in_escape_sequence: bool, +} + +impl<'a, 'b> SliceContentBuilder<'a, 'b> { + /// Create a new SliceContentBuilder + pub fn new(input: &'a [u8], scratch_buffer: &'b mut [u8]) -> Self { + Self { + buffer: SliceInputBuffer::new(input), + copy_on_escape: CopyOnEscape::new(input, scratch_buffer), + parser_state: State::None, + unicode_escape_collector: UnicodeEscapeCollector::new(), + in_escape_sequence: false, + } + } + + /// Get access to the input buffer for byte operations + pub fn buffer(&self) -> &SliceInputBuffer<'a> { + &self.buffer + } + + /// Get mutable access to the input buffer for byte operations + pub fn buffer_mut(&mut self) -> &mut SliceInputBuffer<'a> { + &mut self.buffer + } +} + +impl ContentExtractor for SliceContentBuilder<'_, '_> { + fn parser_state_mut(&mut self) -> &mut crate::shared::State { + &mut self.parser_state + } + + fn current_position(&self) -> usize { + self.buffer.current_pos() + } + + fn begin_string_content(&mut self, pos: usize) { + self.copy_on_escape.begin_string(pos); + } + + fn unicode_escape_collector_mut(&mut self) -> &mut UnicodeEscapeCollector { + &mut self.unicode_escape_collector + } + + fn extract_string_content( + &mut self, + _start_pos: usize, + ) -> Result, ParseError> { + let end_pos = ContentRange::end_position_excluding_delimiter(self.buffer.current_pos()); + let value_result = self.copy_on_escape.end_string(end_pos)?; + Ok(crate::Event::String(value_result)) + } + + fn extract_key_content( + &mut self, + _start_pos: usize, + ) -> Result, ParseError> { + let end_pos = ContentRange::end_position_excluding_delimiter(self.buffer.current_pos()); + let key_result = self.copy_on_escape.end_string(end_pos)?; + Ok(crate::Event::Key(key_result)) + } + + fn extract_number_content( + &mut self, + start_pos: usize, + from_container_end: bool, + ) -> Result, ParseError> { + // For standalone numbers at document end, clamp end position to buffer bounds + let at_document_end = self.buffer.is_empty(); + let current_pos = self.buffer.current_pos(); + let use_full_span = !from_container_end && at_document_end; + + let end_pos = if use_full_span { + // Standalone number: clamp to buffer length to prevent slice bounds errors + core::cmp::min(current_pos, self.buffer.data_len()) + } else { + // Container number: exclude delimiter + current_pos.saturating_sub(1) + }; + + crate::number_parser::parse_number_event(&self.buffer, start_pos, end_pos) + } + + fn extract_number( + &mut self, + start_pos: usize, + from_container_end: bool, + finished: bool, + ) -> Result, ParseError> { + // For SliceParser, use buffer-based document end detection + // The finished parameter should always be true for complete slices, but we don't rely on it + let at_document_end = self.buffer.is_empty(); + let current_pos = self.buffer.current_pos(); + let use_full_span = !from_container_end && at_document_end; + + let end_pos = if use_full_span { + // Standalone number: clamp to buffer length to prevent slice bounds errors + core::cmp::min(current_pos, self.buffer.data_len()) + } else { + // Container number: exclude delimiter + current_pos.saturating_sub(1) + }; + + // Note: finished parameter is ignored for SliceParser as slices are always complete + let _ = finished; // Explicitly acknowledge parameter while explaining why it's unused + crate::number_parser::parse_number_event(&self.buffer, start_pos, end_pos) + } +} + +impl crate::shared::ByteProvider for SliceContentBuilder<'_, '_> { + fn next_byte(&mut self) -> Result, crate::ParseError> { + match self.buffer_mut().consume_byte() { + Ok(byte) => Ok(Some(byte)), + Err(crate::slice_input_buffer::Error::ReachedEnd) => Ok(None), + Err(err) => Err(err.into()), + } + } +} + +impl EscapeHandler for SliceContentBuilder<'_, '_> { + fn parser_state(&self) -> &crate::shared::State { + &self.parser_state + } + + fn process_unicode_escape_with_collector(&mut self) -> Result<(), ParseError> { + // Clear the escape sequence flag when unicode escape completes + self.in_escape_sequence = false; + let current_pos = self.buffer.current_pos(); + let hex_slice_provider = |start, end| self.buffer.slice(start, end).map_err(Into::into); + + // Shared Unicode escape processing pattern + let had_pending_high_surrogate = self.unicode_escape_collector.has_pending_high_surrogate(); + + let mut utf8_buf = [0u8; 4]; + let (utf8_bytes_opt, escape_start_pos) = + crate::escape_processor::process_unicode_escape_sequence( + current_pos, + &mut self.unicode_escape_collector, + hex_slice_provider, + &mut utf8_buf, + )?; + // Handle UTF-8 bytes if we have them (not a high surrogate waiting for low surrogate) + if let Some(utf8_bytes) = utf8_bytes_opt { + if had_pending_high_surrogate { + // This is completing a surrogate pair - need to consume both escapes + // First call: consume the high surrogate (6 bytes earlier) + self.copy_on_escape + .handle_unicode_escape(escape_start_pos, &[])?; + // Second call: consume the low surrogate and write UTF-8 + self.copy_on_escape + .handle_unicode_escape(escape_start_pos + 6, utf8_bytes)?; + } else { + // Single Unicode escape - normal processing + self.copy_on_escape + .handle_unicode_escape(escape_start_pos, utf8_bytes)?; + } + } + + Ok(()) + } + + fn handle_simple_escape_char(&mut self, escape_char: u8) -> Result<(), ParseError> { + // Clear the escape sequence flag when simple escape completes + self.in_escape_sequence = false; + self.copy_on_escape + .handle_escape(self.buffer.current_pos(), escape_char)?; + Ok(()) + } + + fn begin_escape_sequence(&mut self) -> Result<(), ParseError> { + // Set escape flag to prevent literal byte accumulation during escape processing + self.in_escape_sequence = true; + Ok(()) + } +} diff --git a/picojson/src/slice_input_buffer.rs b/picojson/src/slice_input_buffer.rs index 0b343f4..4a4ee9c 100644 --- a/picojson/src/slice_input_buffer.rs +++ b/picojson/src/slice_input_buffer.rs @@ -54,6 +54,11 @@ impl<'a> SliceInputBuffer<'a> { pub fn slice(&self, start: usize, end: usize) -> Result<&'a [u8], Error> { self.data.get(start..end).ok_or(Error::InvalidSliceBounds) } + + /// Gets the length of the underlying data for bounds checking. + pub fn data_len(&self) -> usize { + self.data.len() + } } impl crate::number_parser::NumberExtractor for SliceInputBuffer<'_> { diff --git a/picojson/src/slice_parser.rs b/picojson/src/slice_parser.rs index 7357daa..088949b 100644 --- a/picojson/src/slice_parser.rs +++ b/picojson/src/slice_parser.rs @@ -1,20 +1,11 @@ // SPDX-License-Identifier: Apache-2.0 -use crate::copy_on_escape::CopyOnEscape; -use crate::escape_processor::UnicodeEscapeCollector; -use crate::event_processor::{ - finish_tokenizer, have_events, process_begin_escape_sequence_event, process_begin_events, - process_byte_through_tokenizer, process_simple_escape_event, process_simple_events, - process_unicode_escape_events, take_first_event, ContentExtractor, EscapeHandler, EventResult, -}; -use crate::number_parser::NumberExtractor; use crate::parse_error::ParseError; -use crate::shared::{ - ByteProvider, ContentRange, Event, ParserState, PullParser, State, UnexpectedState, -}; -use crate::slice_input_buffer::{InputBuffer, SliceInputBuffer}; +use crate::parser_core::ParserCore; +use crate::shared::{Event, PullParser}; +use crate::slice_content_builder::SliceContentBuilder; +use crate::slice_input_buffer::InputBuffer; use crate::ujson; -use ujson::{EventToken, Tokenizer}; use ujson::{BitStackConfig, DefaultConfig}; @@ -24,12 +15,10 @@ use ujson::{BitStackConfig, DefaultConfig}; // Lifetime 'a is the input buffer lifetime // lifetime 'b is the scratch/copy buffer lifetime pub struct SliceParser<'a, 'b, C: BitStackConfig = DefaultConfig> { - tokenizer: Tokenizer, - buffer: SliceInputBuffer<'a>, - parser_state: ParserState, - copy_on_escape: CopyOnEscape<'a, 'b>, - /// Shared Unicode escape collector for \uXXXX sequences - unicode_escape_collector: UnicodeEscapeCollector, + /// The shared parser core that handles the unified event processing loop + parser_core: ParserCore, + /// The content builder that handles SliceParser-specific content extraction + content_builder: SliceContentBuilder<'a, 'b>, } /// Methods for the pull parser. @@ -147,213 +136,35 @@ impl<'a, 'b, C: BitStackConfig> SliceParser<'a, 'b, C> { input: &'a [u8], scratch_buffer: &'b mut [u8], ) -> Self { - let copy_on_escape = CopyOnEscape::new(input, scratch_buffer); SliceParser { - tokenizer: Tokenizer::new(), - buffer: SliceInputBuffer::new(input), - parser_state: ParserState::new(), - copy_on_escape, - unicode_escape_collector: UnicodeEscapeCollector::new(), + parser_core: ParserCore::new(), + content_builder: SliceContentBuilder::new(input, scratch_buffer), } } /// Returns the next JSON event or an error if parsing fails. /// Parsing continues until `EndDocument` is returned or an error occurs. fn next_event_impl(&mut self) -> Result, ParseError> { - if self.buffer.is_past_end() { - return Ok(Event::EndDocument); - } - - loop { - while !have_events(&self.parser_state.evts) { - if !self.pull_tokenizer_events()? { - return Ok(Event::EndDocument); - } - } - - let taken_event = take_first_event(&mut self.parser_state.evts); - let Some(taken) = taken_event else { - return Err(UnexpectedState::StateMismatch.into()); - }; - - // Try shared event processors first - if let Some(result) = - process_simple_events(&taken).or_else(|| process_begin_events(&taken, self)) - { - match result { - EventResult::Complete(event) => return Ok(event), - EventResult::ExtractString => return self.validate_and_extract_string(), - EventResult::ExtractKey => return self.validate_and_extract_key(), - EventResult::ExtractNumber(from_container_end) => { - return self.validate_and_extract_number(from_container_end) - } - EventResult::Continue => continue, - } - } - - // Handle parser-specific events - match taken { - ujson::Event::Begin(EventToken::EscapeSequence) => { - process_begin_escape_sequence_event(self)?; - } - _ if process_unicode_escape_events(&taken, self)? => { - // Unicode escape events handled by shared function - } - ujson::Event::Begin( - escape_token @ (EventToken::EscapeQuote - | EventToken::EscapeBackslash - | EventToken::EscapeSlash - | EventToken::EscapeBackspace - | EventToken::EscapeFormFeed - | EventToken::EscapeNewline - | EventToken::EscapeCarriageReturn - | EventToken::EscapeTab), - ) => { - // SliceParser-specific: Handle simple escape sequences on Begin events - // because CopyOnEscape requires starting unescaping immediately when - // the escape token begins to maintain zero-copy optimization - process_simple_escape_event(&escape_token, self)?; - } - ujson::Event::End(EventToken::EscapeSequence) => { - // Ignore in SliceParser since it uses slice-based parsing - } - _ => { - // All other events continue to next iteration - } - } - } - } - - /// Pull events from tokenizer and return whether parsing should continue - /// Returns false when past end (equivalent to self.buffer.is_past_end()) - fn pull_tokenizer_events(&mut self) -> Result { - if self.buffer.is_past_end() { - return Ok(false); // Indicate end state instead of error - } - // Use ByteProvider implementation to get the next byte and process it - if let Some(byte) = self.next_byte()? { - process_byte_through_tokenizer(byte, &mut self.tokenizer, &mut self.parser_state.evts)?; - } else { - finish_tokenizer(&mut self.tokenizer, &mut self.parser_state.evts)?; - } - Ok(!self.buffer.is_past_end()) // Return continue state - } -} - -impl<'a, 'b, C: BitStackConfig> ContentExtractor for SliceParser<'a, 'b, C> { - fn parser_state_mut(&mut self) -> &mut State { - &mut self.parser_state.state - } - - fn current_position(&self) -> usize { - self.buffer.current_pos() - } - - fn begin_string_content(&mut self, pos: usize) { - self.copy_on_escape.begin_string(pos); - } - - fn unicode_escape_collector_mut( - &mut self, - ) -> &mut crate::escape_processor::UnicodeEscapeCollector { - &mut self.unicode_escape_collector - } - - fn extract_string_content(&mut self, _start_pos: usize) -> Result, ParseError> { - // Use CopyOnEscape to get the final string result - let end_pos = ContentRange::end_position_excluding_delimiter(self.buffer.current_pos()); - let value_result = self.copy_on_escape.end_string(end_pos)?; - Ok(Event::String(value_result)) - } - - fn extract_key_content(&mut self, _start_pos: usize) -> Result, ParseError> { - // Use CopyOnEscape to get the final key result - let end_pos = ContentRange::end_position_excluding_delimiter(self.buffer.current_pos()); - let key_result = self.copy_on_escape.end_string(end_pos)?; - Ok(Event::Key(key_result)) - } - - fn extract_number_content( - &mut self, - start_pos: usize, - from_container_end: bool, - ) -> Result, ParseError> { - // Use shared number parsing with SliceParser-specific document end detection - let at_document_end = self.buffer.is_empty(); - crate::number_parser::parse_number_with_delimiter_logic( - &self.buffer, - start_pos, - from_container_end, - at_document_end, + // Use the unified ParserCore implementation with SliceParser-specific timing + self.parser_core.next_event_impl_unified( + &mut self.content_builder, + crate::parser_core::EscapeTiming::OnBegin, ) } } -impl<'a, 'b, C: BitStackConfig> EscapeHandler for SliceParser<'a, 'b, C> { - fn parser_state(&self) -> &State { - &self.parser_state.state - } - - fn process_unicode_escape_with_collector(&mut self) -> Result<(), ParseError> { - let current_pos = self.buffer.current_pos(); - let hex_slice_provider = |start, end| self.buffer.slice(start, end).map_err(Into::into); - - // Shared Unicode escape processing pattern - let had_pending_high_surrogate = self.unicode_escape_collector.has_pending_high_surrogate(); - - let mut utf8_buf = [0u8; 4]; - let (utf8_bytes_opt, escape_start_pos) = - crate::escape_processor::process_unicode_escape_sequence( - current_pos, - &mut self.unicode_escape_collector, - hex_slice_provider, - &mut utf8_buf, - )?; - - // Handle UTF-8 bytes if we have them (not a high surrogate waiting for low surrogate) - if let Some(utf8_bytes) = utf8_bytes_opt { - if had_pending_high_surrogate { - // This is completing a surrogate pair - need to consume both escapes - // First call: consume the high surrogate (6 bytes earlier) - self.copy_on_escape - .handle_unicode_escape(escape_start_pos, &[])?; - // Second call: consume the low surrogate and write UTF-8 - self.copy_on_escape - .handle_unicode_escape(escape_start_pos + 6, utf8_bytes)?; - } else { - // Single Unicode escape - normal processing - self.copy_on_escape - .handle_unicode_escape(escape_start_pos, utf8_bytes)?; - } - } - - Ok(()) - } - - fn handle_simple_escape_char(&mut self, escape_char: u8) -> Result<(), ParseError> { - self.copy_on_escape - .handle_escape(self.buffer.current_pos(), escape_char)?; - Ok(()) - } - - /// Append a single literal byte - implement as single-byte range for consistency - fn append_literal_byte(&mut self, _byte: u8) -> Result<(), ParseError> { - // SliceParser doesn't typically need per-byte processing since it works with ranges - // This could be implemented as a single-byte range if needed, but for now it's a no-op - Ok(()) - } -} - -impl<'a, 'b, C: BitStackConfig> PullParser for SliceParser<'a, 'b, C> { +impl PullParser for SliceParser<'_, '_, C> { fn next_event(&mut self) -> Result, ParseError> { + if self.content_builder.buffer().is_past_end() { + return Ok(Event::EndDocument); + } self.next_event_impl() } } -impl<'a, 'b, C: BitStackConfig> crate::shared::ByteProvider for SliceParser<'a, 'b, C> { +impl crate::shared::ByteProvider for SliceParser<'_, '_, C> { fn next_byte(&mut self) -> Result, ParseError> { - use crate::slice_input_buffer::InputBuffer; - match self.buffer.consume_byte() { + match self.content_builder.buffer_mut().consume_byte() { Ok(byte) => Ok(Some(byte)), Err(crate::slice_input_buffer::Error::ReachedEnd) => Ok(None), Err(err) => Err(err.into()), @@ -676,6 +487,7 @@ mod tests { #[test] fn test_unicode_escape_integration() { let input = r#"{"key": "Hello\u0041World"}"#; // \u0041 = 'A' + let mut scratch = [0u8; 1024]; let mut parser = SliceParser::with_buffer(input, &mut scratch); diff --git a/picojson/src/stream_buffer.rs b/picojson/src/stream_buffer.rs index ead407d..18b0932 100644 --- a/picojson/src/stream_buffer.rs +++ b/picojson/src/stream_buffer.rs @@ -146,10 +146,14 @@ impl<'a> StreamBuffer<'a> { return Err(StreamBufferError::BufferFull); } - let src_range = start_offset..start_offset.wrapping_add(span_len); - if src_range.end > self.buffer.len() { + let src_range_end = start_offset + .checked_add(span_len) + .ok_or(StreamBufferError::InvalidSliceBounds)?; + + if src_range_end > self.buffer.len() { return Err(StreamBufferError::InvalidSliceBounds); } + let src_range = start_offset..src_range_end; // Copy within the same buffer: move data from [start_offset..end] to [0..span_len] // Use our panic-free copy implementation @@ -268,6 +272,7 @@ impl<'a> StreamBuffer<'a> { #[cfg(test)] mod tests { use super::*; + use crate::shared::State; #[test] fn test_lifetime_expectations() { @@ -718,7 +723,6 @@ mod tests { if _string_start_pos < offset { // Original string start was discarded - must use escape/copy mode // In real implementation, parser would copy what it had processed to unescaped buffer - println!("String start was discarded, switching to escape mode"); _string_start_pos = 0; // Reset for escape mode } else { _string_start_pos = _string_start_pos.saturating_sub(offset); // Normal position update @@ -917,9 +921,6 @@ mod tests { fn test_position_update_state_transitions() { // Test the complete state transition logic for different parser states - // Mock the State enum variants and position update logic - use crate::shared::State; - // Case 1: State::None - no position to update let state = State::None; // No position updates needed for None state diff --git a/picojson/src/stream_content_builder.rs b/picojson/src/stream_content_builder.rs new file mode 100644 index 0000000..c163815 --- /dev/null +++ b/picojson/src/stream_content_builder.rs @@ -0,0 +1,401 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! ContentBuilder implementation for StreamParser using StreamBuffer. + +use crate::escape_processor::UnicodeEscapeCollector; +use crate::event_processor::{ContentExtractor, EscapeHandler}; +use crate::shared::{ContentRange, State}; +use crate::stream_buffer::StreamBuffer; +use crate::{Event, ParseError, String}; + +/// ContentBuilder implementation for StreamParser that uses StreamBuffer for streaming and escape processing +pub struct StreamContentBuilder<'b> { + /// StreamBuffer for single-buffer input and escape processing + stream_buffer: StreamBuffer<'b>, + /// Parser state tracking + parser_state: State, + /// Unicode escape collector for \uXXXX sequences + unicode_escape_collector: UnicodeEscapeCollector, + /// Flag to reset unescaped content on next operation + unescaped_reset_queued: bool, + /// Flag to track when we're inside a Unicode escape sequence (collecting hex digits) + in_unicode_escape: bool, + /// Flag to track when we're inside ANY escape sequence (like old implementation) + in_escape_sequence: bool, + /// Flag to track when the input stream has been finished (for number parsing) + finished: bool, +} + +impl<'b> StreamContentBuilder<'b> { + /// Create a new StreamContentBuilder + pub fn new(buffer: &'b mut [u8]) -> Self { + Self { + stream_buffer: StreamBuffer::new(buffer), + parser_state: State::None, + unicode_escape_collector: UnicodeEscapeCollector::new(), + unescaped_reset_queued: false, + in_unicode_escape: false, + in_escape_sequence: false, + finished: false, + } + } + + /// Fill the buffer from a reader + pub fn fill_buffer_from_reader( + &mut self, + reader: &mut R, + ) -> Result<(), crate::ParseError> { + // If buffer is full, try to compact it first (original compaction logic) + if self.stream_buffer.get_fill_slice().is_none() { + // Buffer is full - ALWAYS attempt compaction + let compact_start_pos = match self.parser_state { + crate::shared::State::Number(start_pos) => start_pos, + crate::shared::State::Key(start_pos) => start_pos, + crate::shared::State::String(start_pos) => start_pos, + _ => self.stream_buffer.current_position(), + }; + + let compaction_offset = self + .stream_buffer + .compact_from(compact_start_pos) + .map_err(crate::ParseError::from)?; + + if compaction_offset == 0 { + // SOL: Buffer too small for current token + return Err(crate::ParseError::ScratchBufferFull); + } + + // Update parser state positions after compaction (original logic) + self.update_positions_after_compaction(compaction_offset)?; + } + + if let Some(fill_slice) = self.stream_buffer.get_fill_slice() { + let bytes_read = reader + .read(fill_slice) + .map_err(|_| crate::ParseError::ReaderError)?; + self.stream_buffer + .mark_filled(bytes_read) + .map_err(crate::ParseError::from)?; + } + Ok(()) + } + + /// Update parser state positions after compaction (original logic) + fn update_positions_after_compaction( + &mut self, + compaction_offset: usize, + ) -> Result<(), crate::ParseError> { + // Update positions - since we compact from the token start position, + // positions should not be discarded in normal operation + match &mut self.parser_state { + crate::shared::State::None => { + // No position-based state to update + } + crate::shared::State::Key(pos) => { + if *pos >= compaction_offset { + *pos = pos.checked_sub(compaction_offset).unwrap_or(0); + } else { + // This shouldn't happen since we compact from the token start + *pos = 0; + } + } + crate::shared::State::String(pos) => { + if *pos >= compaction_offset { + *pos = pos.checked_sub(compaction_offset).unwrap_or(0); + } else { + // This shouldn't happen since we compact from the token start + *pos = 0; + } + } + crate::shared::State::Number(pos) => { + if *pos >= compaction_offset { + *pos = pos.checked_sub(compaction_offset).unwrap_or(0); + } else { + // This shouldn't happen since we compact from the token start + *pos = 0; + } + } + } + Ok(()) + } + + /// Get access to the stream buffer for byte operations + pub fn stream_buffer(&self) -> &StreamBuffer<'b> { + &self.stream_buffer + } + + /// Get mutable access to the stream buffer for byte operations + pub fn stream_buffer_mut(&mut self) -> &mut StreamBuffer<'b> { + &mut self.stream_buffer + } + + /// Set the finished state (called by StreamParser when input is exhausted) + pub fn set_finished(&mut self, finished: bool) { + self.finished = finished; + } + + /// Apply queued unescaped content reset if flag is set + pub fn apply_unescaped_reset_if_queued(&mut self) { + if self.unescaped_reset_queued { + self.stream_buffer.clear_unescaped(); + self.unescaped_reset_queued = false; + } + } + + /// Queue a reset of unescaped content for the next operation + fn queue_unescaped_reset(&mut self) { + self.unescaped_reset_queued = true; + } + + /// Helper to create an unescaped string from StreamBuffer + fn create_unescaped_string(&mut self) -> Result, ParseError> { + self.queue_unescaped_reset(); + let unescaped_slice = self.stream_buffer.get_unescaped_slice()?; + let str_content = crate::shared::from_utf8(unescaped_slice)?; + Ok(String::Unescaped(str_content)) + } + + /// Helper to create a borrowed string from StreamBuffer + fn create_borrowed_string( + &mut self, + content_start: usize, + ) -> Result, ParseError> { + let current_pos = self.stream_buffer.current_position(); + let (content_start, content_end) = + ContentRange::string_content_bounds_from_content_start(content_start, current_pos); + + let bytes = self + .stream_buffer + .get_string_slice(content_start, content_end)?; + let str_content = crate::shared::from_utf8(bytes)?; + Ok(String::Borrowed(str_content)) + } + + /// Start escape processing using StreamBuffer + fn start_escape_processing(&mut self) -> Result<(), ParseError> { + // Initialize escape processing with StreamBuffer if not already started + if !self.stream_buffer.has_unescaped_content() { + if let State::String(start_pos) | State::Key(start_pos) = self.parser_state { + let current_pos = self.stream_buffer.current_position(); + + // start_pos already points to content start position (not quote position) + let content_start = start_pos; + // Content to copy ends right before the escape character + let content_end = if self.unicode_escape_collector.has_pending_high_surrogate() { + // Skip copying high surrogate text when processing low surrogate + content_start + } else { + ContentRange::end_position_excluding_delimiter(current_pos) + }; + + // Estimate max length needed for unescaping (content so far + remaining buffer) + let content_len = content_end.wrapping_sub(content_start); + let max_escaped_len = self + .stream_buffer + .remaining_bytes() + .checked_add(content_len) + .ok_or(ParseError::NumericOverflow)?; + + // Start unescaping with StreamBuffer and copy existing content + self.stream_buffer.start_unescaping_with_copy( + max_escaped_len, + content_start, + content_end, + )?; + } + } + + Ok(()) + } +} + +impl ContentExtractor for StreamContentBuilder<'_> { + fn parser_state_mut(&mut self) -> &mut State { + &mut self.parser_state + } + + fn current_position(&self) -> usize { + self.stream_buffer.current_position() + } + + fn begin_string_content(&mut self, _pos: usize) { + // StreamParser doesn't need explicit string begin processing + // as it handles content accumulation automatically + } + + fn unicode_escape_collector_mut(&mut self) -> &mut UnicodeEscapeCollector { + &mut self.unicode_escape_collector + } + + fn extract_string_content(&mut self, start_pos: usize) -> Result, ParseError> { + let string = if self.stream_buffer.has_unescaped_content() { + self.create_unescaped_string()? + } else { + self.create_borrowed_string(start_pos)? + }; + Ok(Event::String(string)) + } + + fn extract_key_content(&mut self, start_pos: usize) -> Result, ParseError> { + let key = if self.stream_buffer.has_unescaped_content() { + self.queue_unescaped_reset(); + let unescaped_slice = self.stream_buffer.get_unescaped_slice()?; + let str_content = crate::shared::from_utf8(unescaped_slice)?; + String::Unescaped(str_content) + } else { + self.create_borrowed_string(start_pos)? + }; + Ok(Event::Key(key)) + } + + fn extract_number_content( + &mut self, + start_pos: usize, + from_container_end: bool, + ) -> Result, ParseError> { + // Use shared number parsing with StreamParser-specific document end detection + // StreamParser uses state-based detection: finished flag indicates true document end + let at_document_end = self.finished; + crate::number_parser::parse_number_with_delimiter_logic( + &self.stream_buffer, + start_pos, + from_container_end, + at_document_end, + ) + } + + fn extract_number( + &mut self, + start_pos: usize, + from_container_end: bool, + finished: bool, + ) -> Result, ParseError> { + // Use shared number parsing with StreamParser-specific document end detection + // StreamParser uses state-based detection: finished flag indicates true document end + let at_document_end = finished; + crate::number_parser::parse_number_with_delimiter_logic( + &self.stream_buffer, + start_pos, + from_container_end, + at_document_end, + ) + } +} + +impl crate::shared::ByteProvider for StreamContentBuilder<'_> { + fn next_byte(&mut self) -> Result, crate::ParseError> { + // This implementation doesn't have access to the reader + // It relies on StreamParser to fill the buffer before calling the unified method + + // If buffer is empty, cannot provide bytes + if self.stream_buffer.is_empty() { + return Ok(None); + } + + // Get byte and advance + let byte = self.stream_buffer.current_byte()?; + self.stream_buffer.advance()?; + + Ok(Some(byte)) + } +} + +impl StreamContentBuilder<'_> { + /// Handle byte accumulation for StreamParser-specific requirements + /// This method is called when a byte doesn't generate any events + pub fn handle_byte_accumulation(&mut self, byte: u8) -> Result<(), crate::ParseError> { + // Check if we're in a string or key state and should accumulate bytes + let in_string_mode = matches!(self.parser_state, State::String(_) | State::Key(_)); + + if in_string_mode { + // When unescaped content is active, we need to accumulate ALL string content + // This includes both regular characters and content after escape sequences + if self.stream_buffer.has_unescaped_content() { + // Follow old implementation pattern - do NOT write to escape buffer + // when inside ANY escape sequence (in_escape_sequence == true) + // This prevents hex digits from being accumulated as literal text + if !self.in_escape_sequence + && !self.unicode_escape_collector.has_pending_high_surrogate() + { + self.stream_buffer + .append_unescaped_byte(byte) + .map_err(crate::ParseError::from)?; + } + } + } + Ok(()) + } +} + +impl EscapeHandler for StreamContentBuilder<'_> { + fn parser_state(&self) -> &State { + &self.parser_state + } + + fn begin_unicode_escape(&mut self) -> Result<(), ParseError> { + // Called when Begin(UnicodeEscape) is received + self.in_unicode_escape = true; + self.in_escape_sequence = true; + Ok(()) + } + + fn process_unicode_escape_with_collector(&mut self) -> Result<(), ParseError> { + // Reset the escape flags + self.in_unicode_escape = false; + self.in_escape_sequence = false; + // Shared Unicode escape processing pattern - collect UTF-8 bytes first to avoid borrow conflicts + let utf8_bytes_result = { + let current_pos = self.stream_buffer.current_position(); + let hex_slice_provider = |start, end| { + self.stream_buffer + .get_string_slice(start, end) + .map_err(Into::into) + }; + + let mut utf8_buf = [0u8; 4]; + let (utf8_bytes_opt, _) = crate::escape_processor::process_unicode_escape_sequence( + current_pos, + &mut self.unicode_escape_collector, + hex_slice_provider, + &mut utf8_buf, + )?; + // Copy UTF-8 bytes to avoid borrow conflicts + utf8_bytes_opt.map(|bytes| { + let mut copy = [0u8; 4]; + let len = bytes.len(); + if let Some(dest) = copy.get_mut(..len) { + dest.copy_from_slice(bytes); + } + (copy, len) + }) + }; + + // Handle UTF-8 bytes if we have them (not a high surrogate waiting for low surrogate) + if let Some((utf8_bytes, len)) = utf8_bytes_result { + // StreamParser handles all escape sequences the same way - append bytes to escape buffer + // Use safe slice access to avoid panic + if let Some(valid_bytes) = utf8_bytes.get(..len) { + for &byte in valid_bytes { + self.stream_buffer + .append_unescaped_byte(byte) + .map_err(ParseError::from)?; + } + } + } + + Ok(()) + } + + fn handle_simple_escape_char(&mut self, escape_char: u8) -> Result<(), ParseError> { + // Clear the escape sequence flag when simple escape completes + self.in_escape_sequence = false; + self.stream_buffer + .append_unescaped_byte(escape_char) + .map_err(ParseError::from) + } + + fn begin_escape_sequence(&mut self) -> Result<(), ParseError> { + self.in_escape_sequence = true; + self.start_escape_processing() + } +} diff --git a/picojson/src/stream_parser.rs b/picojson/src/stream_parser.rs index 1542d9e..f04885e 100644 --- a/picojson/src/stream_parser.rs +++ b/picojson/src/stream_parser.rs @@ -1,16 +1,11 @@ // SPDX-License-Identifier: Apache-2.0 -use crate::escape_processor::UnicodeEscapeCollector; -use crate::event_processor::{ - finish_tokenizer, have_events, process_begin_escape_sequence_event, process_begin_events, - process_byte_through_tokenizer, process_simple_escape_event, process_simple_events, - process_unicode_escape_events, take_first_event, ContentExtractor, EscapeHandler, EventResult, -}; +use crate::event_processor::{ContentExtractor, EscapeHandler}; use crate::parse_error::ParseError; -use crate::shared::{ByteProvider, ContentRange, Event, ParserState}; -use crate::stream_buffer::StreamBuffer; +use crate::parser_core::ParserCore; +use crate::shared::{ByteProvider, Event, State}; +use crate::stream_content_builder::StreamContentBuilder; use crate::{ujson, PullParser}; -use ujson::{EventToken, Tokenizer}; use ujson::{BitStackConfig, DefaultConfig}; @@ -29,39 +24,17 @@ pub trait Reader { fn read(&mut self, buf: &mut [u8]) -> Result; } -/// Represents the processing state of the StreamParser -/// Enforces logical invariants: once Finished, no other processing states are possible -#[derive(Debug)] -enum ProcessingState { - /// Normal active processing - Active { - unescaped_reset_queued: bool, - in_escape_sequence: bool, - }, - /// All input consumed, tokenizer finished - Finished, -} - /// A pull parser that parses JSON from a stream. /// /// Generic over BitStackConfig for configurable nesting depth. /// It is designed to be used with the [Reader] trait, which is used to read data from a stream. /// pub struct StreamParser<'b, R: Reader, C: BitStackConfig = DefaultConfig> { - /// The tokenizer that processes JSON tokens - tokenizer: Tokenizer, - /// Parser state tracking - parser_state: ParserState, - /// Reader for streaming input - reader: R, - /// StreamBuffer for single-buffer input and escape processing - stream_buffer: StreamBuffer<'b>, - - /// Processing state machine that enforces logical invariants - processing_state: ProcessingState, - - /// Shared Unicode escape collector for \uXXXX sequences - unicode_escape_collector: UnicodeEscapeCollector, + /// The shared parser core that handles the unified event processing loop + parser_core: ParserCore, + /// The unified provider that handles both content building and reader access + /// This allows us to use the same unified pattern as SliceParser + provider: StreamParserProvider<'b, R>, } /// Methods for StreamParser using DefaultConfig @@ -96,434 +69,106 @@ impl<'b, R: Reader, C: BitStackConfig> StreamParser<'b, R, C> { /// ``` pub fn with_config(reader: R, buffer: &'b mut [u8]) -> Self { Self { - tokenizer: Tokenizer::new(), - parser_state: ParserState::new(), - reader, - stream_buffer: StreamBuffer::new(buffer), - - // Initialize new state machine to Active with default values - processing_state: ProcessingState::Active { - unescaped_reset_queued: false, - in_escape_sequence: false, - }, - - unicode_escape_collector: UnicodeEscapeCollector::new(), + parser_core: ParserCore::new(), + provider: StreamParserProvider::new(reader, buffer), } } } -/// Shared methods for StreamParser with any BitStackConfig -impl StreamParser<'_, R, C> { - /// Get the next JSON event from the stream - fn next_event_impl(&mut self) -> Result, ParseError> { - self.apply_unescaped_reset_if_queued(); - - loop { - while !have_events(&self.parser_state.evts) { - if !self.pull_tokenizer_events()? { - return Ok(Event::EndDocument); - } - } - - let taken_event = take_first_event(&mut self.parser_state.evts); - let Some(taken) = taken_event else { - return Err(crate::shared::UnexpectedState::StateMismatch.into()); - }; - - // Try shared event processors first - if let Some(result) = - process_simple_events(&taken).or_else(|| process_begin_events(&taken, self)) - { - match result { - EventResult::Complete(event) => return Ok(event), - EventResult::ExtractString => return self.validate_and_extract_string(), - EventResult::ExtractKey => return self.validate_and_extract_key(), - EventResult::ExtractNumber(from_container_end) => { - return self.validate_and_extract_number(from_container_end) - } - EventResult::Continue => continue, - } - } - - // Handle parser-specific events - match taken { - ujson::Event::Begin(EventToken::EscapeSequence) => { - process_begin_escape_sequence_event(self)?; - } - _ if process_unicode_escape_events(&taken, self)? => { - // Unicode escape events handled by shared function - } - ujson::Event::End( - escape_token @ (EventToken::EscapeQuote - | EventToken::EscapeBackslash - | EventToken::EscapeSlash - | EventToken::EscapeBackspace - | EventToken::EscapeFormFeed - | EventToken::EscapeNewline - | EventToken::EscapeCarriageReturn - | EventToken::EscapeTab), - ) => { - // StreamParser-specific: Handle simple escape sequences on End events - // because StreamBuffer must wait until the token ends to accumulate - // all bytes before processing the complete escape sequence - process_simple_escape_event(&escape_token, self)?; - } - _ => { - // All other events continue to next iteration - } - } - } - } - - /// Pull events from tokenizer and return whether parsing should continue - /// Returns false when finished (equivalent to ProcessingState::Finished) - fn pull_tokenizer_events(&mut self) -> Result { - if let Some(byte) = self.next_byte()? { - // Process byte through tokenizer using shared logic - process_byte_through_tokenizer(byte, &mut self.tokenizer, &mut self.parser_state.evts)?; - - // Handle byte accumulation if no event was generated (StreamParser-specific) - if !have_events(&self.parser_state.evts) { - self.handle_byte_accumulation(byte)?; - } - } else { - // Handle end of data with tokenizer finish - if !matches!(self.processing_state, ProcessingState::Finished) { - self.processing_state = ProcessingState::Finished; - - // Use shared logic for finish - finish_tokenizer(&mut self.tokenizer, &mut self.parser_state.evts)?; - } - - if !have_events(&self.parser_state.evts) { - return Ok(false); // Signal end of parsing - } - // Continue to process any events generated by finish() - } - Ok(true) // Continue parsing - } - - /// Helper to create an unescaped string from StreamBuffer - fn create_unescaped_string(&mut self) -> Result, ParseError> { - self.queue_unescaped_reset(); - let unescaped_slice = self.stream_buffer.get_unescaped_slice()?; - let str_content = crate::shared::from_utf8(unescaped_slice)?; - Ok(Event::String(crate::String::Unescaped(str_content))) - } - - /// Helper to create a borrowed string from StreamBuffer - fn create_borrowed_string( - &mut self, - content_start: usize, - ) -> Result, ParseError> { - let current_pos = self.stream_buffer.current_position(); - let (content_start, content_end) = - ContentRange::string_content_bounds_from_content_start(content_start, current_pos); - - let bytes = self - .stream_buffer - .get_string_slice(content_start, content_end)?; - let str_content = crate::shared::from_utf8(bytes)?; - Ok(Event::String(crate::String::Borrowed(str_content))) - } - - /// Helper to create an unescaped key from StreamBuffer - fn create_unescaped_key(&mut self) -> Result, ParseError> { - self.queue_unescaped_reset(); - let unescaped_slice = self.stream_buffer.get_unescaped_slice()?; - let str_content = crate::shared::from_utf8(unescaped_slice)?; - Ok(Event::Key(crate::String::Unescaped(str_content))) - } - - /// Helper to create a borrowed key from StreamBuffer - fn create_borrowed_key(&mut self, content_start: usize) -> Result, ParseError> { - let current_pos = self.stream_buffer.current_position(); - let (content_start, content_end) = - ContentRange::string_content_bounds_from_content_start(content_start, current_pos); - - let bytes = self - .stream_buffer - .get_string_slice(content_start, content_end)?; - let str_content = crate::shared::from_utf8(bytes)?; - Ok(Event::Key(crate::String::Borrowed(str_content))) - } - - /// Fill buffer from reader - fn fill_buffer_from_reader(&mut self) -> Result<(), ParseError> { - if let Some(fill_slice) = self.stream_buffer.get_fill_slice() { - let bytes_read = self - .reader - .read(fill_slice) - .map_err(|_| ParseError::ReaderError)?; - - self.stream_buffer.mark_filled(bytes_read)?; - } else { - // Buffer is full - ALWAYS attempt compaction - let compact_start_pos = match self.parser_state.state { - crate::shared::State::Number(start_pos) => start_pos, - crate::shared::State::Key(start_pos) => start_pos, - crate::shared::State::String(start_pos) => start_pos, - _ => self.stream_buffer.current_position(), - }; - - let offset = self.stream_buffer.compact_from(compact_start_pos)?; - - if offset == 0 { - // SOL: Buffer too small for current token - return Err(ParseError::ScratchBufferFull); - } - - // Update parser state positions - self.update_positions_after_compaction(offset)?; - - // Try to fill again after compaction - if let Some(fill_slice) = self.stream_buffer.get_fill_slice() { - let bytes_read = self - .reader - .read(fill_slice) - .map_err(|_| ParseError::ReaderError)?; +/// Implement the required traits for StreamParser to work with unified ParserCore +/// This provider handles the StreamParser-specific operations needed by the unified parser core +/// It bridges the gap between the generic ParserCore and the StreamParser's specific requirements +/// for streaming input and buffer management +/// The provider contains mutable references to the StreamParser's internal state +/// which allows the unified parser core to control the parsing process +pub struct StreamParserProvider<'b, R: Reader> { + content_builder: StreamContentBuilder<'b>, + reader: R, + finished: bool, +} - self.stream_buffer.mark_filled(bytes_read)?; - } +impl<'b, R: Reader> StreamParserProvider<'b, R> { + pub fn new(reader: R, buffer: &'b mut [u8]) -> Self { + Self { + content_builder: StreamContentBuilder::new(buffer), + reader, + finished: false, } - Ok(()) } +} - /// Update parser state positions after buffer compaction - fn update_positions_after_compaction(&mut self, offset: usize) -> Result<(), ParseError> { - // Check for positions that would be discarded and need escape mode - // CRITICAL: Position 0 is never discarded, regardless of offset - let needs_escape_mode = match &self.parser_state.state { - crate::shared::State::Key(pos) if *pos > 0 && *pos < offset => Some((*pos, true)), // true = is_key - crate::shared::State::String(pos) if *pos > 0 && *pos < offset => Some((*pos, false)), // false = is_string - crate::shared::State::Number(pos) if *pos > 0 && *pos < offset => { - return Err(ParseError::ScratchBufferFull); - } - _ => None, - }; - - // Handle escape mode transition if needed - if let Some((original_pos, is_key)) = needs_escape_mode { - if is_key { - self.switch_key_to_escape_mode(original_pos, offset)?; - } else { - self.switch_string_to_escape_mode(original_pos, offset)?; - } +impl ByteProvider for StreamParserProvider<'_, R> { + fn next_byte(&mut self) -> Result, ParseError> { + // If buffer is empty, try to fill it first + if self.content_builder.stream_buffer().is_empty() { + self.content_builder + .fill_buffer_from_reader(&mut self.reader)?; } - // Update positions - match &mut self.parser_state.state { - crate::shared::State::None => { - // No position-based state to update - } - crate::shared::State::Key(pos) => { - if *pos > 0 && *pos < offset { - *pos = 0; // Reset for escape mode - } else if *pos >= offset { - *pos = pos.checked_sub(offset).unwrap_or(0); // Safe position adjustment - } - // else: *pos == 0 or *pos < offset with pos == 0, keep as-is - } - crate::shared::State::String(pos) => { - if *pos > 0 && *pos < offset { - *pos = 0; // Reset for escape mode - } else if *pos >= offset { - *pos = pos.checked_sub(offset).unwrap_or(0); // Safe position adjustment - } - // else: *pos == 0 or *pos < offset with pos == 0, keep as-is - } - crate::shared::State::Number(pos) => { - if *pos >= offset { - *pos = pos.checked_sub(offset).unwrap_or(0); // Safe position adjustment - } else { - *pos = 0; // Reset for discarded number start - } + // If still empty after fill attempt, we're at EOF + if self.content_builder.stream_buffer().is_empty() { + if !self.finished { + self.finished = true; + self.content_builder.set_finished(true); } + return Ok(None); } - Ok(()) - } - - /// Switch key processing to escape/copy mode when original position was discarded - fn switch_key_to_escape_mode( - &mut self, - original_pos: usize, - offset: usize, - ) -> Result<(), ParseError> { - // The key start position was in the discarded portion of the buffer - - // For keys, the original_pos now points to the content start (after opening quote) - // If offset > original_pos, it means some actual content was discarded - - // Calculate how much actual key content was discarded - let content_start = original_pos; // Key content starts at original_pos (now tracks content directly) - let discarded_content = offset.saturating_sub(content_start); - - if discarded_content > 0 { - // We lost some actual key content - this would require content recovery - // For now, this is unsupported - return Err(ParseError::ScratchBufferFull); - } - - // No actual content was discarded, we can continue parsing - // We can continue parsing the key from the current position - Ok(()) + // Get byte and advance + let byte = self.content_builder.stream_buffer().current_byte()?; + let _current_pos = self.content_builder.stream_buffer().current_position(); + self.content_builder.stream_buffer_mut().advance()?; + Ok(Some(byte)) } +} - /// Switch string processing to escape/copy mode when original position was discarded - fn switch_string_to_escape_mode( - &mut self, - original_pos: usize, - offset: usize, - ) -> Result<(), ParseError> { - // The string start position was in the discarded portion of the buffer - - // For strings, the original_pos now points to the content start (after opening quote) - // If offset > original_pos, it means some actual content was discarded - - // Calculate how much actual string content was discarded - let content_start = original_pos; // String content starts at original_pos (now tracks content directly) - let discarded_content = offset.saturating_sub(content_start); - - if discarded_content > 0 { - // We lost some actual string content - this would require content recovery - // For now, this is unsupported - return Err(ParseError::ScratchBufferFull); - } - - // No actual content was discarded, we can continue parsing - // We can continue parsing the string from the current position - Ok(()) +impl EscapeHandler for StreamParserProvider<'_, R> { + fn parser_state(&self) -> &State { + self.content_builder.parser_state() } - /// Handle byte accumulation for strings/keys and Unicode escape sequences - fn handle_byte_accumulation(&mut self, byte: u8) -> Result<(), ParseError> { - // Use shared literal byte append logic - self.append_literal_byte(byte) + fn process_unicode_escape_with_collector(&mut self) -> Result<(), ParseError> { + self.content_builder.process_unicode_escape_with_collector() } - /// Start escape processing using StreamBuffer - fn start_escape_processing(&mut self) -> Result<(), ParseError> { - // Update escape state in enum - if let ProcessingState::Active { - ref mut in_escape_sequence, - .. - } = self.processing_state - { - *in_escape_sequence = true; - } - - // Initialize escape processing with StreamBuffer if not already started - if !self.stream_buffer.has_unescaped_content() { - if let crate::shared::State::String(start_pos) | crate::shared::State::Key(start_pos) = - self.parser_state.state - { - let current_pos = self.stream_buffer.current_position(); - - // With content tracking, start_pos is the content_start - let content_start = start_pos; - // Content to copy ends right before the escape character - let content_end = if self.unicode_escape_collector.has_pending_high_surrogate() { - // Skip copying high surrogate text when processing low surrogate - content_start - } else { - ContentRange::end_position_excluding_delimiter(current_pos) - }; - - // Estimate max length needed for unescaping (content so far + remaining buffer) - let content_len = content_end.wrapping_sub(content_start); - let max_escaped_len = self - .stream_buffer - .remaining_bytes() - .checked_add(content_len) - .ok_or(ParseError::NumericOverflow)?; - - // Start unescaping with StreamBuffer and copy existing content - self.stream_buffer.start_unescaping_with_copy( - max_escaped_len, - content_start, - content_end, - )?; - } - } - - Ok(()) + fn handle_simple_escape_char(&mut self, escape_char: u8) -> Result<(), ParseError> { + self.content_builder.handle_simple_escape_char(escape_char) } - /// Append a byte to the StreamBuffer's unescaped content - fn append_byte_to_escape_buffer(&mut self, byte: u8) -> Result<(), ParseError> { - self.stream_buffer - .append_unescaped_byte(byte) - .map_err(|e| e.into()) + fn begin_escape_sequence(&mut self) -> Result<(), ParseError> { + self.content_builder.begin_escape_sequence() } - /// Queue a reset of unescaped content for the next next_event() call - fn queue_unescaped_reset(&mut self) { - // Set the reset flag in the Active state - if let ProcessingState::Active { - ref mut unescaped_reset_queued, - .. - } = self.processing_state - { - *unescaped_reset_queued = true; - } - } - - /// Apply queued unescaped content reset if flag is set - fn apply_unescaped_reset_if_queued(&mut self) { - // Check the enum field first - let should_reset = if let ProcessingState::Active { - ref mut unescaped_reset_queued, - .. - } = self.processing_state - { - let needs_reset = *unescaped_reset_queued; - *unescaped_reset_queued = false; // Clear the flag - needs_reset - } else { - false - }; - - if should_reset { - self.stream_buffer.clear_unescaped(); - } + fn begin_unicode_escape(&mut self) -> Result<(), ParseError> { + self.content_builder.begin_unicode_escape() } } -impl<'b, R: Reader, C: BitStackConfig> ContentExtractor for StreamParser<'b, R, C> { - fn parser_state_mut(&mut self) -> &mut crate::shared::State { - &mut self.parser_state.state +impl ContentExtractor for StreamParserProvider<'_, R> { + fn parser_state_mut(&mut self) -> &mut State { + self.content_builder.parser_state_mut() } fn current_position(&self) -> usize { - self.stream_buffer.current_position() + self.content_builder.current_position() } - fn begin_string_content(&mut self, _pos: usize) { - // StreamParser doesn't need explicit string begin processing - // as it handles content accumulation automatically + fn begin_string_content(&mut self, pos: usize) { + self.content_builder.begin_string_content(pos); } fn unicode_escape_collector_mut( &mut self, ) -> &mut crate::escape_processor::UnicodeEscapeCollector { - &mut self.unicode_escape_collector + self.content_builder.unicode_escape_collector_mut() } fn extract_string_content(&mut self, start_pos: usize) -> Result, ParseError> { - if self.stream_buffer.has_unescaped_content() { - self.create_unescaped_string() - } else { - self.create_borrowed_string(start_pos) - } + self.content_builder.extract_string_content(start_pos) } fn extract_key_content(&mut self, start_pos: usize) -> Result, ParseError> { - if self.stream_buffer.has_unescaped_content() { - self.create_unescaped_key() - } else { - self.create_borrowed_key(start_pos) - } + self.content_builder.extract_key_content(start_pos) } fn extract_number_content( @@ -531,149 +176,70 @@ impl<'b, R: Reader, C: BitStackConfig> ContentExtractor for StreamParser<'b, R, start_pos: usize, from_container_end: bool, ) -> Result, ParseError> { - // Use shared number parsing with StreamParser-specific document end detection - let at_document_end = matches!(self.processing_state, ProcessingState::Finished); - crate::number_parser::parse_number_with_delimiter_logic( - &self.stream_buffer, - start_pos, - from_container_end, - at_document_end, - ) + self.content_builder + .extract_number_content(start_pos, from_container_end) } -} -impl<'b, R: Reader, C: BitStackConfig> EscapeHandler for StreamParser<'b, R, C> { - fn parser_state(&self) -> &crate::shared::State { - &self.parser_state.state + fn extract_number( + &mut self, + start_pos: usize, + from_container_end: bool, + finished: bool, + ) -> Result, ParseError> { + self.content_builder + .extract_number(start_pos, from_container_end, finished) } - fn process_unicode_escape_with_collector(&mut self) -> Result<(), crate::ParseError> { - // Shared Unicode escape processing pattern - collect UTF-8 bytes first to avoid borrow conflicts - let utf8_bytes_result = { - let current_pos = self.stream_buffer.current_position(); - let hex_slice_provider = |start, end| { - self.stream_buffer - .get_string_slice(start, end) - .map_err(Into::into) - }; - - let mut utf8_buf = [0u8; 4]; - let (utf8_bytes_opt, _escape_start_pos) = - crate::escape_processor::process_unicode_escape_sequence( - current_pos, - &mut self.unicode_escape_collector, - hex_slice_provider, - &mut utf8_buf, - )?; - - // Copy UTF-8 bytes to avoid borrow conflicts - utf8_bytes_opt.map(|bytes| { - let mut copy = [0u8; 4]; - let len = bytes.len(); - if let Some(dest) = copy.get_mut(..len) { - dest.copy_from_slice(bytes); - } - (copy, len) - }) + /// Override the default validate_and_extract_number to use the finished state + fn validate_and_extract_number( + &mut self, + from_container_end: bool, + ) -> Result, ParseError> { + let start_pos = match *self.parser_state() { + crate::shared::State::Number(pos) => pos, + _ => return Err(crate::shared::UnexpectedState::StateMismatch.into()), }; - // Handle UTF-8 bytes if we have them (not a high surrogate waiting for low surrogate) - if let Some((utf8_bytes, len)) = utf8_bytes_result { - // StreamParser handles all escape sequences the same way - append bytes to escape buffer - // Use safe slice access to avoid panic - if let Some(valid_bytes) = utf8_bytes.get(..len) { - for &byte in valid_bytes { - self.append_byte_to_escape_buffer(byte)?; - } - } - } - - // Update escape state in enum - Unicode escape processing is complete - if let ProcessingState::Active { - ref mut in_escape_sequence, - .. - } = self.processing_state - { - *in_escape_sequence = false; - } - - Ok(()) - } - - fn handle_simple_escape_char(&mut self, escape_char: u8) -> Result<(), crate::ParseError> { - // Update escape state in enum - if let ProcessingState::Active { - ref mut in_escape_sequence, - .. - } = self.processing_state - { - *in_escape_sequence = false; - } - - self.append_byte_to_escape_buffer(escape_char)?; - Ok(()) + *self.parser_state_mut() = crate::shared::State::None; + // Use the finished-aware extract_number method instead of extract_number_content + self.extract_number(start_pos, from_container_end, self.finished) } +} - /// Begin escape sequence processing - calls StreamParser's start_escape_processing - fn begin_escape_sequence(&mut self) -> Result<(), crate::ParseError> { - self.start_escape_processing() +/// Shared methods for StreamParser with any BitStackConfig +impl StreamParser<'_, R, C> { + /// Get the next JSON event from the stream + fn next_event_impl(&mut self) -> Result, ParseError> { + // Use the unified ParserCore implementation with StreamParser-specific timing + // This achieves the same pattern as SliceParser: self.parser_core.next_event_impl_unified() + + // StreamParser needs byte accumulation for string parsing, so use the accumulator version + self.parser_core.next_event_impl_unified_with_accumulator( + &mut self.provider, + crate::parser_core::EscapeTiming::OnEnd, + |provider, byte| { + // Delegate to the StreamContentBuilder's byte accumulation logic + provider.content_builder.handle_byte_accumulation(byte) + }, + ) } - /// Append a single literal byte - StreamParser's per-byte accumulation pattern - fn append_literal_byte(&mut self, byte: u8) -> Result<(), crate::ParseError> { - // Check if we're in a string or key state and should accumulate bytes - let in_string_mode = matches!( - self.parser_state.state, - crate::shared::State::String(_) | crate::shared::State::Key(_) - ); - - if in_string_mode { - // Access escape state from enum - let in_escape = if let ProcessingState::Active { - in_escape_sequence, .. - } = &self.processing_state - { - *in_escape_sequence - } else { - false - }; - - // Skip writing bytes to escape buffer when we have a pending high surrogate - // (prevents literal \uD801 text from being included in final string) - if !in_escape - && self.stream_buffer.has_unescaped_content() - && !self.unicode_escape_collector.has_pending_high_surrogate() - { - self.append_byte_to_escape_buffer(byte)?; - } - } - - Ok(()) - } + // The compaction and helper methods are now handled by the provider + // These methods can be removed since they're not needed with the new architecture } -impl<'b, R: Reader, C: BitStackConfig> PullParser for StreamParser<'b, R, C> { +impl PullParser for StreamParser<'_, R, C> { fn next_event(&mut self) -> Result, ParseError> { - self.next_event_impl() - } -} - -impl<'b, R: Reader, C: BitStackConfig> crate::shared::ByteProvider for StreamParser<'b, R, C> { - fn next_byte(&mut self) -> Result, ParseError> { - // If buffer is empty, try to fill it first - if self.stream_buffer.is_empty() { - self.fill_buffer_from_reader()?; + // Check if we're already finished (similar to SliceParser's is_past_end check) + if self.provider.finished { + return Ok(Event::EndDocument); } - // If still empty after fill attempt, we're at EOF - if self.stream_buffer.is_empty() { - return Ok(None); - } + self.provider + .content_builder + .apply_unescaped_reset_if_queued(); - // Get byte and advance - let byte = self.stream_buffer.current_byte()?; - self.stream_buffer.advance()?; - Ok(Some(byte)) + self.next_event_impl() } }