diff --git a/picojson/examples/push_parser_demo.rs b/picojson/examples/push_parser_demo.rs new file mode 100644 index 0000000..fec9309 --- /dev/null +++ b/picojson/examples/push_parser_demo.rs @@ -0,0 +1,125 @@ +// Example demonstrating PushParser with SAX-style event handling + +use picojson::{DefaultConfig, Event, ParseError, PushParseError, PushParser, PushParserHandler}; + +/// A simple event handler that prints JSON events as they arrive +struct JsonEventPrinter { + indent: usize, + event_count: usize, +} + +impl JsonEventPrinter { + fn new() -> Self { + Self { + indent: 0, + event_count: 0, + } + } + + fn indent_str(&self) -> String { + " ".repeat(self.indent) + } +} + +impl<'input, 'scratch> PushParserHandler<'input, 'scratch, ParseError> for JsonEventPrinter { + fn handle_event(&mut self, event: Event<'input, 'scratch>) -> Result<(), ParseError> { + self.event_count += 1; + + match event { + Event::StartObject => { + println!("{}🏁 StartObject", self.indent_str()); + self.indent += 1; + } + Event::EndObject => { + self.indent = self.indent.saturating_sub(1); + println!("{}🏁 EndObject", self.indent_str()); + } + Event::StartArray => { + println!("{}📋 StartArray", self.indent_str()); + self.indent += 1; + } + Event::EndArray => { + self.indent = self.indent.saturating_sub(1); + println!("{}📋 EndArray", self.indent_str()); + } + Event::Key(key) => { + println!("{}🔑 Key: '{}'", self.indent_str(), key.as_str()); + } + Event::String(s) => { + println!("{}📝 String: '{}'", self.indent_str(), s.as_str()); + } + Event::Number(num) => { + println!("{}🔢 Number: {}", self.indent_str(), num); + } + Event::Bool(b) => { + println!("{}✅ Bool: {}", self.indent_str(), b); + } + Event::Null => { + println!("{}⭕ Null", self.indent_str()); + } + Event::EndDocument => { + println!("{}🏁 EndDocument", self.indent_str()); + } + } + Ok(()) + } +} + +fn main() -> Result<(), PushParseError> { + println!("🚀 PushParser Demo - SAX-style JSON Processing"); + 
println!("==============================================="); + println!(); + + // Example JSON with various features to demonstrate push parsing + let json_chunks = vec![ + br#"{"name": "Pic"#.as_slice(), + br#"oJSON", "version": 1.0, "#.as_slice(), + br#""features": ["fast", "no_std""#.as_slice(), + br#", "zero\u0041lloc"], "escapes": "hello\nworld", "#.as_slice(), + br#""nested": {"data": [1, 2.5, true, null]}}"#.as_slice(), + ]; + + let full_json = json_chunks.concat(); + let json_str = std::str::from_utf8(&full_json)?; + + println!("📄 Input JSON: {}", json_str); + println!("📏 Total size: {} bytes", full_json.len()); + println!( + "📦 Processing in {} chunks (simulates streaming)", + json_chunks.len() + ); + println!(); + + // Create handler and parser + let handler = JsonEventPrinter::new(); + let mut buffer = [0u8; 512]; // Scratch buffer for escape processing + let buffer_size = buffer.len(); + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + + println!("🔄 Starting PushParser with incremental data feeding:"); + println!(" Buffer size: {} bytes", buffer_size); + println!(); + + // Feed data chunk by chunk to demonstrate streaming capability + for (i, chunk) in json_chunks.iter().enumerate() { + println!("📨 Processing chunk {} ({} bytes):", i + 1, chunk.len()); + let chunk_str = std::str::from_utf8(chunk)?; + println!(" Chunk data: {:?}", chunk_str); + + // Write chunk to parser - events are handled immediately + parser.write(chunk)?; + println!(); + } + + // Signal end of input and retrieve the handler + println!("🔚 Finishing parsing..."); + let handler = parser.finish()?; + + println!(); + println!( + "✅ Successfully processed {} events with PushParser!", + handler.event_count + ); + + Ok(()) +} diff --git a/picojson/src/event_processor.rs b/picojson/src/event_processor.rs index fc0d452..4f505d2 100644 --- a/picojson/src/event_processor.rs +++ b/picojson/src/event_processor.rs @@ -22,15 +22,29 @@ pub struct ParserCore { pub 
parser_state: ParserState,
    /// Tracks if the parser is currently inside any escape sequence (\n, \uXXXX, etc.)
    in_escape_sequence: bool,
    /// Whether this parser handles chunked input (true for PushParser, false for Slice/Stream).
    /// When true, running out of input returns EndOfData. When false, calls tokenizer.finish().
    handles_chunked_input: bool,
}

impl ParserCore {
    /// Create a new ParserCore for non-chunked parsers (SliceParser, StreamParser).
    pub fn new() -> Self {
        Self {
            tokenizer: Tokenizer::new(),
            parser_state: ParserState::new(),
            in_escape_sequence: false,
            handles_chunked_input: false,
        }
    }

    /// Create a new ParserCore for chunked parsers (PushParser).
    pub fn new_chunked() -> Self {
        Self {
            tokenizer: Tokenizer::new(),
            parser_state: ParserState::new(),
            in_escape_sequence: false,
            handles_chunked_input: true,
        }
    }

    // (hunk @ -87,15: interior of the event loop — the enclosing method's
    // header and tail lie outside this patch hunk)
    //
    //             byte_accumulator(provider, byte)?;
    //         }
    //     } else {
    //         // Handle end of input - behavior depends on parser type
    //         if self.handles_chunked_input {
    //             // For chunked parsers (PushParser), return EndOfData so they
    //             // can handle chunk boundaries.
    //             return Err(ParseError::EndOfData);
    //         } else {
    //             // For non-chunked parsers (SliceParser, StreamParser),
    //             // finish the document.
    //             {
    //                 let mut finish_callback =
    //                     create_tokenizer_callback(&mut self.parser_state.evts);
    //                 let _bytes_processed = self.tokenizer.finish(&mut finish_callback)?;
    //             } // Drop the callback to release the borrow
    //
    //             // If finish() generated events, process them. Otherwise,
    //             // return EndDocument.
    //             if !have_events(&self.parser_state.evts) {
    //                 return Ok(Event::EndDocument);
    //             }
    //         }
    //     }
}

// ---- picojson/src/lib.rs (additions) ----
mod push_content_builder;
mod push_parser;
pub use push_content_builder::PushParserHandler;
pub use push_parser::{PushParseError, PushParser};

// ---- picojson/src/push_content_builder.rs ----
// SPDX-License-Identifier: Apache-2.0

//! Content extractor for PushParser.

use crate::escape_processor::{EscapeProcessor, UnicodeEscapeCollector};
use crate::event_processor::ContentExtractor;
use crate::shared::{DataSource, State};
use crate::stream_buffer::StreamBuffer;
use crate::{Event, JsonNumber, ParseError, String};

/// A trait for handling events from a SAX-style push parser.
///
/// # Generic Parameters
///
/// * `'input` - Lifetime for the input data being parsed
/// * `'scratch` - Lifetime for the scratch buffer used for temporary storage
/// * `E` - The error type that can be returned by the handler
pub trait PushParserHandler<'input, 'scratch, E> {
    /// Handles a single, complete JSON event.
    fn handle_event(&mut self, event: Event<'input, 'scratch>) -> Result<(), E>;
}

/// Content extractor for PushParser.
+pub struct PushContentExtractor<'input, 'scratch> { + /// StreamBuffer for single-buffer input and escape processing + stream_buffer: StreamBuffer<'scratch>, + /// Parser state tracking + parser_state: State, + /// Unicode escape collector for \uXXXX sequences + unicode_escape_collector: UnicodeEscapeCollector, + /// Flag to reset unescaped content on next operation + unescaped_reset_queued: bool, + /// Position offset for tracking absolute positions across chunks + position_offset: usize, + /// Current position within the current chunk + current_position: usize, + /// Position where the current token started + token_start_pos: usize, + /// Whether we're using the unescaped buffer for current content + using_unescaped_buffer: bool, + /// The current chunk of data being processed + current_chunk: &'input [u8], + /// The cursor for the current chunk + chunk_cursor: usize, + /// Whether we're currently collecting Unicode escape hex digits + in_unicode_escape: bool, + /// Whether we're currently processing a simple escape sequence + in_simple_escape: bool, +} + +impl<'input, 'scratch> PushContentExtractor<'input, 'scratch> { + /// Create a new PushContentExtractor + pub fn new(buffer: &'scratch mut [u8]) -> Self { + Self { + stream_buffer: StreamBuffer::new(buffer), + parser_state: State::None, + unicode_escape_collector: UnicodeEscapeCollector::new(), + unescaped_reset_queued: false, + position_offset: 0, + current_position: 0, + token_start_pos: 0, + using_unescaped_buffer: false, + current_chunk: &[], + chunk_cursor: 0, + in_unicode_escape: false, + in_simple_escape: false, + } + } + + /// Set the current chunk of data to be processed + pub fn set_chunk(&mut self, chunk: &'input [u8]) { + self.current_chunk = chunk; + self.chunk_cursor = 0; + } + + /// Reset input processing state + pub fn reset_input(&mut self) { + self.current_chunk = &[]; + self.chunk_cursor = 0; + } + + /// Update the current position + pub fn set_current_position(&mut self, pos: usize) { + 
self.current_position = pos; + } + + /// Update the position offset for chunk processing + pub fn set_position_offset(&mut self, offset: usize) { + self.position_offset = offset; + } + + /// Update position offset by adding to it + pub fn add_position_offset(&mut self, amount: usize) { + self.position_offset += amount; + } + + /// Set the token start position + pub fn set_token_start_pos(&mut self, pos: usize) { + self.token_start_pos = pos; + } + + /// Get the token start position + pub fn token_start_pos(&self) -> usize { + self.token_start_pos + } + + /// Set whether we're using the unescaped buffer + pub fn set_using_unescaped_buffer(&mut self, using: bool) { + self.using_unescaped_buffer = using; + } + + /// Check if we're using the unescaped buffer + pub fn using_unescaped_buffer(&self) -> bool { + self.using_unescaped_buffer + } + + /// Clear the unescaped buffer + pub fn clear_unescaped(&mut self) { + self.stream_buffer.clear_unescaped(); + } + + /// Append a byte to the unescaped buffer + pub fn append_unescaped_byte(&mut self, byte: u8) -> Result<(), ParseError> { + self.stream_buffer + .append_unescaped_byte(byte) + .map_err(ParseError::from) + } + + /// Truncate unescaped content by removing bytes from the end + pub fn truncate_unescaped_by(&mut self, count: usize) { + self.stream_buffer.truncate_unescaped_by(count); + } + + /// Get the position offset + pub fn position_offset(&self) -> usize { + self.position_offset + } + + /// Get mutable access to the unicode escape collector + pub fn unicode_escape_collector_mut(&mut self) -> &mut UnicodeEscapeCollector { + &mut self.unicode_escape_collector + } + + /// Process simple escape sequence events that have similar patterns between parsers + pub fn process_simple_escape_event( + &mut self, + escape_token: &crate::ujson::EventToken, + ) -> Result<(), ParseError> { + // Clear any pending high surrogate state when we encounter a simple escape + // This ensures that interrupted surrogate pairs (like 
\uD801\n\uDC37) are properly rejected + self.unicode_escape_collector_mut().reset_all(); + + // Use unified escape token processing from EscapeProcessor + let unescaped_char = EscapeProcessor::process_escape_token(escape_token)?; + + // Only process if we're inside a string or key + match self.parser_state { + State::String(_) | State::Key(_) => { + self.append_unescaped_byte(unescaped_char)?; + } + _ => {} // Ignore if not in string/key context + } + + Ok(()) + } + + /// Apply queued unescaped content reset if needed + pub fn apply_unescaped_reset_if_queued(&mut self) { + if self.unescaped_reset_queued { + self.stream_buffer.clear_unescaped(); + self.unescaped_reset_queued = false; + self.using_unescaped_buffer = false; // Always reset the flag when buffer is cleared + } + } + + /// Handle byte accumulation with selective logic based on current state + pub fn handle_byte_accumulation(&mut self, byte: u8) -> Result<(), ParseError> { + // Check if we're currently processing any type of escape sequence + if self.in_unicode_escape { + // During Unicode escape processing, try to feed hex digits directly to the collector + if crate::escape_processor::EscapeProcessor::validate_hex_digit(byte).is_ok() { + let is_complete = self.unicode_escape_collector.add_hex_digit(byte)?; + if is_complete { + // Process the complete escape sequence immediately + let mut utf8_buffer = [0u8; 4]; + let (utf8_bytes_opt, _surrogate_state_changed) = self + .unicode_escape_collector + .process_to_utf8(&mut utf8_buffer)?; + + if let Some(utf8_bytes) = utf8_bytes_opt { + // Write the UTF-8 bytes directly to the scratch buffer + for &utf8_byte in utf8_bytes { + self.stream_buffer + .append_unescaped_byte(utf8_byte) + .map_err(ParseError::from)?; + } + } + // Reset collector and exit Unicode escape mode + self.unicode_escape_collector.reset(); + self.in_unicode_escape = false; + } + return Ok(()); + } else { + // Non-hex digit during Unicode escape - this shouldn't happen in valid JSON + 
self.in_unicode_escape = false; + } + } else if self.in_simple_escape { + // Check if this is the start of a Unicode escape (\uXXXX) + if byte == b'u' { + // This is a Unicode escape - do NOT accumulate the 'u', let the escape processor handle it + self.in_simple_escape = false; + return Ok(()); // Skip accumulation for 'u' in Unicode escapes + } else { + // This is a simple escape - skip the raw escape character + self.in_simple_escape = false; + return Ok(()); + } + } + + // Regular byte accumulation logic for non-hex digits or when not in Unicode escape + let should_accumulate = match self.parser_state { + State::String(_) | State::Key(_) => { + // We're in string/key context - accumulate if using unescaped buffer + // BUT: skip accumulation of escape characters when in Unicode escape mode + // OR when we encounter a backslash (which will be handled by escape processor) + if self.in_unicode_escape || self.in_simple_escape { + // Don't accumulate escape characters - they're handled by escape processors + false + } else if byte == b'\\' { + // Don't accumulate backslashes - they trigger escape processing + false + } else if byte == b'"' { + // Don't accumulate closing quotes - they mark end of string + false + } else { + self.using_unescaped_buffer + } + } + State::Number(_) => { + // We're in number context - accumulate if using unescaped buffer (for numbers spanning chunks) + self.using_unescaped_buffer + } + _ => false, // Not in string/key/number context - don't accumulate + }; + + if should_accumulate { + self.append_unescaped_byte(byte)?; + } + + Ok(()) + } + + /// Queue a reset of unescaped content for the next operation + fn queue_unescaped_reset(&mut self) { + self.unescaped_reset_queued = true; + } +} + +impl ContentExtractor for PushContentExtractor<'_, '_> { + fn next_byte(&mut self) -> Result, ParseError> { + if self.chunk_cursor < self.current_chunk.len() { + let byte = self.current_chunk[self.chunk_cursor]; + self.chunk_cursor += 1; + 
self.current_position = self.position_offset + self.chunk_cursor - 1; + Ok(Some(byte)) + } else { + Ok(None) + } + } + + fn current_position(&self) -> usize { + self.current_position + } + + fn begin_string_content(&mut self, pos: usize) { + self.token_start_pos = pos; + self.using_unescaped_buffer = false; + self.stream_buffer.clear_unescaped(); + } + + fn parser_state_mut(&mut self) -> &mut State { + &mut self.parser_state + } + + fn parser_state(&self) -> &State { + &self.parser_state + } + + fn unicode_escape_collector_mut(&mut self) -> &mut UnicodeEscapeCollector { + &mut self.unicode_escape_collector + } + + fn extract_string_content(&mut self, start_pos: usize) -> Result, ParseError> { + if self.using_unescaped_buffer { + // We have unescaped content - use it + self.queue_unescaped_reset(); + let content_slice = self.get_unescaped_slice()?; + let content_str = core::str::from_utf8(content_slice)?; + Ok(Event::String(String::Unescaped(content_str))) + } else { + // No escapes - use borrowed content + // PushParser: current_position points AT the closing quote, but get_content_piece expects + // position AFTER the closing quote, so add 1 + let content_piece = + crate::shared::get_content_piece(self, start_pos + 1, self.current_position + 1)?; + content_piece.into_string().map(Event::String) + } + } + + fn extract_key_content(&mut self, start_pos: usize) -> Result, ParseError> { + if self.using_unescaped_buffer { + // Content is in scratch buffer - get the complete token from there + self.queue_unescaped_reset(); + let content_slice = self.get_unescaped_slice()?; + let content_str = core::str::from_utf8(content_slice)?; + Ok(Event::Key(String::Unescaped(content_str))) + } else { + // The entire token was contained in the current chunk - use direct extraction + let content_piece = + crate::shared::get_content_piece(self, start_pos + 1, self.current_position + 1)?; + content_piece.into_string().map(Event::Key) + } + } + + fn extract_number( + &mut self, + 
start_pos: usize, + _from_container_end: bool, + _finished: bool, + ) -> Result, ParseError> { + let number_bytes = if self.using_unescaped_buffer { + // Content is in scratch buffer - get the complete token from there + self.queue_unescaped_reset(); + self.get_unescaped_slice()? + } else { + // The entire token was contained in the current chunk - use direct extraction + let content_piece = + crate::shared::get_content_piece(self, start_pos + 1, self.current_position + 1)?; + content_piece.as_bytes() + }; + + let json_number = JsonNumber::from_slice(number_bytes)?; + Ok(Event::Number(json_number)) + } + + fn process_unicode_escape_with_collector(&mut self) -> Result<(), ParseError> { + // With the selective accumulation approach, Unicode escape processing should have + // already happened during byte accumulation via handle_byte_accumulation(). + // This method is called at the end of a Unicode escape sequence by the event processor. + // If the collector still has incomplete data, it means we're dealing with chunked input + // where hex digits span chunk boundaries, OR we have a bug where hex digits aren't + // being fed properly. 
+ Ok(()) + } + + fn handle_simple_escape_char(&mut self, escape_char: u8) -> Result<(), ParseError> { + // Now we know this is definitely a simple escape, not Unicode + self.in_simple_escape = false; // Reset flag since we're processing it now + + if self.using_unescaped_buffer { + self.stream_buffer + .append_unescaped_byte(escape_char) + .map_err(ParseError::from) + } else { + // This shouldn't happen if begin_escape_sequence was called properly + Err(ParseError::Unexpected( + crate::shared::UnexpectedState::StateMismatch, + )) + } + } + + fn begin_escape_sequence(&mut self) -> Result<(), ParseError> { + // Implement copy-on-escape: copy the clean part before the escape to unescaped buffer + if !self.using_unescaped_buffer { + if let State::String(start_pos) | State::Key(start_pos) = self.parser_state { + // start_pos points to the opening quote, so content starts at start_pos + 1 + let content_start = start_pos + 1; + // Current position is where the escape character (\) is located + // We want to copy content up to (but not including) the escape character + let content_end = self.current_position; + + // Copy the clean part to the unescaped buffer + if content_end > content_start { + // Convert absolute positions to relative positions within the current data chunk + let slice_start = content_start.saturating_sub(self.position_offset); + let slice_end = content_end.saturating_sub(self.position_offset); + + if slice_end <= self.current_chunk.len() && slice_start <= slice_end { + let clean_slice = &self.current_chunk[slice_start..slice_end]; + + for &byte in clean_slice { + self.stream_buffer.append_unescaped_byte(byte)?; + } + } else { + return Err(ParseError::Unexpected( + crate::shared::UnexpectedState::InvalidSliceBounds, + )); + } + } + + // Mark that we're now using the unescaped buffer + self.using_unescaped_buffer = true; + } + } + + // Set a general escape flag to skip the next byte (which will be the escape character) + // This will be overridden if 
begin_unicode_escape is called + self.in_simple_escape = true; + self.in_unicode_escape = false; + Ok(()) + } + + fn begin_unicode_escape(&mut self) -> Result<(), ParseError> { + // Start of unicode escape sequence - reset collector for new sequence and enter escape mode + // Note: we preserve pending high surrogate state for surrogate pair processing + self.unicode_escape_collector.reset(); + self.in_unicode_escape = true; + self.in_simple_escape = false; // Override the simple escape flag set by begin_escape_sequence + + // CRITICAL: The tokenizer processes \u and the first hex digit before emitting Begin(UnicodeEscape) + // Since we no longer accumulate the 'u' character, we only need to handle the first hex digit + // that was accumulated before this event arrived + if self.using_unescaped_buffer { + // Get current buffer content and check if it ends with a hex digit (the first one) + if let Ok(current_content) = self.stream_buffer.get_unescaped_slice() { + if !current_content.is_empty() { + let hex_pos = current_content.len() - 1; + + if crate::escape_processor::EscapeProcessor::validate_hex_digit( + current_content[hex_pos], + ) + .is_ok() + { + let first_hex_digit = current_content[hex_pos]; + + // Remove the last hex digit by truncating the buffer + self.stream_buffer.truncate_unescaped_by(1); + + // Now feed the first hex digit to the Unicode collector + let is_complete = self + .unicode_escape_collector + .add_hex_digit(first_hex_digit)?; + if is_complete { + // This shouldn't happen for the first hex digit, but handle it just in case + } + } + } + } + } + + Ok(()) + } +} + +impl<'input, 'scratch> DataSource<'input, 'scratch> for PushContentExtractor<'input, 'scratch> { + fn get_borrowed_slice( + &'input self, + start: usize, + end: usize, + ) -> Result<&'input [u8], ParseError> { + // For now, always try to read from current input chunk regardless of escape mode + // The issue was that process_unicode_escape_sequence calls this directly to get hex digits 
+ // But for PushParser, hex digits might not be in the current chunk due to chunked processing + + // Convert absolute positions to relative positions within the current data chunk + let slice_start = start.saturating_sub(self.position_offset); + let slice_end = end.saturating_sub(self.position_offset); + + // Check if the requested range is within the current chunk + if slice_end > self.current_chunk.len() || slice_start > slice_end { + return Err(ParseError::Unexpected( + crate::shared::UnexpectedState::InvalidSliceBounds, + )); + } + + let result = &self.current_chunk[slice_start..slice_end]; + Ok(result) + } + + fn get_unescaped_slice(&'scratch self) -> Result<&'scratch [u8], ParseError> { + self.stream_buffer + .get_unescaped_slice() + .map_err(ParseError::from) + } + + fn has_unescaped_content(&self) -> bool { + self.using_unescaped_buffer + } +} + +impl PushContentExtractor<'_, '_> { + /// Copy partial content from current chunk to scratch buffer when chunk boundary reached + pub fn copy_partial_content_to_scratch(&mut self) -> Result<(), ParseError> { + // Determine the start of the current token content based on parser state + let content_start = match self.parser_state { + State::String(start_pos) | State::Key(start_pos) => { + // For strings and keys, content starts after the opening quote + start_pos + 1 + } + State::Number(start_pos) => { + // For numbers, start_pos points to the character before the first digit + // so we need to add 1 to get to the actual number content + start_pos + 1 + } + _ => { + return Ok(()); + } + }; + + // The end is the current position (where we are in the chunk) + let content_end = self.current_position + 1; + + // Copy the slice of partial content from the current chunk + if content_end > content_start { + // Get the range within the current chunk + let slice_start = content_start.saturating_sub(self.position_offset); + let slice_end = content_end.saturating_sub(self.position_offset); + + if slice_end <= 
self.current_chunk.len() && slice_start < slice_end { + let partial_slice = &self.current_chunk[slice_start..slice_end]; + + // Copy these bytes directly into the stream_buffer (the scratch space) + for &byte in partial_slice { + self.stream_buffer + .append_unescaped_byte(byte) + .map_err(ParseError::from)?; + } + + // Activate scratch buffer mode so subsequent content is also appended + self.using_unescaped_buffer = true; + } + } + + Ok(()) + } +} diff --git a/picojson/src/push_parser.rs b/picojson/src/push_parser.rs new file mode 100644 index 0000000..97e83c6 --- /dev/null +++ b/picojson/src/push_parser.rs @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! A SAX-style JSON push parser. +//! +//! Clean implementation based on handler_design pattern with proper HRTB lifetime management. + +use crate::event_processor::{ContentExtractor, EscapeTiming, ParserCore}; +use crate::push_content_builder::{PushContentExtractor, PushParserHandler}; +use crate::shared::{DataSource, State}; +use crate::stream_buffer::StreamBufferError; +use crate::{ujson, BitStackConfig, Event, ParseError}; + +/// A SAX-style JSON push parser. +/// +/// Generic over BitStack storage type for configurable nesting depth. Parsing +/// events are returned to the handler. 
+/// +/// # Generic Parameters +/// +/// * `'scratch` - Lifetime for the scratch buffer used for temporary storage +/// * `H` - The event handler type that implements [`PushParserHandler`] +/// * `C` - BitStack configuration type that implements [`BitStackConfig`] +pub struct PushParser<'input, 'scratch, H, C> +where + C: BitStackConfig, +{ + /// Content extractor that handles content extraction and event emission + extractor: PushContentExtractor<'input, 'scratch>, + /// The handler that receives events + handler: H, + /// Core parser logic shared with other parsers + core: ParserCore, +} + +impl<'input, 'scratch, H, C> PushParser<'input, 'scratch, H, C> +where + C: BitStackConfig, +{ + /// Creates a new `PushParser`. + pub fn new(handler: H, buffer: &'scratch mut [u8]) -> Self { + Self { + extractor: PushContentExtractor::new(buffer), + handler, + core: ParserCore::new_chunked(), + } + } + + /// Processes a chunk of input data. + pub fn write(&mut self, data: &'input [u8]) -> Result<(), PushParseError> + where + H: for<'a, 'b> PushParserHandler<'a, 'b, E>, + E: From, + { + // Apply any queued buffer resets + self.extractor.apply_unescaped_reset_if_queued(); + + // Set the input slice for the extractor to iterate over + self.extractor.set_chunk(data); + + // Use ParserCore to process all bytes in the chunk + loop { + match self.core.next_event_impl_with_flags( + &mut self.extractor, + EscapeTiming::OnEnd, // PushParser uses OnEnd timing like StreamParser + |extractor, byte| { + // Selective accumulation: let PushContentExtractor decide based on its state + // whether this byte should be accumulated or processed directly + extractor.handle_byte_accumulation(byte) + }, + true, // always_accumulate_during_escapes: ensure all hex digits reach the accumulator + ) { + Ok(Event::EndDocument) => { + // EndDocument during write() means we've consumed all bytes in current chunk + break; + } + Ok(event) => { + // Handle all other events normally + self.handler + 
.handle_event(event) + .map_err(PushParseError::Handler)?; + + // Apply any queued buffer resets after the event has been processed + // This ensures that buffer content from previous tokens doesn't leak into subsequent ones + self.extractor.apply_unescaped_reset_if_queued(); + } + Err(ParseError::EndOfData) => { + // No more events available from current chunk + break; + } + Err(e) => { + return Err(PushParseError::Parse(e)); + } + } + } + + // Check for chunk boundary condition - if still processing a token when chunk ends + let extractor_state = self.extractor.parser_state(); + + if matches!( + extractor_state, + State::String(_) | State::Key(_) | State::Number(_) + ) { + // If we haven't already started using the scratch buffer (e.g., due to escapes) + if !self.extractor.has_unescaped_content() { + // Copy the partial content from this chunk to scratch buffer before it's lost + self.extractor.copy_partial_content_to_scratch()?; + } else { + // Special case: For Numbers, check if the scratch buffer is actually empty + // This handles the byte-by-byte case where the flag is stale from previous Key processing + if matches!(extractor_state, State::Number(_)) { + let buffer_slice = self.extractor.get_unescaped_slice().unwrap_or(&[]); + let buffer_empty = buffer_slice.is_empty(); + + if buffer_empty { + self.extractor.copy_partial_content_to_scratch()?; + } + } + } + } + + // Reset input slice + self.extractor.reset_input(); + + // Update position offset for next call + self.extractor.add_position_offset(data.len()); + + Ok(()) + } + + /// Finishes parsing, flushes any remaining events, and returns the handler. + /// This method consumes the parser. 
+ pub fn finish(mut self) -> Result> + where + H: for<'a, 'b> PushParserHandler<'a, 'b, E>, + { + // Check that the JSON document is complete (all containers closed) + // Use a no-op callback since we don't expect any more events + let mut no_op_callback = |_event: ujson::Event, _pos: usize| {}; + let _bytes_processed = self.core.tokenizer.finish(&mut no_op_callback)?; + + // Handle any remaining content in the buffer + if *self.extractor.parser_state() != State::None { + return Err(crate::push_parser::PushParseError::Parse( + ParseError::EndOfData, + )); + } + + // Emit EndDocument event + self.handler + .handle_event(Event::EndDocument) + .map_err(PushParseError::Handler)?; + + Ok(self.handler) + } +} + +/// An error that can occur during push-based parsing. +#[derive(Debug, PartialEq)] +pub enum PushParseError { + /// An error occurred within the parser itself. + Parse(ParseError), + /// An error was returned by the user's handler. + Handler(E), +} + +impl From for PushParseError { + fn from(e: ujson::Error) -> Self { + PushParseError::Parse(e.into()) + } +} + +impl From for PushParseError { + fn from(e: ParseError) -> Self { + PushParseError::Parse(e) + } +} + +impl From for PushParseError { + fn from(e: StreamBufferError) -> Self { + PushParseError::Parse(e.into()) + } +} + +impl From for PushParseError { + fn from(e: core::str::Utf8Error) -> Self { + PushParseError::Parse(ParseError::InvalidUtf8(e)) + } +} diff --git a/picojson/src/shared.rs b/picojson/src/shared.rs index 3829bb0..e609503 100644 --- a/picojson/src/shared.rs +++ b/picojson/src/shared.rs @@ -244,6 +244,14 @@ where } } } + + /// Returns the underlying byte slice, whether from input or scratch. 
+ pub fn as_bytes(&self) -> &'scratch [u8] { + match self { + ContentPiece::Input(bytes) => bytes, + ContentPiece::Scratch(bytes) => bytes, + } + } } pub fn from_utf8(v: &[u8]) -> Result<&str, ParseError> { diff --git a/picojson/src/stream_buffer.rs b/picojson/src/stream_buffer.rs index 2e15772..322dc71 100644 --- a/picojson/src/stream_buffer.rs +++ b/picojson/src/stream_buffer.rs @@ -255,6 +255,11 @@ impl<'a> StreamBuffer<'a> { } } + /// Truncate unescaped content by removing the specified number of bytes from the end + pub fn truncate_unescaped_by(&mut self, count: usize) { + self.unescaped_len = self.unescaped_len.saturating_sub(count); + } + /// Get a string slice from the buffer (zero-copy) /// Used for strings without escapes pub fn get_string_slice(&self, start: usize, end: usize) -> Result<&[u8], StreamBufferError> { diff --git a/picojson/tests/input_buffer_full_test.rs b/picojson/tests/input_buffer_full_test.rs new file mode 100644 index 0000000..2923fb3 --- /dev/null +++ b/picojson/tests/input_buffer_full_test.rs @@ -0,0 +1,136 @@ +// Test for InputBufferFull error variant +use picojson::{ParseError, PullParser, StreamParser}; +use std::io; + +/// Mock reader that simulates a scenario where input buffer limits could be exceeded +struct LargeDataReader { + data: Vec, + position: usize, + chunk_size: usize, +} + +impl LargeDataReader { + fn new(json_data: &str, chunk_size: usize) -> Self { + Self { + data: json_data.as_bytes().to_vec(), + position: 0, + chunk_size, + } + } +} + +impl picojson::Reader for LargeDataReader { + type Error = io::Error; + + fn read(&mut self, buf: &mut [u8]) -> Result { + if self.position >= self.data.len() { + return Ok(0); // End of stream + } + + let remaining = self.data.len() - self.position; + let to_read = std::cmp::min(std::cmp::min(buf.len(), self.chunk_size), remaining); + + buf[..to_read].copy_from_slice(&self.data[self.position..self.position + to_read]); + self.position += to_read; + + Ok(to_read) + } +} + +#[test] 
+fn test_input_buffer_full_scenario() { + // Create a very large JSON document that could potentially overflow input buffers + let large_object = format!( + r#"{{"key": "{}"}}"#, + "x".repeat(10000) // Very long string value + ); + + // Use a very small buffer that would be insufficient for the large content + let mut buffer = [0u8; 32]; // Intentionally small buffer + let reader = LargeDataReader::new(&large_object, 16); // Small read chunks + + let mut parser = StreamParser::new(reader, &mut buffer); + + // Attempt to parse the large document with insufficient buffer space + let mut events = Vec::new(); + loop { + match parser.next_event() { + Ok(event) => { + events.push(format!("{:?}", event)); + if matches!(event, picojson::Event::EndDocument) { + break; + } + } + Err(e) => { + // InputBufferFull is now properly implemented as of stream_content_builder.rs fix + if matches!( + e, + ParseError::InputBufferFull | ParseError::ScratchBufferFull + ) { + // This is an expected error for oversized tokens. + return; + } + panic!("Unexpected error: {:?}", e); + } + } + } + + // If we reach here, the parser somehow managed to handle the large document + // This is unexpected behavior that should cause the test to fail + panic!( + "Test should have failed: Parser unexpectedly succeeded in handling large document with small buffer. 
\ + Expected ScratchBufferFull or InputBufferFull error, but got {} events: {:?}", + events.len(), + events + ); +} + +#[test] +fn test_input_buffer_full_with_extremely_long_token() { + // Test with an extremely long single token that exceeds reasonable input buffer limits + let extremely_long_key = "k".repeat(50000); + let json = format!(r#"{{"{key}": "value"}}"#, key = extremely_long_key); + + let mut buffer = [0u8; 64]; // Very small buffer + let reader = LargeDataReader::new(&json, 32); + + let mut parser = StreamParser::new(reader, &mut buffer); + + match parser.next_event() { + Ok(_) => { + // Continue parsing to see what happens + loop { + match parser.next_event() { + Ok(event) => { + if matches!(event, picojson::Event::EndDocument) { + break; + } + } + Err(e) => { + if matches!( + e, + ParseError::InputBufferFull | ParseError::ScratchBufferFull + ) { + // This is an expected error for extremely long tokens. + return; + } + panic!("Unexpected error for extremely long token: {:?}", e); + } + } + } + } + Err(e) => { + match e { + ParseError::ScratchBufferFull | ParseError::InputBufferFull => { + // This is an expected error for extremely long tokens. 
+                }
+                _ => {
+                    panic!(
+                        "Unexpected error on first event for extremely long token: {:?}",
+                        e
+                    );
+                }
+            }
+        }
+    }
+}
diff --git a/picojson/tests/json_checker_tests.rs b/picojson/tests/json_checker_tests.rs
index 5de0d15..299c31c 100644
--- a/picojson/tests/json_checker_tests.rs
+++ b/picojson/tests/json_checker_tests.rs
@@ -14,7 +14,10 @@
 #[cfg(feature = "remote-tests")]
 mod json_checker_tests {
-    use picojson::{Event, ParseError, PullParser, SliceParser};
+    use picojson::{
+        ChunkReader, DefaultConfig, Event, ParseError, PullParser, PushParseError, PushParser,
+        PushParserHandler, SliceParser, StreamParser,
+    };
     use std::fs;
     use std::path::Path;
@@ -33,6 +36,54 @@
         Ok(event_count)
     }
 
+    // Test handler for PushParser conformance tests
+    struct ConformanceTestHandler {
+        event_count: usize,
+    }
+
+    impl<'a, 'b> PushParserHandler<'a, 'b, ParseError> for ConformanceTestHandler {
+        fn handle_event(&mut self, _event: Event<'a, 'b>) -> Result<(), ParseError> {
+            self.event_count += 1;
+            Ok(())
+        }
+    }
+
+    fn run_push_parser_test(json_content: &str) -> Result<usize, ParseError> {
+        let mut buffer = [0u8; 2048]; // Larger buffer for pass1.json
+        let handler = ConformanceTestHandler { event_count: 0 };
+        let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer);
+
+        let to_parse_error = |e: PushParseError<ParseError>| match e {
+            PushParseError::Parse(parse_err) => parse_err,
+            PushParseError::Handler(handler_err) => handler_err,
+        };
+
+        parser
+            .write(json_content.as_bytes())
+            .map_err(to_parse_error)?;
+
+        let handler = parser.finish::<ParseError>().map_err(to_parse_error)?;
+        Ok(handler.event_count)
+    }
+
+    fn run_stream_parser_test(json_content: &str) -> Result<usize, ParseError> {
+        let reader = ChunkReader::full_slice(json_content.as_bytes());
+        let mut buffer = [0u8; 2048]; // Larger buffer for pass1.json
+        let mut parser = StreamParser::<_, DefaultConfig>::new(reader, &mut buffer);
+        let mut event_count = 0;
+
+        loop {
+            match parser.next_event() {
+                Ok(Event::EndDocument) => break,
+ Ok(_event) => { + event_count += 1; + } + Err(e) => return Err(e), + } + } + Ok(event_count) + } + fn load_test_file(filename: &str) -> String { let manifest_dir = std::env::var("CARGO_MANIFEST_DIR").unwrap_or_else(|_| ".".to_string()); let path = Path::new(&manifest_dir) @@ -85,6 +136,90 @@ mod json_checker_tests { result.err() ); } + + // PushParser conformance tests + #[test] + fn test_push_parser_pass1_comprehensive() { + let content = load_test_file("pass1.json"); + let result = run_push_parser_test(&content); + assert!( + result.is_ok(), + "PushParser: pass1.json should parse successfully but failed: {:?}", + result.err() + ); + + // pass1.json is a comprehensive test with many JSON features + let event_count = result.unwrap(); + assert!( + event_count > 50, + "PushParser: pass1.json should generate substantial events, got: {}", + event_count + ); + } + + #[test] + fn test_push_parser_pass2_deep_nesting() { + let content = load_test_file("pass2.json"); + let result = run_push_parser_test(&content); + assert!( + result.is_ok(), + "PushParser: pass2.json (deep nesting) should parse successfully but failed: {:?}", + result.err() + ); + } + + #[test] + fn test_push_parser_pass3_simple_object() { + let content = load_test_file("pass3.json"); + let result = run_push_parser_test(&content); + assert!( + result.is_ok(), + "PushParser: pass3.json (simple object) should parse successfully but failed: {:?}", + result.err() + ); + } + + // StreamParser conformance tests with logging + #[test] + fn test_stream_parser_pass1_comprehensive() { + let content = load_test_file("pass1.json"); + let result = run_stream_parser_test(&content); + assert!( + result.is_ok(), + "StreamParser: pass1.json should parse successfully but failed: {:?}", + result.err() + ); + + // pass1.json is a comprehensive test with many JSON features + let event_count = result.unwrap(); + assert!( + event_count > 50, + "StreamParser: pass1.json should generate substantial events, got: {}", + event_count 
+        );
+    }
+
+    #[test]
+    fn test_stream_parser_pass2_deep_nesting() {
+        let content = load_test_file("pass2.json");
+        let result = run_stream_parser_test(&content);
+        assert!(
+            result.is_ok(),
+            "StreamParser: pass2.json (deep nesting) should parse successfully but failed: {:?}",
+            result.err()
+        );
+    }
+
+    #[test]
+    fn test_stream_parser_pass3_simple_object() {
+        let content = load_test_file("pass3.json");
+        let result = run_stream_parser_test(&content);
+        assert!(
+            result.is_ok(),
+            "StreamParser: pass3.json (simple object) should parse successfully but failed: {:?}",
+            result.err()
+        );
+    }
 }
 
 // Indices of fail*.json files that should fail to parse (excluding known deviations)
@@ -122,6 +257,33 @@
         2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 26,
         27, 28, 29, 30, 31, 32, 33
     );
+
+    macro_rules! generate_push_parser_fail_tests {
+        ($($num:expr),*) => {
+            $(
+                paste::paste! {
+                    #[test]
+                    fn [<test_push_parser_fail $num>]() {
+                        let content = load_test_file(&format!("fail{}.json", $num));
+                        let result = run_push_parser_test(&content);
+                        assert!(
+                            result.is_err(),
+                            "PushParser: fail{}.json should fail to parse but succeeded with {} events. Content: {:?}",
+                            $num,
+                            result.unwrap_or(0),
+                            content
+                        );
+                    }
+                }
+            )*
+        };
+    }
+
+    // Generate PushParser test cases for the same 31 fail*.json files
+    generate_push_parser_fail_tests!(
+        2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 26,
+        27, 28, 29, 30, 31, 32, 33
+    );
 }
 
 mod known_deviations {
@@ -146,6 +308,27 @@
             "fail18.json is expected to pass because the non-recursive parser handles deep nesting."
); } + + // PushParser known deviations - should match SliceParser behavior + #[test] + fn test_push_parser_fail1_root_string_allowed() { + let content = load_test_file("fail1.json"); + let result = run_push_parser_test(&content); + assert!( + result.is_ok(), + "PushParser: fail1.json is expected to pass because modern JSON (RFC 7159) allows scalar root values." + ); + } + + #[test] + fn test_push_parser_fail18_deep_nesting_supported() { + let content = load_test_file("fail18.json"); + let result = run_push_parser_test(&content); + assert!( + result.is_ok(), + "PushParser: fail18.json is expected to pass because the non-recursive parser handles deep nesting." + ); + } } #[test] diff --git a/picojson/tests/push_parser.rs b/picojson/tests/push_parser.rs new file mode 100644 index 0000000..18665cc --- /dev/null +++ b/picojson/tests/push_parser.rs @@ -0,0 +1,716 @@ +// SPDX-License-Identifier: Apache-2.0 + +// Push parser tests for the integrated escape handling functionality +#[cfg(test)] +mod tests { + use picojson::{ + DefaultConfig, Event, ParseError, PullParser, PushParser, PushParserHandler, SliceParser, + }; + + // Simple test handler for the clean implementation + struct SimpleHandler; + + impl<'a, 'b> PushParserHandler<'a, 'b, ParseError> for SimpleHandler { + fn handle_event(&mut self, _event: Event<'a, 'b>) -> Result<(), ParseError> { + Ok(()) + } + } + + #[test] + fn test_clean_push_parser_compiles() { + let mut buffer = [0u8; 256]; + let handler = SimpleHandler; + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + + // This should compile without lifetime issues using HRTB + tokenizer + event array + parser.write(b"true").unwrap(); // Valid JSON + let _handler = parser.finish::().unwrap(); + } + + #[test] + fn test_hrtb_pattern_with_scratch_buffer() { + // Handler that captures events to verify HRTB works + struct CapturingHandler { + event_count: usize, + } + + impl<'a, 'b> PushParserHandler<'a, 'b, ParseError> for 
CapturingHandler { + fn handle_event(&mut self, event: Event<'a, 'b>) -> Result<(), ParseError> { + self.event_count += 1; + match event { + Event::String(s) => { + // Both String::Borrowed and String::Unescaped should work + assert_eq!(s.as_ref(), "hello"); // From input or StreamBuffer via HRTB! + } + Event::EndDocument => { + // Expected + } + _ => panic!("Unexpected event: {:?}", event), + } + Ok(()) + } + } + + let mut buffer = [0u8; 256]; + let handler = CapturingHandler { event_count: 0 }; + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + + // Test tokenizer + HRTB integration with real JSON + parser.write(b"\"hello\"").unwrap(); // This should trigger String Begin event -> Unescaped processing + let handler = parser.finish::().unwrap(); + + // Verify events were processed + assert_eq!(handler.event_count, 2); // String + EndDocument + } + + #[test] + fn test_string_borrowed() { + // Handler that captures strings for verification + struct StringHandler { + string_content: Option, // Use std::string::String to avoid lifetime issues + } + + impl<'a, 'b> PushParserHandler<'a, 'b, ParseError> for StringHandler { + fn handle_event(&mut self, event: Event<'a, 'b>) -> Result<(), ParseError> { + match event { + Event::String(s) => { + // Capture the actual string content for verification + self.string_content = Some(s.as_ref().to_owned()); + Ok(()) + } + Event::EndDocument => Ok(()), + _ => Ok(()), // Ignore other events + } + } + } + + let mut buffer = [0u8; 256]; + let handler = StringHandler { + string_content: None, + }; + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + + // Test simple string extraction - this should extract "test" from the input + parser.write(br#""test""#).unwrap(); + let handler = parser.finish::().unwrap(); + + // SUCCESS: Verify we extracted the actual string content! 
+ assert_eq!( + handler.string_content, + Some("test".to_owned()), + "Should extract 'test' from input \"test\"" + ); + } + + #[test] + fn test_keys() { + // Debug handler that captures ALL events including keys + struct KeyTestHandler { + events: Vec, + } + + impl<'a, 'b> PushParserHandler<'a, 'b, ParseError> for KeyTestHandler { + fn handle_event(&mut self, event: Event<'a, 'b>) -> Result<(), ParseError> { + let event_desc = match event { + Event::StartObject => "StartObject".to_string(), + Event::EndObject => "EndObject".to_string(), + Event::Key(k) => format!("Key({})", k.as_ref()), + Event::String(s) => format!("String({})", s.as_ref()), + Event::Bool(b) => format!("Bool({})", b), + Event::EndDocument => "EndDocument".to_string(), + _ => "Other".to_string(), + }; + self.events.push(event_desc); + Ok(()) + } + } + + let mut buffer = [0u8; 256]; + let handler = KeyTestHandler { events: Vec::new() }; + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + + // Test object with key-value pair + parser.write(br#"{"name": "value"}"#).unwrap(); + let handler = parser.finish::().unwrap(); + + // Verify we captured all object events correctly + + // Should see: [StartObject, Key(name), String(value), EndObject, EndDocument] + assert_eq!( + handler.events, + vec![ + "StartObject".to_string(), + "Key(name)".to_string(), + "String(value)".to_string(), + "EndObject".to_string(), + "EndDocument".to_string() + ] + ); + } + + #[test] + fn test_simple_escapes() { + // Debug handler that captures strings and keys to test escape processing + struct EscapeTestHandler { + events: Vec, + } + + impl<'a, 'b> PushParserHandler<'a, 'b, ParseError> for EscapeTestHandler { + fn handle_event(&mut self, event: Event<'a, 'b>) -> Result<(), ParseError> { + let event_desc = match event { + Event::StartObject => "StartObject".to_string(), + Event::EndObject => "EndObject".to_string(), + Event::Key(k) => format!("Key({})", k.as_ref()), + Event::String(s) => 
format!("String({})", s.as_ref()), + Event::EndDocument => "EndDocument".to_string(), + _ => "Other".to_string(), + }; + self.events.push(event_desc); + Ok(()) + } + } + + let mut buffer = [0u8; 256]; + let handler = EscapeTestHandler { events: Vec::new() }; + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + + // Test string with actual escape sequence (\n should become newline) + parser.write(b"{\"key\": \"hello\\nworld\"}").unwrap(); + let handler = parser.finish::().unwrap(); + + // Verify escape sequence was processed correctly + + // Should see the escaped newline processed correctly + assert_eq!( + handler.events, + vec![ + "StartObject".to_string(), + "Key(key)".to_string(), + "String(hello\nworld)".to_string(), // \n in JSON becomes actual newline character + "EndObject".to_string(), + "EndDocument".to_string() + ] + ); + } + + #[test] + fn test_unicode_escapes() { + // Debug handler that captures strings and keys to test Unicode escape processing + struct UnicodeEscapeTestHandler { + events: Vec, + } + + impl<'a, 'b> PushParserHandler<'a, 'b, ParseError> for UnicodeEscapeTestHandler { + fn handle_event(&mut self, event: Event<'a, 'b>) -> Result<(), ParseError> { + let event_desc = match event { + Event::StartObject => "StartObject".to_string(), + Event::EndObject => "EndObject".to_string(), + Event::Key(k) => format!("Key({})", k.as_ref()), + Event::String(s) => format!("String({})", s.as_ref()), + Event::EndDocument => "EndDocument".to_string(), + _ => "Other".to_string(), + }; + self.events.push(event_desc); + Ok(()) + } + } + + let mut buffer = [0u8; 256]; + let handler = UnicodeEscapeTestHandler { events: Vec::new() }; + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + + // Test string with Unicode escape sequence (\u0041 should become 'A') + parser.write(br#"{"key": "\u0041"}"#).unwrap(); + let handler = parser.finish::().unwrap(); + + // Verify Unicode escape sequence was processed correctly + 
+ // Should see the Unicode escape processed correctly: \u0041 → A + assert_eq!( + handler.events, + vec![ + "StartObject".to_string(), + "Key(key)".to_string(), + "String(A)".to_string(), // \u0041 should be converted to 'A' + "EndObject".to_string(), + "EndDocument".to_string() + ] + ); + } + + #[test] + fn test_consecutive_unicode_escapes() { + // Debug handler that captures strings and keys to test consecutive Unicode escapes + struct ConsecutiveUnicodeTestHandler { + events: Vec, + } + + impl<'input, 'scratch> PushParserHandler<'input, 'scratch, ParseError> + for ConsecutiveUnicodeTestHandler + { + fn handle_event(&mut self, event: Event<'input, 'scratch>) -> Result<(), ParseError> { + match event { + Event::StartObject => self.events.push("StartObject".to_string()), + Event::EndObject => self.events.push("EndObject".to_string()), + Event::Key(key) => self.events.push(format!("Key({})", key)), + Event::String(s) => self.events.push(format!("String({})", s)), + Event::EndDocument => self.events.push("EndDocument".to_string()), + _ => {} + } + Ok(()) + } + } + + let mut buffer = [0u8; 256]; + let handler = ConsecutiveUnicodeTestHandler { events: Vec::new() }; + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + + // Test string with mixed escapes like in pass1.json line 45 + parser.write(br#"{"key": "\uCAFE\uBABE"}"#).unwrap(); + let handler = parser.finish::().unwrap(); + + // Verify consecutive Unicode escapes were processed correctly + + // Should see both Unicode escapes processed correctly + assert_eq!( + handler.events, + vec![ + "StartObject".to_string(), + "Key(key)".to_string(), + "String(쫾몾)".to_string(), // \uCAFE\uBABE should be decoded to consecutive Unicode characters + "EndObject".to_string(), + "EndDocument".to_string() + ] + ); + } + + // Debug test for tracing PushParser with pass1.json problematic lines + #[test] + fn test_push_parser_pass1_specific_lines() { + struct TraceHandler { + events: Vec, + } + + 
impl<'input, 'scratch> PushParserHandler<'input, 'scratch, ParseError> for TraceHandler { + fn handle_event(&mut self, event: Event<'input, 'scratch>) -> Result<(), ParseError> { + match event { + Event::String(s) => { + self.events.push(format!("String({})", s.as_ref())); + } + Event::Key(key) => { + self.events.push(format!("Key({})", key.as_ref())); + } + _ => {} + } + Ok(()) + } + } + + // Test line 28 from pass1.json first + let line_28 = r#"{"hex": "\\u0123\\u4567\\u89AB\\uCDEF\\uabcd\\uef4A"}"#; + + let mut buffer = [0u8; 1024]; + let handler = TraceHandler { events: Vec::new() }; + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + + assert_eq!(parser.write(line_28.as_bytes()), Ok(())); + assert!(parser.finish::().is_ok()); + + // Test line 45 from pass1.json (the longer one we tested before) + let line_45 = r#""\\/\\\\\\\"\\uCAFE\\uBABE\\uAB98\\uFCDE\\ubcda\\uef4A\\b\\f\\n\\r\\t`1~!@#$%^&*()_+-=[]{}|;:',./<>?""#; + + let mut buffer2 = [0u8; 1024]; + let handler2 = TraceHandler { events: Vec::new() }; + let mut parser2 = PushParser::<_, DefaultConfig>::new(handler2, &mut buffer2); + + assert_eq!(parser2.write(line_45.as_bytes()), Ok(())); + assert!(parser2.finish::().is_ok()); + } + + // Test larger section of pass1.json to find what causes InvalidSliceBounds + #[test] + fn test_push_parser_pass1_larger_section() { + struct TraceHandler { + events: Vec, + } + + impl<'input, 'scratch> PushParserHandler<'input, 'scratch, ParseError> for TraceHandler { + fn handle_event(&mut self, event: Event<'input, 'scratch>) -> Result<(), ParseError> { + match event { + Event::String(s) => { + self.events + .push(format!("String({} chars)", s.as_ref().len())); + } + Event::Key(key) => { + self.events.push(format!("Key({})", key.as_ref())); + } + _ => {} + } + Ok(()) + } + } + + // Test a larger section from pass1.json that includes the problematic areas + #[cfg(feature = "float")] + let larger_section = r#"{ + "integer": 1234567890, + "real": 
-9876.543210, + "e": 0.123456789e-12, + "E": 1.234567890E+34, + "": 23456789012E66, + "zero": 0, + "one": 1, + "space": " ", + "quote": "\"", + "backslash": "\\", + "controls": "\\b\\f\\n\\r\\t", + "slash": "/ & \/", + "alpha": "abcdefghijklmnopqrstuvwyz", + "ALPHA": "ABCDEFGHIJKLMNOPQRSTUVWYZ", + "digit": "0123456789", + "0123456789": "digit", + "special": "`1~!@#$%^&*()_+-={':[,]}|;.?", + "hex": "\\u0123\\u4567\\u89AB\\uCDEF\\uabcd\\uef4A", + "true": true, + "false": false, + "null": null + }"#; + + #[cfg(not(feature = "float"))] + let larger_section = r#"{ + "integer": 1234567890, + "zero": 0, + "one": 1, + "space": " ", + "quote": "\"", + "backslash": "\\", + "controls": "\\b\\f\\n\\r\\t", + "slash": "/ & \/", + "alpha": "abcdefghijklmnopqrstuvwyz", + "ALPHA": "ABCDEFGHIJKLMNOPQRSTUVWYZ", + "digit": "0123456789", + "0123456789": "digit", + "special": "`1~!@#$%^&*()_+-={':[,]}|;.?", + "hex": "\\u0123\\u4567\\u89AB\\uCDEF\\uabcd\\uef4A", + "true": true, + "false": false, + "null": null + }"#; + + let mut buffer = [0u8; 2048]; // Larger buffer + let handler = TraceHandler { events: Vec::new() }; + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + + assert_eq!(parser.write(larger_section.as_bytes()), Ok(())); + assert!(parser.finish::().is_ok()); + } + + // Test how parsers handle empty keys like in pass1.json + #[test] + fn test_empty_key_handling() { + // Test the exact pattern from pass1.json line 15 + let empty_key_json = r#"{"": 123}"#; + + // Test SliceParser first + let mut buffer = [0u8; 256]; + let mut slice_parser = SliceParser::with_buffer(empty_key_json, &mut buffer); + + match slice_parser.next_event() { + Ok(Event::StartObject) => {} + other => panic!("Expected StartObject event, got {:?}", other), + } + + match slice_parser.next_event() { + Ok(Event::Key(k)) => assert_eq!(k.as_ref(), "", "Empty key should be empty string"), + other => panic!("Expected Key event, got {:?}", other), + } + + // Test PushParser + + struct 
EmptyKeyHandler { + events: Vec, + } + + impl<'input, 'scratch> PushParserHandler<'input, 'scratch, ParseError> for EmptyKeyHandler { + fn handle_event(&mut self, event: Event<'input, 'scratch>) -> Result<(), ParseError> { + match event { + Event::Key(k) => { + self.events.push(format!("Key({})", k.as_ref())); + } + Event::Number(n) => { + self.events.push(format!("Number({})", n.as_str())); + } + _ => {} + } + Ok(()) + } + } + + let mut buffer2 = [0u8; 256]; + let handler = EmptyKeyHandler { events: Vec::new() }; + let mut push_parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer2); + + assert_eq!(push_parser.write(empty_key_json.as_bytes()), Ok(())); + + let handler = push_parser.finish::().unwrap(); + assert_eq!( + handler.events, + vec!["Key()".to_string(), "Number(123)".to_string()], + "PushParser should capture empty key and number value" + ); + } + + #[test] + fn test_numbers() { + // Debug handler that captures numbers to test number processing + struct NumberTestHandler { + events: Vec, + } + + impl<'a, 'b> PushParserHandler<'a, 'b, ParseError> for NumberTestHandler { + fn handle_event(&mut self, event: Event<'a, 'b>) -> Result<(), ParseError> { + let event_desc = match event { + Event::StartArray => "StartArray".to_string(), + Event::EndArray => "EndArray".to_string(), + Event::StartObject => "StartObject".to_string(), + Event::EndObject => "EndObject".to_string(), + Event::Key(k) => format!("Key({})", k.as_ref()), + Event::String(s) => format!("String({})", s.as_ref()), + Event::Number(n) => format!("Number({})", n.as_str()), + Event::Bool(b) => format!("Bool({})", b), + Event::Null => "Null".to_string(), + Event::EndDocument => "EndDocument".to_string(), + }; + self.events.push(event_desc); + Ok(()) + } + } + + let mut buffer = [0u8; 256]; + let handler = NumberTestHandler { events: Vec::new() }; + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + + // Test object with various number types + #[cfg(feature = 
"float")] + let json_input = br#"{"int": 42, "float": 3.14, "negative": -123}"#; + #[cfg(not(feature = "float"))] + let json_input = br#"{"int": 42, "negative": -123}"#; + + parser.write(json_input).unwrap(); + let handler = parser.finish::().unwrap(); + + // Verify number events were captured correctly + + // Should see all number types processed correctly + #[cfg(feature = "float")] + let expected = vec![ + "StartObject".to_string(), + "Key(int)".to_string(), + "Number(42)".to_string(), + "Key(float)".to_string(), + "Number(3.14)".to_string(), + "Key(negative)".to_string(), + "Number(-123)".to_string(), + "EndObject".to_string(), + "EndDocument".to_string(), + ]; + + #[cfg(not(feature = "float"))] + let expected = vec![ + "StartObject".to_string(), + "Key(int)".to_string(), + "Number(42)".to_string(), + "Key(negative)".to_string(), + "Number(-123)".to_string(), + "EndObject".to_string(), + "EndDocument".to_string(), + ]; + + assert_eq!(handler.events, expected); + } + + #[test] + fn test_single_slash_escape() { + struct Handler { + events: Vec, + } + + impl<'input, 'scratch> PushParserHandler<'input, 'scratch, ParseError> for Handler { + fn handle_event(&mut self, event: Event<'input, 'scratch>) -> Result<(), ParseError> { + match event { + Event::String(s) => self.events.push(format!("String({})", s)), + _ => {} + } + Ok(()) + } + } + + let mut buffer = [0u8; 64]; + let handler = Handler { events: Vec::new() }; + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + + // Test just \/ + parser.write(br#""\/""#).unwrap(); + let handler = parser.finish::().unwrap(); + + // Verify single slash escape was processed correctly + // Should be: ["String(/)"] + assert_eq!(handler.events, vec!["String(/)".to_string()]); + } + + #[test] + fn test_invalid_unicode_escape_incomplete() { + struct Handler; + + impl<'input, 'scratch> PushParserHandler<'input, 'scratch, ParseError> for Handler { + fn handle_event( + &mut self, + _event: 
picojson::Event<'input, 'scratch>, + ) -> Result<(), ParseError> { + Ok(()) + } + } + + let mut buffer = [0u8; 64]; + let handler = Handler; + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + + // Test incomplete Unicode escape (missing hex digits) + let write_result = parser.write(br#""\u004""#); + if write_result.is_ok() { + // If write succeeds, the error should be caught in finish + let finish_result = parser.finish::(); + assert!( + finish_result.is_err(), + "Incomplete Unicode escape should fail during finish" + ); + } else { + // If write fails, that's also acceptable for incomplete escape + assert!( + write_result.is_err(), + "Incomplete Unicode escape should fail" + ); + } + } + + #[test] + fn test_invalid_unicode_escape_invalid_hex() { + struct Handler; + + impl<'input, 'scratch> PushParserHandler<'input, 'scratch, ParseError> for Handler { + fn handle_event( + &mut self, + _event: picojson::Event<'input, 'scratch>, + ) -> Result<(), ParseError> { + Ok(()) + } + } + + let mut buffer = [0u8; 64]; + let handler = Handler; + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + + // Test invalid hex character in Unicode escape + let result = parser.write(br#""\u004G""#); + assert!( + result.is_err(), + "Invalid hex character in Unicode escape should fail" + ); + } + + #[test] + fn test_invalid_unicode_escape_in_key() { + struct Handler; + + impl<'input, 'scratch> PushParserHandler<'input, 'scratch, ParseError> for Handler { + fn handle_event( + &mut self, + _event: picojson::Event<'input, 'scratch>, + ) -> Result<(), ParseError> { + Ok(()) + } + } + + let mut buffer = [0u8; 64]; + let handler = Handler; + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + + // Test invalid Unicode escape in object key + let result = parser.write(br#"{"\u004Z": "value"}"#); + assert!(result.is_err(), "Invalid Unicode escape in key should fail"); + } + + #[test] + fn 
test_mixed_borrowed_and_unescaped_strings() { + struct Handler { + events: Vec, + } + + impl<'input, 'scratch> PushParserHandler<'input, 'scratch, ParseError> for Handler { + fn handle_event(&mut self, event: Event<'input, 'scratch>) -> Result<(), ParseError> { + match event { + Event::String(s) => { + let content = s.as_ref().to_string(); + let string_type = match s { + picojson::String::Borrowed(_) => "Borrowed", + picojson::String::Unescaped(_) => "Unescaped", + }; + self.events.push(format!("{}({})", string_type, content)); + } + Event::Key(k) => { + let content = k.as_ref().to_string(); + let key_type = match k { + picojson::String::Borrowed(_) => "BorrowedKey", + picojson::String::Unescaped(_) => "UnescapedKey", + }; + self.events.push(format!("{}({})", key_type, content)); + } + _ => {} + } + Ok(()) + } + } + + let mut buffer = [0u8; 256]; + let handler = Handler { events: Vec::new() }; + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + + // Test object with both borrowed (simple) and unescaped (with escapes) strings + parser + .write(br#"{"simple": "value", "escaped": "hello\\nworld"}"#) + .unwrap(); + let handler = parser.finish::().unwrap(); + + // Verify we have both borrowed and unescaped string types + let has_borrowed = handler.events.iter().any(|e| e.starts_with("Borrowed")); + let has_unescaped = handler.events.iter().any(|e| e.starts_with("Unescaped")); + + assert!(has_borrowed, "Should have at least one borrowed string"); + assert!(has_unescaped, "Should have at least one unescaped string"); + } + + #[test] + fn test_invalid_escape_sequences_in_keys() { + struct Handler; + + impl<'input, 'scratch> PushParserHandler<'input, 'scratch, ParseError> for Handler { + fn handle_event( + &mut self, + _event: picojson::Event<'input, 'scratch>, + ) -> Result<(), ParseError> { + Ok(()) + } + } + + let mut buffer = [0u8; 64]; + let handler = Handler; + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + + 
// Test invalid escape sequence in object key (\x is not valid JSON) + let result = parser.write(br#"{"\x41": "value"}"#); + assert!( + result.is_err(), + "Invalid escape sequence in key should fail" + ); + } +} diff --git a/picojson/tests/push_parser_copy_on_escape.rs b/picojson/tests/push_parser_copy_on_escape.rs new file mode 100644 index 0000000..657b7e6 --- /dev/null +++ b/picojson/tests/push_parser_copy_on_escape.rs @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Test for PushParser copy-on-escape optimization (no_std compliant) + +use picojson::{DefaultConfig, Event, ParseError, PushParser, PushParserHandler, String}; + +#[test] +fn test_borrowed_vs_unescaped_simple() { + // Test simple case: both strings should be borrowed (no escapes) + struct SimpleHandler { + key_is_borrowed: Option, + value_is_borrowed: Option, + } + + impl<'input, 'scratch> PushParserHandler<'input, 'scratch, ParseError> for SimpleHandler { + fn handle_event(&mut self, event: Event<'input, 'scratch>) -> Result<(), ParseError> { + match event { + Event::Key(s) => { + self.key_is_borrowed = Some(matches!(s, String::Borrowed(_))); + } + Event::String(s) => { + self.value_is_borrowed = Some(matches!(s, String::Borrowed(_))); + } + _ => {} + } + Ok(()) + } + } + + let mut buffer = [0u8; 1024]; + let handler = SimpleHandler { + key_is_borrowed: None, + value_is_borrowed: None, + }; + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + + parser.write(br#"{"foo": "bar"}"#).unwrap(); + + let handler = parser.finish::().unwrap(); + + // Both should be borrowed since no escapes + assert_eq!( + handler.key_is_borrowed, + Some(true), + "Key 'foo' should be String::Borrowed" + ); + assert_eq!( + handler.value_is_borrowed, + Some(true), + "Value 'bar' should be String::Borrowed" + ); +} + +#[test] +fn test_borrowed_vs_unescaped_with_escapes() { + // Test with escapes: should be unescaped + struct EscapeHandler { + key_is_borrowed: Option, + 
value_is_borrowed: Option, + } + + impl<'input, 'scratch> PushParserHandler<'input, 'scratch, ParseError> for EscapeHandler { + fn handle_event(&mut self, event: Event<'input, 'scratch>) -> Result<(), ParseError> { + match event { + Event::Key(s) => { + self.key_is_borrowed = Some(matches!(s, String::Borrowed(_))); + } + Event::String(s) => { + self.value_is_borrowed = Some(matches!(s, String::Borrowed(_))); + } + _ => {} + } + Ok(()) + } + } + + let mut buffer = [0u8; 1024]; + let handler = EscapeHandler { + key_is_borrowed: None, + value_is_borrowed: None, + }; + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + + parser.write(br#"{"key\\n": "val\\t"}"#).unwrap(); + let handler = parser.finish::().unwrap(); + + // Both should be unescaped since they have escape sequences + assert_eq!( + handler.key_is_borrowed, + Some(false), + "Key with escape should be String::Unescaped" + ); + assert_eq!( + handler.value_is_borrowed, + Some(false), + "Value with escape should be String::Unescaped" + ); +} + +#[test] +fn test_buffer_isolation() { + // Test that strings don't accumulate content from previous strings + struct ContentChecker { + first_string: Option<[u8; 32]>, + first_len: usize, + second_string: Option<[u8; 32]>, + second_len: usize, + count: usize, + } + + impl<'input, 'scratch> PushParserHandler<'input, 'scratch, ParseError> for ContentChecker { + fn handle_event(&mut self, event: Event<'input, 'scratch>) -> Result<(), ParseError> { + match event { + Event::Key(s) | Event::String(s) => { + let bytes = s.as_ref().as_bytes(); + if self.count == 0 { + // First string + let mut buf = [0u8; 32]; + let len = bytes.len().min(32); + buf[..len].copy_from_slice(&bytes[..len]); + self.first_string = Some(buf); + self.first_len = len; + } else if self.count == 1 { + // Second string + let mut buf = [0u8; 32]; + let len = bytes.len().min(32); + buf[..len].copy_from_slice(&bytes[..len]); + self.second_string = Some(buf); + self.second_len = len; + 
} + self.count += 1; + } + _ => {} + } + Ok(()) + } + } + + let mut buffer = [0u8; 1024]; + let handler = ContentChecker { + first_string: None, + first_len: 0, + second_string: None, + second_len: 0, + count: 0, + }; + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + + // Test: simple string followed by escaped string + parser.write(br#"{"simple": "esc\\n"}"#).unwrap(); + let handler = parser.finish::().unwrap(); + + // Verify first string is "simple" + assert!(handler.first_string.is_some()); + let first = &handler.first_string.unwrap()[..handler.first_len]; + assert_eq!(first, b"simple", "First string should be 'simple'"); + + // Verify second string: JSON "esc\\n" (double backslash) becomes "esc\n" (single backslash + n) + // This is correct behavior - double backslash in JSON becomes single backslash in string + assert!(handler.second_string.is_some()); + let second = &handler.second_string.unwrap()[..handler.second_len]; + assert_eq!( + second, b"esc\\n", + "JSON \"esc\\\\n\" should become string \"esc\\n\" (literal backslash + n, not newline)" + ); +} diff --git a/picojson/tests/push_parser_escapes.rs b/picojson/tests/push_parser_escapes.rs new file mode 100644 index 0000000..66459c6 --- /dev/null +++ b/picojson/tests/push_parser_escapes.rs @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: Apache-2.0 + +use picojson::{DefaultConfig, Event, ParseError, PushParser, PushParserHandler}; + +/// Simple test handler that collects events as debug strings +struct EventCollector { + events: Vec, +} + +impl EventCollector { + fn new() -> Self { + Self { events: Vec::new() } + } +} + +impl<'a, 'b> PushParserHandler<'a, 'b, ParseError> for EventCollector { + fn handle_event(&mut self, event: Event<'a, 'b>) -> Result<(), ParseError> { + let event_desc = match event { + Event::StartObject => "StartObject".to_string(), + Event::EndObject => "EndObject".to_string(), + Event::StartArray => "StartArray".to_string(), + Event::EndArray => 
"EndArray".to_string(), + Event::Bool(b) => format!("Bool({})", b), + Event::Null => "Null".to_string(), + Event::EndDocument => "EndDocument".to_string(), + Event::Key(k) => format!("Key({})", k.as_ref()), + Event::String(s) => format!("String({})", s.as_ref()), + Event::Number(n) => format!("Number({})", n.as_str()), + }; + self.events.push(event_desc); + Ok(()) + } +} + +#[test] +fn test_string_with_actual_escapes() { + // Test that escape sequences in strings are properly processed + let json_string = "{\"message\": \"Hello\\nWorld\\t!\"}"; + let json = json_string.as_bytes(); + + let handler = EventCollector::new(); + let mut buffer = [0u8; 256]; + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + + parser.write(json).unwrap(); + let handler = parser.finish::().unwrap(); + + let expected = vec![ + "StartObject".to_string(), + "Key(message)".to_string(), + // Escape sequences \\n and \\t should be converted to actual newline and tab + "String(Hello\nWorld\t!)".to_string(), + "EndObject".to_string(), + "EndDocument".to_string(), + ]; + + assert_eq!(handler.events, expected); +} + +#[test] +fn test_quote_escape() { + // Test with a quote escape sequence + let json_string = r#"{"test": "quote\"here"}"#; + let json = json_string.as_bytes(); + + let handler = EventCollector::new(); + let mut buffer = [0u8; 256]; + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + + parser.write(json).unwrap(); + let handler = parser.finish::().unwrap(); + + let expected = vec![ + "StartObject".to_string(), + "Key(test)".to_string(), + // The \" should be converted to an actual quote character + "String(quote\"here)".to_string(), + "EndObject".to_string(), + "EndDocument".to_string(), + ]; + + assert_eq!(handler.events, expected); +} + +#[test] +fn test_escaped_key_with_newline() { + // Test key with literal backslash-n characters (not escape sequence) + let json_string = r#"{"ke\\ny": "value"}"#; + let json = 
json_string.as_bytes(); + + let handler = EventCollector::new(); + let mut buffer = [0u8; 256]; + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + parser.write(json).unwrap(); + let handler = parser.finish::().unwrap(); + + let expected = vec![ + "StartObject".to_string(), + // This key contains literal backslash+n chars (not escape sequence) - correct behavior + "Key(ke\\ny)".to_string(), + "String(value)".to_string(), + "EndObject".to_string(), + "EndDocument".to_string(), + ]; + + assert_eq!(handler.events, expected); +} + +#[test] +fn test_actual_key_escape_sequence() { + // Test key with ACTUAL escape sequence: \n becomes newline character + let json_string = r#"{"ke\ny": "value"}"#; // JSON with actual \n escape sequence + let json = json_string.as_bytes(); + + let handler = EventCollector::new(); + let mut buffer = [0u8; 256]; + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + parser.write(json).unwrap(); + let handler = parser.finish::().unwrap(); + + let expected = vec![ + "StartObject".to_string(), + // Key escape processing should convert \n to actual newline + "Key(ke\ny)".to_string(), + "String(value)".to_string(), + "EndObject".to_string(), + "EndDocument".to_string(), + ]; + + assert_eq!(handler.events, expected); +} + +#[test] +fn test_unicode_escapes() { + // Test that Unicode escape sequences are properly decoded + let json = br#"["\u0041\u0042\u0043"]"#; + + let mut buffer = [0u8; 64]; + let handler = EventCollector::new(); + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + + parser.write(json).unwrap(); + let handler = parser.finish::().unwrap(); + + let expected = vec![ + "StartArray".to_string(), + "String(ABC)".to_string(), // \u0041\u0042\u0043 should decode to ABC + "EndArray".to_string(), + "EndDocument".to_string(), + ]; + + assert_eq!(handler.events, expected); +} + +#[test] +fn test_escaped_key_with_quote() { + // Test key with quote escape - key 
"quo\"te" with value "data" + let json_string = r#"{"quo\"te": "data"}"#; + let json = json_string.as_bytes(); + + let handler = EventCollector::new(); + let mut buffer = [0u8; 256]; + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + parser.write(json).unwrap(); + let handler = parser.finish::().unwrap(); + + let expected = vec![ + "StartObject".to_string(), + // Key with quote escape should be processed correctly + "Key(quo\"te)".to_string(), + "String(data)".to_string(), + "EndObject".to_string(), + "EndDocument".to_string(), + ]; + + assert_eq!(handler.events, expected); +} diff --git a/picojson/tests/push_parser_invalidslicebounds_repro.rs b/picojson/tests/push_parser_invalidslicebounds_repro.rs new file mode 100644 index 0000000..14802b2 --- /dev/null +++ b/picojson/tests/push_parser_invalidslicebounds_repro.rs @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Minimal reproduction test for InvalidSliceBounds buffer boundary tracking issue +//! This test aims to reproduce the exact same error that occurs in pass1.json parsing + +use picojson::{DefaultConfig, Event, ParseError, PushParser, PushParserHandler}; + +/// Handler that compares events immediately as they arrive for detailed validation +struct ReproHandler<'expected> { + expected_events: &'expected [&'expected str], + current_index: usize, +} + +impl<'expected> ReproHandler<'expected> { + fn new(expected_events: &'expected [&'expected str]) -> Self { + Self { + expected_events, + current_index: 0, + } + } + + fn assert_complete(&self) { + assert_eq!( + self.current_index, + self.expected_events.len(), + "Expected {} events, but only received {}", + self.expected_events.len(), + self.current_index + ); + } + + fn assert_event_matches(&mut self, received: &Event) { + assert!( + self.current_index < self.expected_events.len(), + "Received more events than expected. 
Got event at index {} but only expected {} events total", + self.current_index, + self.expected_events.len() + ); + + let expected_str = self.expected_events[self.current_index]; + let received_str = self.event_to_string(received); + + assert_eq!( + expected_str, received_str, + "Event mismatch at index {}", + self.current_index + ); + + self.current_index += 1; + } + + fn event_to_string(&self, event: &Event) -> String { + match event { + Event::StartObject => "StartObject".to_string(), + Event::EndObject => "EndObject".to_string(), + Event::StartArray => "StartArray".to_string(), + Event::EndArray => "EndArray".to_string(), + Event::Key(k) => format!("Key({})", k.as_str()), + Event::String(s) => format!("String({})", s.as_str()), + Event::Number(n) => format!("Number({})", n.as_str()), + Event::Bool(b) => format!("Bool({})", b), + Event::Null => "Null".to_string(), + Event::EndDocument => "EndDocument".to_string(), + } + } +} + +impl<'input, 'scratch, 'expected> PushParserHandler<'input, 'scratch, ParseError> + for ReproHandler<'expected> +{ + fn handle_event(&mut self, event: Event<'input, 'scratch>) -> Result<(), ParseError> { + self.assert_event_matches(&event); + Ok(()) + } +} + +#[test] +fn test_reproduce_invalidslicebounds_minimal() { + // Test parsing JSON with Unicode escapes to ensure no InvalidSliceBounds errors + let json_content = br#"{"hex": "\u0123\u4567\u89AB\uCDEF\uabcd\uef4A"}"#; + + // Use a small buffer that might trigger boundary issues + let mut buffer = [0u8; 128]; + + // Define expected events with properly decoded Unicode escapes + let expected_events = [ + "StartObject", + "Key(hex)", + "String(ģ䕧覫췯ꯍ\u{ef4a})", // Unicode escapes properly decoded to characters + "EndObject", + "EndDocument", + ]; + + let handler = ReproHandler::new(&expected_events); + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + + // Should parse successfully without InvalidSliceBounds error + parser.write(json_content).expect("Write 
should succeed"); + let handler = parser + .finish::() + .expect("Finish should succeed"); + + // Verify all expected events were received + handler.assert_complete(); +} + +#[test] +fn test_reproduce_invalidslicebounds_chunked() { + // Test the same content in small chunks to trigger buffer boundary issues + let json_content = br#"{"hex": "\u0123\u4567\u89AB\uCDEF\uabcd\uef4A"}"#; + + // Use a buffer large enough for the content but small enough to test chunking + let mut buffer = [0u8; 128]; + + // Define expected events (same as previous test) + let expected_events = [ + "StartObject", + "Key(hex)", + "String(ģ䕧覫췯ꯍ\u{ef4a})", + "EndObject", + "EndDocument", + ]; + + let handler = ReproHandler::new(&expected_events); + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + + // Write in small chunks to stress boundary handling + let chunk_size = 8; + for chunk in json_content.chunks(chunk_size) { + parser + .write(chunk) + .expect("Each chunk should parse successfully"); + } + + let handler = parser + .finish::() + .expect("Finish should succeed"); + + // Verify all expected events were received + handler.assert_complete(); +} + +#[test] +fn test_reproduce_invalidslicebounds_complex_key() { + // Test complex key with mixed escapes from pass1.json + let json_content = br#"{"\\\/\\\\\"\uCAFE\uBABE\uAB98\uFCDE\ubcda\uef4A\b\f\n\r\t": "value"}"#; + + // Use a small buffer to stress boundary handling + let mut buffer = [0u8; 128]; + + // Define expected events with complex key containing decoded escape sequences + let expected_events = [ + "StartObject", + "Key(\\/\\\\\"쫾몾ꮘﳞ볚\u{ef4a}\u{8}\u{c}\n\r\t)", // Complex key with decoded escapes + "String(value)", + "EndObject", + "EndDocument", + ]; + + let handler = ReproHandler::new(&expected_events); + let mut parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + + // Should parse successfully without InvalidSliceBounds error + parser.write(json_content).expect("Write should 
succeed"); + let handler = parser + .finish::() + .expect("Finish should succeed"); + + // Verify all expected events were received with proper escape sequence decoding + handler.assert_complete(); +} diff --git a/picojson/tests/push_parser_stress_test.rs b/picojson/tests/push_parser_stress_test.rs new file mode 100644 index 0000000..1a99255 --- /dev/null +++ b/picojson/tests/push_parser_stress_test.rs @@ -0,0 +1,650 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Comprehensive stress tests for PushParser +//! +//! Tests various buffer sizes, write chunk patterns, and edge cases to ensure +//! robustness under different memory and data delivery constraints. + +use picojson::{ + DefaultConfig, Event, JsonNumber, NumberResult, ParseError, PushParseError, PushParser, + PushParserHandler, +}; + +/// Owned event representation for comparison +#[derive(Debug, Clone, PartialEq)] +enum OwnedEvent { + StartObject, + EndObject, + StartArray, + EndArray, + Key(String), + String(String), + Number(String), + Bool(bool), + Null, + EndDocument, +} + +/// Handler that compares events immediately as they arrive +struct StressTestHandler<'expected> { + expected_events: &'expected [OwnedEvent], + current_index: usize, +} + +impl<'expected> StressTestHandler<'expected> { + fn new(expected_events: &'expected [OwnedEvent]) -> Self { + Self { + expected_events, + current_index: 0, + } + } + + fn assert_complete(&self) { + assert_eq!( + self.current_index, + self.expected_events.len(), + "Expected {} events, but only received {}", + self.expected_events.len(), + self.current_index + ); + } + + fn assert_event_matches(&mut self, received: &Event) { + assert!( + self.current_index < self.expected_events.len(), + "Received more events than expected. 
Got event at index {} but only expected {} events total", + self.current_index, + self.expected_events.len() + ); + + let expected = &self.expected_events[self.current_index]; + let received_owned = OwnedEvent::from_event(received); + + assert_eq!( + *expected, received_owned, + "Event mismatch at index {}", + self.current_index + ); + + self.current_index += 1; + } +} + +impl<'input, 'scratch, 'expected> PushParserHandler<'input, 'scratch, ParseError> + for StressTestHandler<'expected> +{ + fn handle_event(&mut self, event: Event<'input, 'scratch>) -> Result<(), ParseError> { + self.assert_event_matches(&event); + Ok(()) + } +} + +/// Handler for tests that expect parsing to fail - accepts any events without validation +struct PermissiveTestHandler; + +impl PermissiveTestHandler { + fn new() -> Self { + Self + } +} + +impl<'input, 'scratch> PushParserHandler<'input, 'scratch, ParseError> for PermissiveTestHandler { + fn handle_event(&mut self, _event: Event<'input, 'scratch>) -> Result<(), ParseError> { + // Accept any events - we expect the parser to fail eventually + Ok(()) + } +} + +impl OwnedEvent { + /// Convert from Event to OwnedEvent + fn from_event(event: &Event) -> Self { + match event { + Event::StartObject => OwnedEvent::StartObject, + Event::EndObject => OwnedEvent::EndObject, + Event::StartArray => OwnedEvent::StartArray, + Event::EndArray => OwnedEvent::EndArray, + Event::Key(k) => OwnedEvent::Key(k.as_ref().to_string()), + Event::String(s) => OwnedEvent::String(s.as_ref().to_string()), + Event::Number(n) => OwnedEvent::Number(n.as_str().to_string()), + Event::Bool(b) => OwnedEvent::Bool(*b), + Event::Null => OwnedEvent::Null, + Event::EndDocument => OwnedEvent::EndDocument, + } + } +} + +/// Writer that delivers data to PushParser in controlled chunks +struct ChunkedWriter<'a> { + data: &'a [u8], + pos: usize, + chunk_pattern: &'a [usize], + pattern_idx: usize, +} + +impl<'a> ChunkedWriter<'a> { + fn new(data: &'a [u8], chunk_pattern: &'a [usize]) 
-> Self { + Self { + data, + pos: 0, + chunk_pattern, + pattern_idx: 0, + } + } + + pub fn run<'input, H, E>( + &mut self, + mut parser: PushParser<'input, '_, H, DefaultConfig>, + ) -> Result> + where + H: for<'i, 's> PushParserHandler<'i, 's, E>, + E: From, + 'a: 'input, + { + while self.pos < self.data.len() { + let chunk_size = if self.chunk_pattern.is_empty() { + self.data.len() - self.pos + } else { + let size = self.chunk_pattern[self.pattern_idx].max(1); + self.pattern_idx = (self.pattern_idx + 1) % self.chunk_pattern.len(); + size + }; + + let end_pos = (self.pos + chunk_size).min(self.data.len()); + let chunk: &'input [u8] = &self.data[self.pos..end_pos]; + + parser.write(chunk)?; + self.pos = end_pos; + } + + parser.finish() + } +} + +/// Test scenario configuration +struct TestScenario { + name: &'static str, + json: &'static [u8], + expected_events: Vec>, + min_buffer_size: usize, +} + +/// Create comprehensive test scenarios covering various edge cases +fn get_push_parser_test_scenarios() -> Vec { + vec![ + TestScenario { + name: "Basic Object", + json: br#"{"hello": "world", "count": 42}"#, + expected_events: vec![ + Event::StartObject, + Event::Key("hello".into()), + Event::String("world".into()), + Event::Key("count".into()), + Event::Number(JsonNumber::Borrowed { + raw: "42", + parsed: NumberResult::Integer(42), + }), + Event::EndObject, + Event::EndDocument, + ], + min_buffer_size: 8, // Needs larger buffer for small chunk patterns that force copy-on-escape + }, + TestScenario { + name: "Empty Strings", + json: br#"{"": ""}"#, + expected_events: vec![ + Event::StartObject, + Event::Key("".into()), + Event::String("".into()), + Event::EndObject, + Event::EndDocument, + ], + min_buffer_size: 1, // Copy-on-escape works even for empty strings + }, + TestScenario { + name: "Long String (No Escapes)", + json: br#"["abcdefghijklmnopqrstuvwxyz"]"#, + expected_events: vec![ + Event::StartArray, + Event::String("abcdefghijklmnopqrstuvwxyz".into()), + 
Event::EndArray, + Event::EndDocument, + ], + min_buffer_size: 26, // String length when using small chunks that force copy-on-escape + }, + TestScenario { + name: "Long Number", + json: br#"[123456789012345678901234567890]"#, + expected_events: vec![ + Event::StartArray, + Event::Number(JsonNumber::Borrowed { + raw: "123456789012345678901234567890", + parsed: NumberResult::IntegerOverflow, + }), + Event::EndArray, + Event::EndDocument, + ], + min_buffer_size: 30, // Number length when using small chunks that force copy-on-escape + }, + TestScenario { + name: "Deeply Nested Arrays", + json: br#"[[[[[[[[[[42]]]]]]]]]]"#, + expected_events: (0..10) + .map(|_| Event::StartArray) + .chain(std::iter::once(Event::Number(JsonNumber::Borrowed { + raw: "42", + parsed: NumberResult::Integer(42), + }))) + .chain((0..10).map(|_| Event::EndArray)) + .chain(std::iter::once(Event::EndDocument)) + .collect(), + min_buffer_size: 2, // Number "42" needs 2 bytes when split by byte-by-byte processing + }, + TestScenario { + name: "Unicode Escapes", + json: br#"["\u0041\u0042\u0043"]"#, + expected_events: vec![ + Event::StartArray, + Event::String("ABC".into()), + Event::EndArray, + Event::EndDocument, + ], + min_buffer_size: 3, // Unicode processing needs buffer space for escape processing + }, + TestScenario { + name: "Mixed Escapes", + json: br#"["a\nb\t\"\\c\u1234d"]"#, + expected_events: vec![ + Event::StartArray, + Event::String("a\nb\t\"\\cሴd".into()), // Mixed escapes with Unicode \u1234 = ሴ + Event::EndArray, + Event::EndDocument, + ], + min_buffer_size: 11, // Mixed escape processing buffer including Unicode + }, + TestScenario { + name: "String ending with escape", + json: br#"["hello\\"]"#, + expected_events: vec![ + Event::StartArray, + Event::String(picojson::String::Unescaped("hello\\")), + Event::EndArray, + Event::EndDocument, + ], + min_buffer_size: 6, // Escape at end processing - copy-on-escape optimization allows smaller buffer + }, + TestScenario { + name: 
"Complex Nested Structure", + json: br#"{"users": [{"name": "Alice", "age": 30}, {"name": "Bob", "age": 25}]}"#, + expected_events: vec![ + Event::StartObject, + Event::Key("users".into()), + Event::StartArray, + Event::StartObject, + Event::Key("name".into()), + Event::String("Alice".into()), + Event::Key("age".into()), + Event::Number(JsonNumber::Borrowed { + raw: "30", + parsed: NumberResult::Integer(30), + }), + Event::EndObject, + Event::StartObject, + Event::Key("name".into()), + Event::String("Bob".into()), + Event::Key("age".into()), + Event::Number(JsonNumber::Borrowed { + raw: "25", + parsed: NumberResult::Integer(25), + }), + Event::EndObject, + Event::EndArray, + Event::EndObject, + Event::EndDocument, + ], + min_buffer_size: 5, // Longest string "Alice"/"users" when using small chunks + }, + ] +} + +/// Core test function that validates PushParser with given buffer and chunk sizes +fn test_push_parsing_with_config( + scenario: &TestScenario, + buffer_size: usize, + chunk_pattern: &[usize], +) -> Result<(), ParseError> { + let mut buffer = vec![0u8; buffer_size]; + let expected_events: Vec = scenario + .expected_events + .iter() + .map(OwnedEvent::from_event) + .collect(); + + let handler = StressTestHandler::new(&expected_events); + let parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + + let mut writer = ChunkedWriter::new(scenario.json, chunk_pattern); + + match writer.run(parser) { + Ok(handler) => { + handler.assert_complete(); + Ok(()) + } + Err(e) => match e { + PushParseError::Parse(parse_err) => Err(parse_err), + PushParseError::Handler(handler_err) => Err(handler_err), + }, + } +} + +/// Determine if a given buffer size should succeed or fail based on chunk pattern +fn should_succeed_push_parser( + buffer_size: usize, + scenario: &TestScenario, + chunk_pattern: &[usize], +) -> bool { + let min_buffer_size = get_min_buffer_size_for_scenario(scenario, chunk_pattern); + buffer_size >= min_buffer_size +} + +/// Calculate minimum 
buffer size based on scenario and chunk pattern +fn get_min_buffer_size_for_scenario(scenario: &TestScenario, chunk_pattern: &[usize]) -> usize { + // Some scenarios always need larger buffers due to escape processing + let needs_escape_buffer = matches!( + scenario.name, + "Unicode Escapes" | "Mixed Escapes" | "String ending with escape" + ); + + // If chunk pattern is empty (single write) or all chunks are large, + // copy-on-escape optimization allows minimal buffers - unless escape processing is needed + let has_small_chunks = chunk_pattern.iter().any(|&size| size <= 20); + + if !has_small_chunks && !needs_escape_buffer { + return 1; // Copy-on-escape optimization works well + } + + // For small chunks that force buffer boundaries or escape processing, need actual content size + match scenario.name { + "Basic Object" => { + if has_small_chunks { + 8 + } else { + 1 + } + } // Longest content: "hello", "world", "count" + "Empty Strings" => 1, // Empty strings need minimal buffer + "Long String (No Escapes)" => { + if has_small_chunks { + 26 + } else { + 1 + } + } // "abcdefghijklmnopqrstuvwxyz" + "Long Number" => { + if has_small_chunks { + 30 + } else { + 1 + } + } // "123456789012345678901234567890" + "Deeply Nested Arrays" => { + if has_small_chunks { + 2 + } else { + 1 + } + } // Number "42" + "Unicode Escapes" => 3, // Unicode processing needs minimal buffer space + "Mixed Escapes" => 11, // Mixed escape processing buffer including Unicode + "String ending with escape" => 6, // Escape at end processing + "Complex Nested Structure" => { + if has_small_chunks { + 5 + } else { + 1 + } + } // "Alice"/"users" + _ => scenario.min_buffer_size, // Use configured value for other scenarios + } +} + +#[test] +fn test_push_parser_stress_buffer_sizes() { + println!("=== PushParser Buffer Size Stress Test ==="); + let scenarios = get_push_parser_test_scenarios(); + + for scenario in &scenarios { + println!("--- Testing Scenario: {} ---", scenario.name); + + for 
buffer_size in 1..=50 { + let result = test_push_parsing_with_config(scenario, buffer_size, &[]); + let expected_success = should_succeed_push_parser(buffer_size, scenario, &[]); + + match (result.is_ok(), expected_success) { + (true, true) => { + println!("✅ [B={}] SUCCESS (expected)", buffer_size); + } + (false, false) => { + println!("✅ [B={}] FAIL (expected)", buffer_size); + } + (true, false) => { + panic!( + "❌ [B={}] Unexpected SUCCESS for scenario '{}'", + buffer_size, scenario.name + ); + } + (false, true) => { + panic!( + "❌ [B={}] Unexpected FAILURE for scenario '{}'", + buffer_size, scenario.name + ); + } + } + } + } +} + +#[test] +fn test_push_parser_stress_chunk_patterns() { + println!("=== PushParser Chunk Pattern Stress Test ==="); + let scenarios = get_push_parser_test_scenarios(); + + // Test patterns: Various chunk sizes to stress boundary handling + let chunk_patterns: &[&[usize]] = &[ + &[50], // Large chunks + &[10], // Medium chunks + &[1], // Byte-by-byte + &[2], // Two bytes at a time + &[3, 1, 2], // Variable small chunks + &[1, 5, 1], // Mixed tiny and small + &[7, 1, 1, 10], // Irregular pattern + ]; + + for scenario in &scenarios { + println!("--- Testing Scenario: {} ---", scenario.name); + let buffer_size = scenario.min_buffer_size + 10; // Adequate buffer + + for &pattern in chunk_patterns { + let result = test_push_parsing_with_config(scenario, buffer_size, pattern); + + match result { + Ok(()) => { + println!("✅ [P={:?}] SUCCESS", pattern); + } + Err(_e) => { + panic!( + "❌ [P={:?}] UNEXPECTED FAILURE for scenario '{}'", + pattern, scenario.name + ); + } + } + } + } +} + +#[test] +fn test_push_parser_stress_critical_matrix() { + println!("=== PushParser Critical Size Matrix Test ==="); + let scenarios = get_push_parser_test_scenarios(); + + let chunk_patterns: &[&[usize]] = &[ + &[50], // Large chunks + &[10], // Medium chunks + &[1], // Byte-by-byte + &[2], // Two bytes at a time + &[3, 1, 2], // Variable small chunks + &[1, 5, 
1], // Mixed tiny and small + &[7, 1, 1, 10], // Irregular pattern + ]; + + for scenario in &scenarios { + println!("--- Testing Scenario: {} ---", scenario.name); + // Use the max min_buffer_size across all chunk patterns for this scenario + let max_min_buffer = chunk_patterns + .iter() + .map(|&pattern| get_min_buffer_size_for_scenario(scenario, pattern)) + .max() + .unwrap_or(scenario.min_buffer_size); + let critical_buffer_sizes: Vec = + (max_min_buffer.saturating_sub(2)..=max_min_buffer + 5).collect(); + + for &buffer_size in &critical_buffer_sizes { + for &pattern in chunk_patterns { + let result = test_push_parsing_with_config(scenario, buffer_size, pattern); + let expected_success = should_succeed_push_parser(buffer_size, scenario, pattern); + + match (result.is_ok(), expected_success) { + (true, true) => { + println!("✅ [B={}, P={:?}] SUCCESS (expected)", buffer_size, pattern); + } + (false, false) => { + println!("✅ [B={}, P={:?}] FAIL (expected)", buffer_size, pattern); + } + (true, false) => { + // With copy-on-escape optimization, we might succeed with smaller buffers + println!("✅ [B={}, P={:?}] Unexpected SUCCESS - copy-on-escape working better than expected", buffer_size, pattern); + } + (false, true) => { + panic!( + "❌ [B={}, P={:?}] Unexpected FAILURE for scenario '{}'", + buffer_size, pattern, scenario.name + ); + } + } + } + } + } +} + +#[test] +fn test_push_parser_stress_unicode_edge_cases() { + println!("=== PushParser Unicode Edge Cases Stress Test ==="); + + let unicode_scenarios = vec![ + TestScenario { + name: "Consecutive Unicode", + json: br#"["\u0123\u4567\u89AB\uCDEF"]"#, + expected_events: vec![ + Event::StartArray, + Event::String(picojson::String::Unescaped("ģ䕧覫췯")), + Event::EndArray, + Event::EndDocument, + ], + min_buffer_size: 25, // Unicode processing buffer for consecutive escapes + }, + TestScenario { + name: "Unicode at Chunk Boundary", + json: br#"["\u0041XYZ"]"#, + expected_events: vec![ + Event::StartArray, + 
Event::String(picojson::String::Unescaped("AXYZ")), + Event::EndArray, + Event::EndDocument, + ], + min_buffer_size: 15, // Unicode + normal text processing + }, + TestScenario { + name: "Empty Key with Unicode Value", + json: br#"{"": "\u2603"}"#, + expected_events: vec![ + Event::StartObject, + Event::Key("".into()), + Event::String(picojson::String::Unescaped("☃")), + Event::EndObject, + Event::EndDocument, + ], + min_buffer_size: 12, // Empty key + unicode value processing + }, + ]; + + for scenario in &unicode_scenarios { + println!("--- Testing Unicode Scenario: {} ---", scenario.name); + + // Test specifically challenging chunk patterns for unicode + let unicode_chunk_patterns: &[&[usize]] = &[ + &[1], // Byte-by-byte (challenges unicode boundaries) + &[6, 1], // Split unicode escapes + &[3, 2, 1], // Irregular splits + ]; + + let buffer_size = scenario.min_buffer_size + 5; + + for &pattern in unicode_chunk_patterns { + let result = test_push_parsing_with_config(scenario, buffer_size, pattern); + + match result { + Ok(()) => { + println!("✅ [P={:?}] Unicode SUCCESS", pattern); + } + Err(_e) => { + panic!( + "❌ [P={:?}] Unicode FAILURE for scenario '{}'", + pattern, scenario.name + ); + } + } + } + } +} + +#[test] +fn test_push_parser_stress_document_validation() { + println!("=== PushParser Document Validation Stress Test ==="); + + // Test incomplete documents that should fail + let invalid_scenarios: Vec<(&str, &[u8], &str)> = vec![ + ("Unclosed Array", b"[\"hello\"", "array not closed"), + ( + "Unclosed Object", + b"{\"key\": \"value\"", + "object not closed", + ), + ("Extra Comma", b"{\"key\": \"value\",}", "trailing comma"), + ("Missing Value", b"{\"key\":}", "missing value"), + ]; + + for (name, json, _description) in &invalid_scenarios { + println!("--- Testing Invalid: {} ---", name); + + let buffer_size = 50; // Adequate buffer + let chunk_patterns: &[&[usize]] = &[&[1], &[3], &[10]]; + + for &pattern in chunk_patterns { + let mut buffer = vec![0u8; 
buffer_size]; + // For invalid JSON tests, use a permissive handler that doesn't validate events + let handler = PermissiveTestHandler::new(); + let parser = PushParser::<_, DefaultConfig>::new(handler, &mut buffer); + let mut writer = ChunkedWriter::new(json, pattern); + + let result = writer.run(parser); + + if result.is_ok() { + panic!( + "❌ [P={:?}] Expected FAILURE for '{}', but got SUCCESS", + pattern, name + ); + } else { + println!("✅ [P={:?}] Correctly FAILED for '{}'", pattern, name); + } + } + } +}