From 799450c107be3eb20e65d423e95e953c95a04c74 Mon Sep 17 00:00:00 2001 From: Kaido Kert Date: Sun, 13 Jul 2025 20:47:22 -0700 Subject: [PATCH 01/10] Another big refactor --- picojson/src/chunk_reader.rs | 2 +- picojson/src/content_builder.rs | 77 ++++ picojson/src/debug_test.rs | 25 ++ picojson/src/event_processor.rs | 10 +- picojson/src/lib.rs | 14 + picojson/src/number_parser.rs | 3 + picojson/src/parser_core.rs | 155 ++++++++ picojson/src/simple_debug.rs | 35 ++ picojson/src/slice_content_builder.rs | 241 +++++++++++ picojson/src/slice_parser.rs | 253 ++++-------- picojson/src/stream_buffer.rs | 9 +- picojson/src/stream_content_builder.rs | 402 +++++++++++++++++++ picojson/src/stream_parser.rs | 531 ++++++------------------- 13 files changed, 1181 insertions(+), 576 deletions(-) create mode 100644 picojson/src/content_builder.rs create mode 100644 picojson/src/debug_test.rs create mode 100644 picojson/src/parser_core.rs create mode 100644 picojson/src/simple_debug.rs create mode 100644 picojson/src/slice_content_builder.rs create mode 100644 picojson/src/stream_content_builder.rs diff --git a/picojson/src/chunk_reader.rs b/picojson/src/chunk_reader.rs index c452a66..a72892b 100644 --- a/picojson/src/chunk_reader.rs +++ b/picojson/src/chunk_reader.rs @@ -356,7 +356,7 @@ mod integration_tests { test_delimiter_bug_helper(2, 128); } - #[test] + #[test_log::test] fn test_delimiter_bug_small_chunks_small_buffer() { // Test if larger chunks fix it: 3-byte chunks + 64-byte buffer test_delimiter_bug_helper(3, 64); diff --git a/picojson/src/content_builder.rs b/picojson/src/content_builder.rs new file mode 100644 index 0000000..4be88c4 --- /dev/null +++ b/picojson/src/content_builder.rs @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Content building and extraction trait for unifying parser content handling. +//! +//! This module provides the `ContentBuilder` trait that consolidates content extraction +//! and escape handling logic between SliceParser and StreamParser, eliminating duplication +//! while preserving each parser's performance characteristics. + +use crate::event_processor::{ContentExtractor, EscapeHandler}; +use crate::{Event, ParseError, String}; + +/// Trait for building and extracting content (strings, keys, numbers) with escape handling. +/// +/// This trait combines the functionality of `ContentExtractor` and `EscapeHandler` into a +/// unified interface that can be implemented by different parser backends while sharing +/// the core event processing logic. +pub trait ContentBuilder: ContentExtractor + EscapeHandler { + /// Begin processing a new string or key at the given position + /// + /// # Arguments + /// * `pos` - Position where the content starts + /// * `is_key` - True if this is a key, false if it's a string value + fn begin_content(&mut self, pos: usize, is_key: bool); + + /// Handle a simple escape character (after EscapeProcessor conversion) + /// + /// # Arguments + /// * `escape_char` - The unescaped character (e.g., b'\n' for "\\n") + fn handle_simple_escape(&mut self, escape_char: u8) -> Result<(), ParseError>; + + /// Handle a Unicode escape sequence, providing the resulting UTF-8 bytes + /// + /// # Arguments + /// * `utf8_bytes` - The UTF-8 encoded bytes for the Unicode codepoint + fn handle_unicode_escape(&mut self, utf8_bytes: &[u8]) -> Result<(), ParseError>; + + /// Append a literal (non-escape) byte during content accumulation + /// + /// # Arguments + /// * `byte` - The literal byte to append + fn append_literal_byte(&mut self, byte: u8) -> Result<(), ParseError>; + + /// Begin an escape sequence (lifecycle hook) + /// Called when escape sequence processing begins (e.g., on Begin(EscapeSequence)) + fn begin_escape_sequence(&mut self) -> Result<(), ParseError>; + + /// Extract a completed string value + /// + /// # Arguments + /// * `start_pos` - Position where the string content started + fn extract_string(&mut self, start_pos: usize) -> Result, ParseError>; + + /// Extract a completed key + /// + /// # Arguments + /// * `start_pos` - Position where the key content started + fn extract_key(&mut self, start_pos: usize) -> Result, ParseError>; + + /// Extract a completed number using shared number parsing logic + /// + /// # Arguments + /// * `start_pos` - Position where the number started + /// * `from_container_end` - True if number was terminated by container delimiter + /// * `finished` - True if the parser has finished processing input (StreamParser-specific) + fn extract_number( + &mut self, + start_pos: usize, + from_container_end: bool, + finished: bool, + ) -> Result, ParseError>; + + /// Get the current position in the input + fn current_position(&self) -> usize; + + /// Check if input is exhausted (for number delimiter logic) + fn is_exhausted(&self) -> bool; +} diff --git a/picojson/src/debug_test.rs b/picojson/src/debug_test.rs new file mode 100644 index 0000000..d8da888 --- /dev/null +++ b/picojson/src/debug_test.rs @@ -0,0 +1,25 @@ +#[cfg(test)] +mod debug_position_test { + use crate::{ChunkReader, Event, PullParser, StreamParser}; + + #[test_log::test] + fn debug_simple_escape_position() { + // Very simple case: just track positions + let json = b"\"a\\nb\""; + println!("JSON bytes: {:?}", json); + for (i, &b) in json.iter().enumerate() { + println!(" pos {}: '{}' ({})", i, b as char, b); + } + + let reader = ChunkReader::new(json, 8); + let mut buffer = [0u8; 32]; + let mut parser = StreamParser::<_, crate::ujson::DefaultConfig>::new(reader, &mut buffer); + + if let Event::String(s) = parser.next_event().unwrap() { + println!("Result: {:?}", s.as_str()); + assert_eq!(s.as_str(), "a\nb"); + } else { + panic!("Expected String event"); + } + } +} diff --git a/picojson/src/event_processor.rs b/picojson/src/event_processor.rs index 39479ed..85a2a87 100644 --- a/picojson/src/event_processor.rs +++ b/picojson/src/event_processor.rs @@ -65,6 +65,7 @@ pub fn process_begin_events( } crate::ujson::Event::Begin(EventToken::String) => { let pos = content_extractor.current_position(); + log::debug!("[NEW] Begin(String): storing pos={} in State::String", pos); *content_extractor.parser_state_mut() = State::String(pos); content_extractor.begin_string_content(pos); Some(EventResult::Continue) @@ -219,11 +220,18 @@ pub fn process_begin_escape_sequence_event( handler: &mut H, ) -> Result<(), crate::ParseError> { // Only process if we're inside a string or key + log::debug!( + "[NEW] process_begin_escape_sequence_event called, state: {:?}", + handler.parser_state() + ); match handler.parser_state() { crate::shared::State::String(_) | crate::shared::State::Key(_) => { + log::debug!("[NEW] calling begin_escape_sequence()"); handler.begin_escape_sequence()?; } - _ => {} // Ignore if not in string/key context + _ => { + log::debug!("[NEW] ignoring escape sequence - not in string/key context"); + } // Ignore if not in string/key context } Ok(()) } diff --git a/picojson/src/lib.rs b/picojson/src/lib.rs index 02fa922..51e78bb 100644 --- a/picojson/src/lib.rs +++ b/picojson/src/lib.rs @@ -63,10 +63,24 @@ mod copy_on_escape; mod escape_processor; +mod content_builder; + +mod parser_core; + mod stream_buffer; +mod stream_content_builder; + mod stream_parser; +#[cfg(test)] +mod debug_test; + +#[cfg(test)] +mod simple_debug; + +mod slice_content_builder; + mod slice_parser; mod parse_error; diff --git a/picojson/src/number_parser.rs b/picojson/src/number_parser.rs index 4a49d03..daafa0f 100644 --- a/picojson/src/number_parser.rs +++ b/picojson/src/number_parser.rs @@ -51,6 +51,9 @@ pub fn parse_number_with_delimiter_logic( let use_full_span = !from_container_end && at_document_end; let end_pos = crate::shared::ContentRange::number_end_position(current_pos, use_full_span); + log::debug!("[NEW] parse_number_with_delimiter_logic: start_pos={}, current_pos={}, from_container_end={}, at_document_end={}, use_full_span={}, end_pos={}", + start_pos, current_pos, from_container_end, at_document_end, use_full_span, end_pos); + parse_number_event(extractor, start_pos, end_pos) } diff --git a/picojson/src/parser_core.rs b/picojson/src/parser_core.rs new file mode 100644 index 0000000..2149579 --- /dev/null +++ b/picojson/src/parser_core.rs @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Unified parser core that handles the common event processing loop. +//! +//! This module provides the `ParserCore` struct that consolidates the shared +//! event processing logic between SliceParser and StreamParser, eliminating +//! the duplication in their `next_event_impl` methods. + +use crate::content_builder::ContentBuilder; +use crate::event_processor::{ + finish_tokenizer, have_events, process_begin_escape_sequence_event, process_begin_events, + process_byte_through_tokenizer, process_simple_escape_event, process_simple_events, + process_unicode_escape_events, take_first_event, EventResult, +}; +use crate::shared::{ByteProvider, Event, ParserState, UnexpectedState}; +use crate::ujson::{EventToken, Tokenizer}; +use crate::{ujson, ParseError}; + +/// The core parser logic that handles the unified event processing loop. +/// +/// This struct contains all the shared state and logic that was previously +/// duplicated between SliceParser and StreamParser. It uses trait abstractions +/// to handle the differences in content building and byte providing. +pub struct ParserCore { + /// The tokenizer that processes JSON tokens + pub tokenizer: Tokenizer, + /// Parser state and event storage + pub parser_state: ParserState, +} + +impl ParserCore { + /// Create a new ParserCore + pub fn new() -> Self { + Self { + tokenizer: Tokenizer::new(), + parser_state: ParserState::new(), + } + } + + /// The unified event processing loop that was previously duplicated + /// between SliceParser and StreamParser. + /// + /// This method implements the common logic while delegating to traits + /// for the parser-specific differences. + pub fn next_event_impl<'a, B, CB>( + &mut self, + byte_provider: &mut B, + content_builder: &'a mut CB, + escape_timing: EscapeTiming, + ) -> Result, ParseError> + where + B: ByteProvider, + CB: ContentBuilder, + { + loop { + while !have_events(&self.parser_state.evts) { + if !self.pull_tokenizer_events(byte_provider)? { + return Ok(Event::EndDocument); + } + } + + let taken_event = take_first_event(&mut self.parser_state.evts); + let Some(taken) = taken_event else { + return Err(UnexpectedState::StateMismatch.into()); + }; + + // Try shared event processors first + if let Some(result) = process_simple_events(&taken) + .or_else(|| process_begin_events(&taken, content_builder)) + { + match result { + EventResult::Complete(event) => return Ok(event), + EventResult::ExtractString => { + return content_builder.validate_and_extract_string() + } + EventResult::ExtractKey => return content_builder.validate_and_extract_key(), + EventResult::ExtractNumber(from_container_end) => { + return content_builder.validate_and_extract_number(from_container_end) + } + EventResult::Continue => continue, + } + } + + // Handle parser-specific events based on escape timing + match taken { + ujson::Event::Begin(EventToken::EscapeSequence) => { + process_begin_escape_sequence_event(content_builder)?; + } + _ if process_unicode_escape_events(&taken, content_builder)? => { + // Unicode escape events handled by shared function + } + ujson::Event::Begin( + escape_token @ (EventToken::EscapeQuote + | EventToken::EscapeBackslash + | EventToken::EscapeSlash + | EventToken::EscapeBackspace + | EventToken::EscapeFormFeed + | EventToken::EscapeNewline + | EventToken::EscapeCarriageReturn + | EventToken::EscapeTab), + ) if escape_timing == EscapeTiming::OnBegin => { + // SliceParser-specific: Handle simple escape sequences on Begin events + // because CopyOnEscape requires starting unescaping immediately when + // the escape token begins to maintain zero-copy optimization + process_simple_escape_event(&escape_token, content_builder)?; + } + ujson::Event::End( + escape_token @ (EventToken::EscapeQuote + | EventToken::EscapeBackslash + | EventToken::EscapeSlash + | EventToken::EscapeBackspace + | EventToken::EscapeFormFeed + | EventToken::EscapeNewline + | EventToken::EscapeCarriageReturn + | EventToken::EscapeTab), + ) if escape_timing == EscapeTiming::OnEnd => { + // StreamParser-specific: Handle simple escape sequences on End events + // because StreamBuffer must wait until the token ends to accumulate + // all bytes before processing the complete escape sequence + process_simple_escape_event(&escape_token, content_builder)?; + } + _ => { + // All other events continue to next iteration + } + } + } + } + + /// Pull events from tokenizer and return whether parsing should continue. + /// This implements the common tokenizer event pulling logic. + fn pull_tokenizer_events( + &mut self, + byte_provider: &mut B, + ) -> Result { + if let Some(byte) = byte_provider.next_byte()? { + process_byte_through_tokenizer(byte, &mut self.tokenizer, &mut self.parser_state.evts)?; + } else { + finish_tokenizer(&mut self.tokenizer, &mut self.parser_state.evts)?; + + if !have_events(&self.parser_state.evts) { + return Ok(false); // Signal end of parsing + } + } + Ok(true) // Continue parsing + } +} + +/// Enum to specify when escape sequences should be processed +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum EscapeTiming { + /// Process simple escape sequences on Begin events (SliceParser) + OnBegin, + /// Process simple escape sequences on End events (StreamParser) + OnEnd, +} diff --git a/picojson/src/simple_debug.rs b/picojson/src/simple_debug.rs new file mode 100644 index 0000000..f9b8abd --- /dev/null +++ b/picojson/src/simple_debug.rs @@ -0,0 +1,35 @@ +#[cfg(test)] +mod simple_debug { + use crate::chunk_reader::ChunkReader; + use crate::*; + + #[test_log::test] + fn trace_hello_escape() { + // Just trace "hello\n" - first 7 bytes + let json = b"\"hello\\n\""; + println!("=== NEW IMPLEMENTATION ==="); + println!("Input bytes: {:?}", json); + for (i, &b) in json.iter().enumerate() { + println!(" Position {}: '{}' (byte {})", i, b as char, b); + } + + let reader = ChunkReader::new(json, 16); + let mut buffer = [0u8; 64]; + let mut parser = + stream_parser::StreamParser::<_, crate::ujson::DefaultConfig>::new(reader, &mut buffer); + + match parser.next_event() { + Ok(Event::String(s)) => { + println!("SUCCESS: Got string '{}'", s.as_str()); + println!("Expected: 'hello\\n', Got: '{}'", s.as_str()); + if s.as_str() == "hello\n" { + println!("✅ Test PASSED"); + } else { + println!("❌ Test FAILED"); + } + } + Ok(other) => println!("ERROR: Expected String, got {:?}", other), + Err(e) => println!("ERROR: Parse failed: {:?}", e), + } + } +} diff --git a/picojson/src/slice_content_builder.rs b/picojson/src/slice_content_builder.rs new file mode 100644 index 0000000..da60145 --- /dev/null +++ b/picojson/src/slice_content_builder.rs @@ -0,0 +1,241 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! ContentBuilder implementation for SliceParser using CopyOnEscape optimization. + +use crate::content_builder::ContentBuilder; +use crate::copy_on_escape::CopyOnEscape; +use crate::escape_processor::UnicodeEscapeCollector; +use crate::event_processor::{ContentExtractor, EscapeHandler}; +use crate::number_parser::NumberExtractor; +use crate::shared::{ContentRange, State}; +use crate::slice_input_buffer::SliceInputBuffer; +use crate::{Event, ParseError, String}; + +/// ContentBuilder implementation for SliceParser that uses CopyOnEscape for zero-copy optimization +pub struct SliceContentBuilder<'a, 'b> { + /// The input buffer for slice-based parsing + buffer: SliceInputBuffer<'a>, + /// Copy-on-escape handler for zero-copy string optimization + copy_on_escape: CopyOnEscape<'a, 'b>, + /// Parser state tracking + parser_state: State, + /// Unicode escape collector for \uXXXX sequences + unicode_escape_collector: UnicodeEscapeCollector, +} + +impl<'a, 'b> SliceContentBuilder<'a, 'b> { + /// Create a new SliceContentBuilder + pub fn new(input: &'a [u8], scratch_buffer: &'b mut [u8]) -> Self { + Self { + buffer: SliceInputBuffer::new(input), + copy_on_escape: CopyOnEscape::new(input, scratch_buffer), + parser_state: State::None, + unicode_escape_collector: UnicodeEscapeCollector::new(), + } + } + + /// Get access to the input buffer for byte operations + pub fn buffer(&self) -> &SliceInputBuffer<'a> { + &self.buffer + } + + /// Get mutable access to the input buffer for byte operations + pub fn buffer_mut(&mut self) -> &mut SliceInputBuffer<'a> { + &mut self.buffer + } +} + +impl<'a, 'b> ContentBuilder for SliceContentBuilder<'a, 'b> { + fn begin_content(&mut self, pos: usize, _is_key: bool) { + // SliceParser uses CopyOnEscape for content management + self.copy_on_escape.begin_string(pos); + } + + fn handle_simple_escape(&mut self, escape_char: u8) -> Result<(), ParseError> { + self.copy_on_escape + .handle_escape(self.buffer.current_pos(), escape_char)?; + Ok(()) + } + + fn handle_unicode_escape(&mut self, utf8_bytes: &[u8]) -> Result<(), ParseError> { + // SliceParser handles Unicode escapes through CopyOnEscape + // The position calculation matches the existing SliceParser logic + let current_pos = self.buffer.current_pos(); + let (_, _, escape_start_pos) = ContentRange::unicode_escape_bounds(current_pos); + + // Check if this is completing a surrogate pair + let had_pending_high_surrogate = self.unicode_escape_collector.has_pending_high_surrogate(); + + if had_pending_high_surrogate { + // This is completing a surrogate pair - need to consume both escapes + // First call: consume the high surrogate (6 bytes earlier) + self.copy_on_escape + .handle_unicode_escape(escape_start_pos, &[])?; + // Second call: consume the low surrogate and write UTF-8 + self.copy_on_escape + .handle_unicode_escape(escape_start_pos + 6, utf8_bytes)?; + } else { + // Single Unicode escape - normal processing + self.copy_on_escape + .handle_unicode_escape(escape_start_pos, utf8_bytes)?; + } + + Ok(()) + } + + fn append_literal_byte(&mut self, _byte: u8) -> Result<(), ParseError> { + // SliceParser doesn't typically need per-byte processing since it works with ranges + // This could be implemented as a single-byte range if needed, but for now it's a no-op + Ok(()) + } + + fn begin_escape_sequence(&mut self) -> Result<(), ParseError> { + // SliceParser doesn't need special escape sequence setup + Ok(()) + } + + fn extract_string(&mut self, _start_pos: usize) -> Result, ParseError> { + // Use CopyOnEscape to get the final string result + let end_pos = ContentRange::end_position_excluding_delimiter(self.buffer.current_pos()); + let value_result = self.copy_on_escape.end_string(end_pos)?; + Ok(value_result) + } + + fn extract_key(&mut self, _start_pos: usize) -> Result, ParseError> { + // Use CopyOnEscape to get the final key result + let end_pos = ContentRange::end_position_excluding_delimiter(self.buffer.current_pos()); + let key_result = self.copy_on_escape.end_string(end_pos)?; + Ok(key_result) + } + + fn extract_number( + &mut self, + start_pos: usize, + from_container_end: bool, + finished: bool, + ) -> Result, ParseError> { + // Use shared number parsing with SliceParser-specific document end detection + // SliceParser uses buffer-based detection: buffer empty indicates document end + let at_document_end = self.buffer.is_empty(); + log::debug!("[NEW] SliceContentBuilder extract_number: start_pos={}, from_container_end={}, finished={} (ignored), buffer_empty={}, at_document_end={}", + start_pos, from_container_end, finished, self.buffer.is_empty(), at_document_end); + crate::number_parser::parse_number_with_delimiter_logic( + &self.buffer, + start_pos, + from_container_end, + at_document_end, + ) + } + + fn current_position(&self) -> usize { + self.buffer.current_pos() + } + + fn is_exhausted(&self) -> bool { + self.buffer.is_empty() + } +} + +impl<'a, 'b> ContentExtractor for SliceContentBuilder<'a, 'b> { + fn parser_state_mut(&mut self) -> &mut crate::shared::State { + &mut self.parser_state + } + + fn current_position(&self) -> usize { + self.buffer.current_pos() + } + + fn begin_string_content(&mut self, pos: usize) { + self.copy_on_escape.begin_string(pos); + } + + fn unicode_escape_collector_mut(&mut self) -> &mut UnicodeEscapeCollector { + &mut self.unicode_escape_collector + } + + fn extract_string_content( + &mut self, + _start_pos: usize, + ) -> Result, ParseError> { + let end_pos = ContentRange::end_position_excluding_delimiter(self.buffer.current_pos()); + let value_result = self.copy_on_escape.end_string(end_pos)?; + Ok(crate::Event::String(value_result)) + } + + fn extract_key_content( + &mut self, + _start_pos: usize, + ) -> Result, ParseError> { + let end_pos = ContentRange::end_position_excluding_delimiter(self.buffer.current_pos()); + let key_result = self.copy_on_escape.end_string(end_pos)?; + Ok(crate::Event::Key(key_result)) + } + + fn extract_number_content( + &mut self, + start_pos: usize, + from_container_end: bool, + ) -> Result, ParseError> { + let at_document_end = self.buffer.is_empty(); + crate::number_parser::parse_number_with_delimiter_logic( + &self.buffer, + start_pos, + from_container_end, + at_document_end, + ) + } +} + +impl<'a, 'b> EscapeHandler for SliceContentBuilder<'a, 'b> { + fn parser_state(&self) -> &crate::shared::State { + &self.parser_state + } + + fn process_unicode_escape_with_collector(&mut self) -> Result<(), ParseError> { + let current_pos = self.buffer.current_pos(); + let hex_slice_provider = |start, end| self.buffer.slice(start, end).map_err(Into::into); + + // Shared Unicode escape processing pattern + let had_pending_high_surrogate = self.unicode_escape_collector.has_pending_high_surrogate(); + + let mut utf8_buf = [0u8; 4]; + let (utf8_bytes_opt, escape_start_pos) = + crate::escape_processor::process_unicode_escape_sequence( + current_pos, + &mut self.unicode_escape_collector, + hex_slice_provider, + &mut utf8_buf, + )?; + + // Handle UTF-8 bytes if we have them (not a high surrogate waiting for low surrogate) + if let Some(utf8_bytes) = utf8_bytes_opt { + if had_pending_high_surrogate { + // This is completing a surrogate pair - need to consume both escapes + // First call: consume the high surrogate (6 bytes earlier) + self.copy_on_escape + .handle_unicode_escape(escape_start_pos, &[])?; + // Second call: consume the low surrogate and write UTF-8 + self.copy_on_escape + .handle_unicode_escape(escape_start_pos + 6, utf8_bytes)?; + } else { + // Single Unicode escape - normal processing + self.copy_on_escape + .handle_unicode_escape(escape_start_pos, utf8_bytes)?; + } + } + + Ok(()) + } + + fn handle_simple_escape_char(&mut self, escape_char: u8) -> Result<(), ParseError> { + self.copy_on_escape + .handle_escape(self.buffer.current_pos(), escape_char)?; + Ok(()) + } + + fn append_literal_byte(&mut self, _byte: u8) -> Result<(), ParseError> { + // SliceParser doesn't typically need per-byte processing since it works with ranges + // This could be implemented as a single-byte range if needed, but for now it's a no-op + Ok(()) + } +} diff --git a/picojson/src/slice_parser.rs b/picojson/src/slice_parser.rs index 7357daa..7bc29e3 100644 --- a/picojson/src/slice_parser.rs +++ b/picojson/src/slice_parser.rs @@ -1,20 +1,12 @@ // SPDX-License-Identifier: Apache-2.0 -use crate::copy_on_escape::CopyOnEscape; -use crate::escape_processor::UnicodeEscapeCollector; -use crate::event_processor::{ - finish_tokenizer, have_events, process_begin_escape_sequence_event, process_begin_events, - process_byte_through_tokenizer, process_simple_escape_event, process_simple_events, - process_unicode_escape_events, take_first_event, ContentExtractor, EscapeHandler, EventResult, -}; -use crate::number_parser::NumberExtractor; +use crate::event_processor::{ContentExtractor, EscapeHandler}; use crate::parse_error::ParseError; -use crate::shared::{ - ByteProvider, ContentRange, Event, ParserState, PullParser, State, UnexpectedState, -}; -use crate::slice_input_buffer::{InputBuffer, SliceInputBuffer}; +use crate::parser_core::ParserCore; +use crate::shared::{ByteProvider, Event, PullParser}; +use crate::slice_content_builder::SliceContentBuilder; +use crate::slice_input_buffer::InputBuffer; use crate::ujson; -use ujson::{EventToken, Tokenizer}; use ujson::{BitStackConfig, DefaultConfig}; @@ -24,12 +16,10 @@ use ujson::{BitStackConfig, DefaultConfig}; // Lifetime 'a is the input buffer lifetime // lifetime 'b is the scratch/copy buffer lifetime pub struct SliceParser<'a, 'b, C: BitStackConfig = DefaultConfig> { - tokenizer: Tokenizer, - buffer: SliceInputBuffer<'a>, - parser_state: ParserState, - copy_on_escape: CopyOnEscape<'a, 'b>, - /// Shared Unicode escape collector for \uXXXX sequences - unicode_escape_collector: UnicodeEscapeCollector, + /// The shared parser core that handles the unified event processing loop + parser_core: ParserCore, + /// The content builder that handles SliceParser-specific content extraction + content_builder: SliceContentBuilder<'a, 'b>, } /// Methods for the pull parser. @@ -147,75 +137,95 @@ impl<'a, 'b, C: BitStackConfig> SliceParser<'a, 'b, C> { input: &'a [u8], scratch_buffer: &'b mut [u8], ) -> Self { - let copy_on_escape = CopyOnEscape::new(input, scratch_buffer); SliceParser { - tokenizer: Tokenizer::new(), - buffer: SliceInputBuffer::new(input), - parser_state: ParserState::new(), - copy_on_escape, - unicode_escape_collector: UnicodeEscapeCollector::new(), + parser_core: ParserCore::new(), + content_builder: SliceContentBuilder::new(input, scratch_buffer), } } /// Returns the next JSON event or an error if parsing fails. /// Parsing continues until `EndDocument` is returned or an error occurs. fn next_event_impl(&mut self) -> Result, ParseError> { - if self.buffer.is_past_end() { - return Ok(Event::EndDocument); - } - + // We need to implement the unified loop locally to avoid borrowing conflicts + // This is essentially a copy of ParserCore::next_event_impl but accessing fields directly loop { - while !have_events(&self.parser_state.evts) { - if !self.pull_tokenizer_events()? { - return Ok(Event::EndDocument); + while !crate::event_processor::have_events(&self.parser_core.parser_state.evts) { + if let Some(byte) = self.next_byte()? { + crate::event_processor::process_byte_through_tokenizer( + byte, + &mut self.parser_core.tokenizer, + &mut self.parser_core.parser_state.evts, + )?; + } else { + crate::event_processor::finish_tokenizer( + &mut self.parser_core.tokenizer, + &mut self.parser_core.parser_state.evts, + )?; + + if !crate::event_processor::have_events(&self.parser_core.parser_state.evts) { + return Ok(Event::EndDocument); + } } } - let taken_event = take_first_event(&mut self.parser_state.evts); + let taken_event = + crate::event_processor::take_first_event(&mut self.parser_core.parser_state.evts); let Some(taken) = taken_event else { - return Err(UnexpectedState::StateMismatch.into()); + return Err(crate::shared::UnexpectedState::StateMismatch.into()); }; // Try shared event processors first if let Some(result) = - process_simple_events(&taken).or_else(|| process_begin_events(&taken, self)) + crate::event_processor::process_simple_events(&taken).or_else(|| { + crate::event_processor::process_begin_events(&taken, &mut self.content_builder) + }) { match result { - EventResult::Complete(event) => return Ok(event), - EventResult::ExtractString => return self.validate_and_extract_string(), - EventResult::ExtractKey => return self.validate_and_extract_key(), - EventResult::ExtractNumber(from_container_end) => { - return self.validate_and_extract_number(from_container_end) + crate::event_processor::EventResult::Complete(event) => return Ok(event), + crate::event_processor::EventResult::ExtractString => { + return self.content_builder.validate_and_extract_string() + } + crate::event_processor::EventResult::ExtractKey => { + return self.content_builder.validate_and_extract_key() } - EventResult::Continue => continue, + crate::event_processor::EventResult::ExtractNumber(from_container_end) => { + return self.extract_number_with_finished_state(from_container_end) + } + crate::event_processor::EventResult::Continue => continue, } } - // Handle parser-specific events + // Handle parser-specific events (SliceParser uses OnBegin timing) match taken { - ujson::Event::Begin(EventToken::EscapeSequence) => { - process_begin_escape_sequence_event(self)?; + ujson::Event::Begin(crate::ujson::EventToken::EscapeSequence) => { + crate::event_processor::process_begin_escape_sequence_event( + &mut self.content_builder, + )?; } - _ if process_unicode_escape_events(&taken, self)? => { + _ if crate::event_processor::process_unicode_escape_events( + &taken, + &mut self.content_builder, + )? => + { // Unicode escape events handled by shared function } ujson::Event::Begin( - escape_token @ (EventToken::EscapeQuote - | EventToken::EscapeBackslash - | EventToken::EscapeSlash - | EventToken::EscapeBackspace - | EventToken::EscapeFormFeed - | EventToken::EscapeNewline - | EventToken::EscapeCarriageReturn - | EventToken::EscapeTab), + escape_token @ (crate::ujson::EventToken::EscapeQuote + | crate::ujson::EventToken::EscapeBackslash + | crate::ujson::EventToken::EscapeSlash + | crate::ujson::EventToken::EscapeBackspace + | crate::ujson::EventToken::EscapeFormFeed + | crate::ujson::EventToken::EscapeNewline + | crate::ujson::EventToken::EscapeCarriageReturn + | crate::ujson::EventToken::EscapeTab), ) => { // SliceParser-specific: Handle simple escape sequences on Begin events // because CopyOnEscape requires starting unescaping immediately when // the escape token begins to maintain zero-copy optimization - process_simple_escape_event(&escape_token, self)?; - } - ujson::Event::End(EventToken::EscapeSequence) => { - // Ignore in SliceParser since it uses slice-based parsing + crate::event_processor::process_simple_escape_event( + &escape_token, + &mut self.content_builder, + )?; } _ => { // All other events continue to next iteration @@ -224,128 +234,31 @@ impl<'a, 'b, C: BitStackConfig> SliceParser<'a, 'b, C> { } } - /// Pull events from tokenizer and return whether parsing should continue - /// Returns false when past end (equivalent to self.buffer.is_past_end()) - fn pull_tokenizer_events(&mut self) -> Result { - if self.buffer.is_past_end() { - return Ok(false); // Indicate end state instead of error - } - // Use ByteProvider implementation to get the next byte and process it - if let Some(byte) = self.next_byte()? { - process_byte_through_tokenizer(byte, &mut self.tokenizer, &mut self.parser_state.evts)?; - } else { - finish_tokenizer(&mut self.tokenizer, &mut self.parser_state.evts)?; - } - Ok(!self.buffer.is_past_end()) // Return continue state - } -} - -impl<'a, 'b, C: BitStackConfig> ContentExtractor for SliceParser<'a, 'b, C> { - fn parser_state_mut(&mut self) -> &mut State { - &mut self.parser_state.state - } - - fn current_position(&self) -> usize { - self.buffer.current_pos() - } - - fn begin_string_content(&mut self, pos: usize) { - self.copy_on_escape.begin_string(pos); - } - - fn unicode_escape_collector_mut( + /// Extract number with proper SliceParser document end detection + fn extract_number_with_finished_state( &mut self, - ) -> &mut crate::escape_processor::UnicodeEscapeCollector { - &mut self.unicode_escape_collector - } - - fn extract_string_content(&mut self, _start_pos: usize) -> Result, ParseError> { - // Use CopyOnEscape to get the final string result - let end_pos = ContentRange::end_position_excluding_delimiter(self.buffer.current_pos()); - let value_result = self.copy_on_escape.end_string(end_pos)?; - Ok(Event::String(value_result)) - } - - fn extract_key_content(&mut self, _start_pos: usize) -> Result, ParseError> { - // Use CopyOnEscape to get the final key result - let end_pos = ContentRange::end_position_excluding_delimiter(self.buffer.current_pos()); - let key_result = self.copy_on_escape.end_string(end_pos)?; - Ok(Event::Key(key_result)) - } - - fn extract_number_content( - &mut self, - start_pos: usize, from_container_end: bool, ) -> Result, ParseError> { - // Use shared number parsing with SliceParser-specific document end detection - let at_document_end = self.buffer.is_empty(); - crate::number_parser::parse_number_with_delimiter_logic( - &self.buffer, - start_pos, - from_container_end, - at_document_end, - ) - } -} + let start_pos = match *self.content_builder.parser_state() { + crate::shared::State::Number(pos) => pos, + _ => return Err(crate::shared::UnexpectedState::StateMismatch.into()), + }; -impl<'a, 'b, C: BitStackConfig> EscapeHandler for SliceParser<'a, 'b, C> { - fn parser_state(&self) -> &State { - &self.parser_state.state - } + *self.content_builder.parser_state_mut() = crate::shared::State::None; - fn process_unicode_escape_with_collector(&mut self) -> Result<(), ParseError> { - let current_pos = self.buffer.current_pos(); - let hex_slice_provider = |start, end| self.buffer.slice(start, end).map_err(Into::into); - - // Shared Unicode escape processing pattern - let had_pending_high_surrogate = self.unicode_escape_collector.has_pending_high_surrogate(); - - let mut utf8_buf = [0u8; 4]; - let (utf8_bytes_opt, escape_start_pos) = - crate::escape_processor::process_unicode_escape_sequence( - current_pos, - &mut self.unicode_escape_collector, - hex_slice_provider, - &mut utf8_buf, - )?; - - // Handle UTF-8 bytes if we have them (not a high surrogate waiting for low surrogate) - if let Some(utf8_bytes) = utf8_bytes_opt { - if had_pending_high_surrogate { - // This is completing a surrogate pair - need to consume both escapes - // First call: consume the high surrogate (6 bytes earlier) - self.copy_on_escape - .handle_unicode_escape(escape_start_pos, &[])?; - // Second call: consume the low surrogate and write UTF-8 - self.copy_on_escape - .handle_unicode_escape(escape_start_pos + 6, utf8_bytes)?; - } else { - // Single Unicode escape - normal processing - self.copy_on_escape - .handle_unicode_escape(escape_start_pos, utf8_bytes)?; - } - } - - Ok(()) - } - - fn handle_simple_escape_char(&mut self, escape_char: u8) -> Result<(), ParseError> { - self.copy_on_escape - .handle_escape(self.buffer.current_pos(), escape_char)?; - Ok(()) - } - - /// Append a single literal byte - implement as single-byte range for consistency - fn append_literal_byte(&mut self, _byte: u8) -> Result<(), ParseError> { - // SliceParser doesn't typically need per-byte processing since it works with ranges - // This could be implemented as a single-byte range if needed, but for now it's a no-op - Ok(()) + // SliceParser doesn't use the finished parameter - it uses buffer emptiness + // Pass finished=false since SliceParser handles document end differently + use crate::content_builder::ContentBuilder; + self.content_builder + .extract_number(start_pos, from_container_end, false) } } impl<'a, 'b, C: BitStackConfig> PullParser for SliceParser<'a, 'b, C> { fn next_event(&mut self) -> Result, ParseError> { + if self.content_builder.buffer().is_past_end() { + return Ok(Event::EndDocument); + } self.next_event_impl() } } @@ -353,7 +266,7 @@ impl<'a, 'b, C: BitStackConfig> PullParser for SliceParser<'a, 'b, C> { impl<'a, 'b, C: BitStackConfig> crate::shared::ByteProvider for SliceParser<'a, 'b, C> { fn next_byte(&mut self) -> Result, ParseError> { use crate::slice_input_buffer::InputBuffer; - match self.buffer.consume_byte() { + match self.content_builder.buffer_mut().consume_byte() { Ok(byte) => Ok(Some(byte)), Err(crate::slice_input_buffer::Error::ReachedEnd) => Ok(None), Err(err) => Err(err.into()), diff --git a/picojson/src/stream_buffer.rs b/picojson/src/stream_buffer.rs index ead407d..23929b0 100644 --- a/picojson/src/stream_buffer.rs +++ b/picojson/src/stream_buffer.rs @@ -1005,6 +1005,13 @@ impl crate::number_parser::NumberExtractor for StreamBuffer<'_> { } fn is_empty(&self) -> bool { - self.tokenize_pos >= self.data_end + let result = self.tokenize_pos >= self.data_end; + log::debug!( + "[NEW] StreamBuffer::is_empty(): tokenize_pos={}, data_end={}, result={}", + self.tokenize_pos, + self.data_end, + result + ); + result } } diff --git a/picojson/src/stream_content_builder.rs b/picojson/src/stream_content_builder.rs new file mode 100644 index 0000000..3b4ef7e --- /dev/null +++ b/picojson/src/stream_content_builder.rs @@ -0,0 +1,402 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! ContentBuilder implementation for StreamParser using StreamBuffer. + +use crate::content_builder::ContentBuilder; +use crate::escape_processor::UnicodeEscapeCollector; +use crate::event_processor::{ContentExtractor, EscapeHandler}; +use crate::shared::{ContentRange, State}; +use crate::stream_buffer::StreamBuffer; +use crate::{Event, ParseError, String}; + +/// ContentBuilder implementation for StreamParser that uses StreamBuffer for streaming and escape processing +pub struct StreamContentBuilder<'b> { + /// StreamBuffer for single-buffer input and escape processing + stream_buffer: StreamBuffer<'b>, + /// Parser state tracking + parser_state: State, + /// Unicode escape collector for \uXXXX sequences + unicode_escape_collector: UnicodeEscapeCollector, + /// Flag to reset unescaped content on next operation + unescaped_reset_queued: bool, + /// Flag to track when we're inside a Unicode escape sequence (collecting hex digits) + in_unicode_escape: bool, +} + +impl<'b> StreamContentBuilder<'b> { + /// Create a new StreamContentBuilder + pub fn new(buffer: &'b mut [u8]) -> Self { + Self { + stream_buffer: StreamBuffer::new(buffer), + parser_state: State::None, + unicode_escape_collector: UnicodeEscapeCollector::new(), + unescaped_reset_queued: false, + in_unicode_escape: false, + } + } + + /// Get access to the stream buffer for byte operations + pub fn stream_buffer(&self) -> &StreamBuffer<'b> { + &self.stream_buffer + } + + /// Get mutable access to the stream buffer for byte operations + pub fn stream_buffer_mut(&mut self) -> &mut StreamBuffer<'b> { + &mut self.stream_buffer + } + + /// Apply queued unescaped content reset if flag is set + pub fn apply_unescaped_reset_if_queued(&mut self) { + if self.unescaped_reset_queued { + self.stream_buffer.clear_unescaped(); + self.unescaped_reset_queued = false; + } + } + + /// Queue a reset of unescaped content for the next operation + fn queue_unescaped_reset(&mut self) { + self.unescaped_reset_queued = true; + } + + /// Helper to create an unescaped string from StreamBuffer + fn create_unescaped_string(&mut self) -> Result, ParseError> { + self.queue_unescaped_reset(); + let unescaped_slice = self.stream_buffer.get_unescaped_slice()?; + let str_content = crate::shared::from_utf8(unescaped_slice)?; + Ok(String::Unescaped(str_content)) + } + + /// Helper to create a borrowed string from StreamBuffer + fn create_borrowed_string( + &mut self, + content_start: usize, + ) -> Result, ParseError> { + let current_pos = self.stream_buffer.current_position(); + let (content_start, content_end) = + ContentRange::string_content_bounds_from_content_start(content_start, current_pos); + + let bytes = self + .stream_buffer + .get_string_slice(content_start, content_end)?; + let str_content = crate::shared::from_utf8(bytes)?; + Ok(String::Borrowed(str_content)) + } + + /// Start escape processing using StreamBuffer + fn start_escape_processing(&mut self) -> Result<(), ParseError> { + // Initialize escape processing with StreamBuffer if not already started + if !self.stream_buffer.has_unescaped_content() { + if let State::String(start_pos) | State::Key(start_pos) = self.parser_state { + let current_pos = self.stream_buffer.current_position(); + + log::debug!( + "[NEW] start_escape_processing: start_pos={}, current_pos={}", + start_pos, + current_pos + ); + + // start_pos already points to content start position (not quote position) + let content_start = start_pos; + // Content to copy ends right before the escape character + let content_end = if self.unicode_escape_collector.has_pending_high_surrogate() { + // Skip copying high surrogate text when processing low surrogate + content_start + } else { + ContentRange::end_position_excluding_delimiter(current_pos) + }; + + // Estimate max length needed for unescaping (content so far + remaining buffer) + let content_len = content_end.wrapping_sub(content_start); + let max_escaped_len = self + .stream_buffer + .remaining_bytes() + .checked_add(content_len) + .ok_or(ParseError::NumericOverflow)?; + + log::debug!("[NEW] start_escape_processing: content_start={}, content_end={}, copying {} bytes", + content_start, content_end, content_end.wrapping_sub(content_start)); + + // Start unescaping with StreamBuffer and copy existing content + self.stream_buffer.start_unescaping_with_copy( + max_escaped_len, + content_start, + content_end, + )?; + } + } + + Ok(()) + } +} + +impl<'b> ContentBuilder for StreamContentBuilder<'b> { + fn begin_content(&mut self, _pos: usize, _is_key: bool) { + // StreamParser doesn't need explicit content begin processing + // as it handles content accumulation automatically + } + + fn handle_simple_escape(&mut self, escape_char: u8) -> Result<(), ParseError> { + self.stream_buffer + .append_unescaped_byte(escape_char) + .map_err(ParseError::from) + } + + fn handle_unicode_escape(&mut self, utf8_bytes: &[u8]) -> Result<(), ParseError> { + // StreamParser handles all escape sequences the same way - append bytes to escape buffer + for &byte in utf8_bytes { + self.stream_buffer + .append_unescaped_byte(byte) + .map_err(ParseError::from)?; + } + Ok(()) + } + + fn append_literal_byte(&mut self, byte: u8) -> Result<(), ParseError> { + // Check if we're in a string or key state and should accumulate bytes + let in_string_mode = matches!(self.parser_state, State::String(_) | State::Key(_)); + + if in_string_mode { + // CRITICAL FIX: If we're inside a Unicode escape sequence, route hex digits to the collector + if self.in_unicode_escape { + log::debug!( + "[NEW] Unicode escape hex digit: {:02x} ('{}')", + byte, + byte as char + ); + + // Try to add the hex digit to the collector + let is_complete = self.unicode_escape_collector.add_hex_digit(byte)?; + + if is_complete { + log::debug!("[NEW] Unicode escape sequence complete, processing to UTF-8"); + + // Process the complete sequence to UTF-8 + let mut utf8_buf = [0u8; 4]; + let (utf8_bytes_opt, _) = self + .unicode_escape_collector + .process_to_utf8(&mut utf8_buf)?; + + // Write UTF-8 bytes to escape buffer if we have them + if let Some(utf8_bytes) = utf8_bytes_opt { + for &utf8_byte in utf8_bytes { + self.stream_buffer + .append_unescaped_byte(utf8_byte) + .map_err(ParseError::from)?; + } + } + + // Clear the Unicode escape state - we'll get End(UnicodeEscape) event next + self.in_unicode_escape = false; + } + + return Ok(()); + } + + // Normal literal byte processing (not inside Unicode escape) + // Skip writing bytes to escape buffer when we have a pending high surrogate + // (prevents literal \uD801 text from being included in final string) + if self.stream_buffer.has_unescaped_content() + && !self.unicode_escape_collector.has_pending_high_surrogate() + { + self.stream_buffer + .append_unescaped_byte(byte) + .map_err(ParseError::from)?; + } + } + + Ok(()) + } + + fn begin_escape_sequence(&mut self) -> Result<(), ParseError> { + log::debug!("[NEW] begin_escape_sequence() called"); + self.start_escape_processing() + } + + fn extract_string(&mut self, start_pos: usize) -> Result, ParseError> { + if self.stream_buffer.has_unescaped_content() { + self.create_unescaped_string() + } else { + self.create_borrowed_string(start_pos) + } + } + + fn extract_key(&mut self, start_pos: usize) -> Result, ParseError> { + if self.stream_buffer.has_unescaped_content() { + self.queue_unescaped_reset(); + let unescaped_slice = self.stream_buffer.get_unescaped_slice()?; + let str_content = crate::shared::from_utf8(unescaped_slice)?; + Ok(String::Unescaped(str_content)) + } else { + self.create_borrowed_string(start_pos) + } + } + + fn extract_number( + &mut self, + start_pos: usize, + from_container_end: bool, + finished: bool, + ) -> Result, ParseError> { + // Use shared number parsing with StreamParser-specific document end detection + // StreamParser uses state-based detection: finished flag indicates true document end + let at_document_end = finished; + log::debug!("[NEW] extract_number: start_pos={}, from_container_end={}, finished={}, at_document_end={}", + start_pos, from_container_end, finished, at_document_end); + crate::number_parser::parse_number_with_delimiter_logic( + &self.stream_buffer, + start_pos, + from_container_end, + at_document_end, + ) + } + + fn current_position(&self) -> usize { + self.stream_buffer.current_position() + } + + fn is_exhausted(&self) -> bool { + self.stream_buffer.is_empty() + } +} + +impl<'b> ContentExtractor for StreamContentBuilder<'b> { + fn parser_state_mut(&mut self) -> &mut State { + &mut self.parser_state + } + + fn current_position(&self) -> usize { + self.stream_buffer.current_position() + } + + fn begin_string_content(&mut self, _pos: usize) { + // StreamParser doesn't need explicit string begin processing + // as it handles content accumulation automatically + } + + fn unicode_escape_collector_mut(&mut self) -> &mut UnicodeEscapeCollector { + &mut self.unicode_escape_collector + } + + fn extract_string_content(&mut self, start_pos: usize) -> Result, ParseError> { + let string = if self.stream_buffer.has_unescaped_content() { + self.create_unescaped_string()? + } else { + self.create_borrowed_string(start_pos)? + }; + Ok(Event::String(string)) + } + + fn extract_key_content(&mut self, start_pos: usize) -> Result, ParseError> { + let key = if self.stream_buffer.has_unescaped_content() { + self.queue_unescaped_reset(); + let unescaped_slice = self.stream_buffer.get_unescaped_slice()?; + let str_content = crate::shared::from_utf8(unescaped_slice)?; + String::Unescaped(str_content) + } else { + self.create_borrowed_string(start_pos)? + }; + Ok(Event::Key(key)) + } + + fn extract_number_content( + &mut self, + start_pos: usize, + from_container_end: bool, + ) -> Result, ParseError> { + // LEGACY METHOD: This should be updated to use the new ContentBuilder::extract_number + // For now, use conservative approach - assume not finished for ContentExtractor calls + let finished = false; // Conservative: assume not at document end for legacy calls + let at_document_end = finished; + log::debug!("[NEW] extract_number_content LEGACY: start_pos={}, from_container_end={}, assuming finished={}, at_document_end={}", + start_pos, from_container_end, finished, at_document_end); + crate::number_parser::parse_number_with_delimiter_logic( + &self.stream_buffer, + start_pos, + from_container_end, + at_document_end, + ) + } +} + +impl<'b> EscapeHandler for StreamContentBuilder<'b> { + fn parser_state(&self) -> &State { + &self.parser_state + } + + fn process_unicode_escape_with_collector(&mut self) -> Result<(), ParseError> { + // Shared Unicode escape processing pattern - collect UTF-8 bytes first to avoid borrow conflicts + let utf8_bytes_result = { + let current_pos = self.stream_buffer.current_position(); + let hex_slice_provider = |start, end| { + self.stream_buffer + .get_string_slice(start, end) + .map_err(Into::into) + }; + + let mut utf8_buf = [0u8; 4]; + let (utf8_bytes_opt, _escape_start_pos) = + crate::escape_processor::process_unicode_escape_sequence( + current_pos, + &mut self.unicode_escape_collector, + hex_slice_provider, + &mut utf8_buf, + )?; + + // Copy UTF-8 bytes to avoid borrow conflicts + utf8_bytes_opt.map(|bytes| { + let mut copy = [0u8; 4]; + let len = bytes.len(); + if let Some(dest) = copy.get_mut(..len) { + dest.copy_from_slice(bytes); + } + (copy, len) + }) + }; + + // Handle UTF-8 bytes if we have them (not a high surrogate waiting for low surrogate) + if let Some((utf8_bytes, len)) = utf8_bytes_result { + // StreamParser handles all escape sequences the same way - append bytes to escape buffer + // Use safe slice access to avoid panic + if let Some(valid_bytes) = utf8_bytes.get(..len) { + for &byte in valid_bytes { + self.stream_buffer + .append_unescaped_byte(byte) + .map_err(ParseError::from)?; + } + } + } + + Ok(()) + } + + fn handle_simple_escape_char(&mut self, escape_char: u8) -> Result<(), ParseError> { + self.stream_buffer + .append_unescaped_byte(escape_char) + .map_err(ParseError::from) + } + + fn append_literal_byte(&mut self, byte: u8) -> Result<(), ParseError> { + // Check if we're in a string or key state and should accumulate bytes + let in_string_mode = matches!(self.parser_state, State::String(_) | State::Key(_)); + + if in_string_mode { + // Skip writing bytes to escape buffer when we have a pending high surrogate + // (prevents literal \uD801 text from being included in final string) + if self.stream_buffer.has_unescaped_content() + && !self.unicode_escape_collector.has_pending_high_surrogate() + { + self.stream_buffer + .append_unescaped_byte(byte) + .map_err(ParseError::from)?; + } + } + + Ok(()) + } + + fn begin_escape_sequence(&mut self) -> Result<(), ParseError> { + log::debug!("[NEW] EscapeHandler::begin_escape_sequence() called"); + // Delegate to ContentBuilder implementation + ContentBuilder::begin_escape_sequence(self) + } +} diff --git a/picojson/src/stream_parser.rs b/picojson/src/stream_parser.rs index 3fc77d1..7a52410 100644 --- a/picojson/src/stream_parser.rs +++ b/picojson/src/stream_parser.rs @@ -1,16 +1,11 @@ // SPDX-License-Identifier: Apache-2.0 -use crate::escape_processor::UnicodeEscapeCollector; -use crate::event_processor::{ - finish_tokenizer, have_events, process_begin_escape_sequence_event, process_begin_events, - process_byte_through_tokenizer, process_simple_escape_event, process_simple_events, - process_unicode_escape_events, take_first_event, ContentExtractor, EscapeHandler, EventResult, -}; +use crate::event_processor::{ContentExtractor, EscapeHandler}; use crate::parse_error::ParseError; -use crate::shared::{ByteProvider, ContentRange, Event, ParserState}; -use crate::stream_buffer::StreamBuffer; +use crate::parser_core::ParserCore; +use crate::shared::{ByteProvider, Event}; +use crate::stream_content_builder::StreamContentBuilder; use crate::{ujson, PullParser}; -use ujson::{EventToken, Tokenizer}; use ujson::{BitStackConfig, DefaultConfig}; @@ -29,39 +24,20 @@ pub trait Reader { fn read(&mut self, buf: &mut [u8]) -> Result; } -/// Represents the processing state of the StreamParser -/// Enforces logical invariants: once Finished, no other processing states are possible -#[derive(Debug)] -enum ProcessingState { - /// Normal active processing - Active { - unescaped_reset_queued: bool, - in_escape_sequence: bool, - }, - /// All input consumed, tokenizer finished - Finished, -} - /// A pull parser that parses JSON from a stream. /// /// Generic over BitStackConfig for configurable nesting depth. /// It is designed to be used with the [Reader] trait, which is used to read data from a stream. /// pub struct StreamParser<'b, R: Reader, C: BitStackConfig = DefaultConfig> { - /// The tokenizer that processes JSON tokens - tokenizer: Tokenizer, - /// Parser state tracking - parser_state: ParserState, + /// The shared parser core that handles the unified event processing loop + parser_core: ParserCore, + /// The content builder that handles StreamParser-specific content extraction + content_builder: StreamContentBuilder<'b>, /// Reader for streaming input reader: R, - /// StreamBuffer for single-buffer input and escape processing - stream_buffer: StreamBuffer<'b>, - - /// Processing state machine that enforces logical invariants - processing_state: ProcessingState, - - /// Shared Unicode escape collector for \uXXXX sequences - unicode_escape_collector: UnicodeEscapeCollector, + /// Flag to track if input is finished + finished: bool, } /// Methods for StreamParser using DefaultConfig @@ -96,18 +72,10 @@ impl<'b, R: Reader, C: BitStackConfig> StreamParser<'b, R, C> { /// ``` pub fn with_config(reader: R, buffer: &'b mut [u8]) -> Self { Self { - tokenizer: Tokenizer::new(), - parser_state: ParserState::new(), + parser_core: ParserCore::new(), + content_builder: StreamContentBuilder::new(buffer), reader, - stream_buffer: StreamBuffer::new(buffer), - - // Initialize new state machine to Active with default values - processing_state: ProcessingState::Active { - unescaped_reset_queued: false, - in_escape_sequence: false, - }, - - unicode_escape_collector: UnicodeEscapeCollector::new(), + finished: false, } } } @@ -116,57 +84,108 @@ impl<'b, R: Reader, C: BitStackConfig> StreamParser<'b, R, C> { impl StreamParser<'_, R, C> { /// Get the next JSON event from the stream fn next_event_impl(&mut self) -> Result, ParseError> { - self.apply_unescaped_reset_if_queued(); - + // We need to implement the unified loop locally to avoid borrowing conflicts + // This is essentially a copy of ParserCore::next_event_impl but accessing fields directly loop { - while !have_events(&self.parser_state.evts) { - if !self.pull_tokenizer_events()? { - return Ok(Event::EndDocument); + while !crate::event_processor::have_events(&self.parser_core.parser_state.evts) { + if let Some(byte) = self.next_byte()? { + let pos_before = self.content_builder.current_position(); + log::trace!( + "[NEW] Processing byte '{}' ({}) at pos {}", + byte as char, + byte, + pos_before + ); + + crate::event_processor::process_byte_through_tokenizer( + byte, + &mut self.parser_core.tokenizer, + &mut self.parser_core.parser_state.evts, + )?; + + let has_events = + crate::event_processor::have_events(&self.parser_core.parser_state.evts); + log::trace!("[NEW] After tokenizer: has_events={}", has_events); + + // Handle byte accumulation if no event was generated (StreamParser-specific) + if !has_events { + log::trace!("[NEW] No events generated, calling handle_byte_accumulation"); + self.handle_byte_accumulation(byte)?; + } + } else { + // Handle end of data with tokenizer finish + if !self.finished { + self.finished = true; + crate::event_processor::finish_tokenizer( + &mut self.parser_core.tokenizer, + &mut self.parser_core.parser_state.evts, + )?; + } + + if !crate::event_processor::have_events(&self.parser_core.parser_state.evts) { + return Ok(Event::EndDocument); + } } } - let taken_event = take_first_event(&mut self.parser_state.evts); + let taken_event = + crate::event_processor::take_first_event(&mut self.parser_core.parser_state.evts); let Some(taken) = taken_event else { return Err(crate::shared::UnexpectedState::StateMismatch.into()); }; // Try shared event processors first if let Some(result) = - process_simple_events(&taken).or_else(|| process_begin_events(&taken, self)) + crate::event_processor::process_simple_events(&taken).or_else(|| { + crate::event_processor::process_begin_events(&taken, &mut self.content_builder) + }) { match result { - EventResult::Complete(event) => return Ok(event), - EventResult::ExtractString => return self.validate_and_extract_string(), - EventResult::ExtractKey => return self.validate_and_extract_key(), - EventResult::ExtractNumber(from_container_end) => { - return self.validate_and_extract_number(from_container_end) + crate::event_processor::EventResult::Complete(event) => return Ok(event), + crate::event_processor::EventResult::ExtractString => { + return self.content_builder.validate_and_extract_string() } - EventResult::Continue => continue, + crate::event_processor::EventResult::ExtractKey => { + return self.content_builder.validate_and_extract_key() + } + crate::event_processor::EventResult::ExtractNumber(from_container_end) => { + return self.extract_number_with_finished_state(from_container_end) + } + crate::event_processor::EventResult::Continue => continue, } } - // Handle parser-specific events + // Handle parser-specific events (StreamParser uses OnEnd timing) match taken { - ujson::Event::Begin(EventToken::EscapeSequence) => { - process_begin_escape_sequence_event(self)?; + ujson::Event::Begin(crate::ujson::EventToken::EscapeSequence) => { + crate::event_processor::process_begin_escape_sequence_event( + &mut self.content_builder, + )?; } - _ if process_unicode_escape_events(&taken, self)? => { + _ if crate::event_processor::process_unicode_escape_events( + &taken, + &mut self.content_builder, + )? => + { // Unicode escape events handled by shared function } ujson::Event::End( - escape_token @ (EventToken::EscapeQuote - | EventToken::EscapeBackslash - | EventToken::EscapeSlash - | EventToken::EscapeBackspace - | EventToken::EscapeFormFeed - | EventToken::EscapeNewline - | EventToken::EscapeCarriageReturn - | EventToken::EscapeTab), + escape_token @ (crate::ujson::EventToken::EscapeQuote + | crate::ujson::EventToken::EscapeBackslash + | crate::ujson::EventToken::EscapeSlash + | crate::ujson::EventToken::EscapeBackspace + | crate::ujson::EventToken::EscapeFormFeed + | crate::ujson::EventToken::EscapeNewline + | crate::ujson::EventToken::EscapeCarriageReturn + | crate::ujson::EventToken::EscapeTab), ) => { // StreamParser-specific: Handle simple escape sequences on End events // because StreamBuffer must wait until the token ends to accumulate // all bytes before processing the complete escape sequence - process_simple_escape_event(&escape_token, self)?; + crate::event_processor::process_simple_escape_event( + &escape_token, + &mut self.content_builder, + )?; } _ => { // All other events continue to next iteration @@ -175,98 +194,48 @@ impl StreamParser<'_, R, C> { } } - /// Pull events from tokenizer and return whether parsing should continue - /// Returns false when finished (equivalent to ProcessingState::Finished) - fn pull_tokenizer_events(&mut self) -> Result { - if let Some(byte) = self.next_byte()? { - // Process byte through tokenizer using shared logic - process_byte_through_tokenizer(byte, &mut self.tokenizer, &mut self.parser_state.evts)?; - - // Handle byte accumulation if no event was generated (StreamParser-specific) - if !have_events(&self.parser_state.evts) { - self.handle_byte_accumulation(byte)?; - } - } else { - // Handle end of data with tokenizer finish - if !matches!(self.processing_state, ProcessingState::Finished) { - self.processing_state = ProcessingState::Finished; - - // Use shared logic for finish - finish_tokenizer(&mut self.tokenizer, &mut self.parser_state.evts)?; - } - - if !have_events(&self.parser_state.evts) { - return Ok(false); // Signal end of parsing - } - // Continue to process any events generated by finish() - } - Ok(true) // Continue parsing - } - - /// Helper to create an unescaped string from StreamBuffer - fn create_unescaped_string(&mut self) -> Result, ParseError> { - self.queue_unescaped_reset(); - let unescaped_slice = self.stream_buffer.get_unescaped_slice()?; - let str_content = crate::shared::from_utf8(unescaped_slice)?; - Ok(Event::String(crate::String::Unescaped(str_content))) - } - - /// Helper to create a borrowed string from StreamBuffer - fn create_borrowed_string( + /// Extract number with proper StreamParser document end detection + fn extract_number_with_finished_state( &mut self, - content_start: usize, + from_container_end: bool, ) -> Result, ParseError> { - let current_pos = self.stream_buffer.current_position(); - let (content_start, content_end) = - ContentRange::string_content_bounds_from_content_start(content_start, current_pos); - - let bytes = self - .stream_buffer - .get_string_slice(content_start, content_end)?; - let str_content = crate::shared::from_utf8(bytes)?; - Ok(Event::String(crate::String::Borrowed(str_content))) - } + let start_pos = match *self.content_builder.parser_state() { + crate::shared::State::Number(pos) => pos, + _ => return Err(crate::shared::UnexpectedState::StateMismatch.into()), + }; - /// Helper to create an unescaped key from StreamBuffer - fn create_unescaped_key(&mut self) -> Result, ParseError> { - self.queue_unescaped_reset(); - let unescaped_slice = self.stream_buffer.get_unescaped_slice()?; - let str_content = crate::shared::from_utf8(unescaped_slice)?; - Ok(Event::Key(crate::String::Unescaped(str_content))) - } + *self.content_builder.parser_state_mut() = crate::shared::State::None; - /// Helper to create a borrowed key from StreamBuffer - fn create_borrowed_key(&mut self, content_start: usize) -> Result, ParseError> { - let current_pos = self.stream_buffer.current_position(); - let (content_start, content_end) = - ContentRange::string_content_bounds_from_content_start(content_start, current_pos); - - let bytes = self - .stream_buffer - .get_string_slice(content_start, content_end)?; - let str_content = crate::shared::from_utf8(bytes)?; - Ok(Event::Key(crate::String::Borrowed(str_content))) + // Use the new ContentBuilder::extract_number method with finished state + use crate::content_builder::ContentBuilder; + self.content_builder + .extract_number(start_pos, from_container_end, self.finished) } /// Fill buffer from reader fn fill_buffer_from_reader(&mut self) -> Result<(), ParseError> { - if let Some(fill_slice) = self.stream_buffer.get_fill_slice() { + if let Some(fill_slice) = self.content_builder.stream_buffer_mut().get_fill_slice() { let bytes_read = self .reader .read(fill_slice) .map_err(|_| ParseError::ReaderError)?; - self.stream_buffer.mark_filled(bytes_read)?; + self.content_builder + .stream_buffer_mut() + .mark_filled(bytes_read)?; } else { // Buffer is full - ALWAYS attempt compaction - let compact_start_pos = match self.parser_state.state { + let compact_start_pos = match *self.content_builder.parser_state_mut() { crate::shared::State::Number(start_pos) => start_pos, crate::shared::State::Key(start_pos) => start_pos, crate::shared::State::String(start_pos) => start_pos, - _ => self.stream_buffer.current_position(), + _ => self.content_builder.stream_buffer().current_position(), }; - let offset = self.stream_buffer.compact_from(compact_start_pos)?; + let offset = self + .content_builder + .stream_buffer_mut() + .compact_from(compact_start_pos)?; if offset == 0 { // SOL: Buffer too small for current token @@ -277,13 +246,15 @@ impl StreamParser<'_, R, C> { self.update_positions_after_compaction(offset)?; // Try to fill again after compaction - if let Some(fill_slice) = self.stream_buffer.get_fill_slice() { + if let Some(fill_slice) = self.content_builder.stream_buffer_mut().get_fill_slice() { let bytes_read = self .reader .read(fill_slice) .map_err(|_| ParseError::ReaderError)?; - self.stream_buffer.mark_filled(bytes_read)?; + self.content_builder + .stream_buffer_mut() + .mark_filled(bytes_read)?; } } Ok(()) @@ -293,7 +264,7 @@ impl StreamParser<'_, R, C> { fn update_positions_after_compaction(&mut self, offset: usize) -> Result<(), ParseError> { // Check for positions that would be discarded and need escape mode // CRITICAL: Position 0 is never discarded, regardless of offset - let needs_escape_mode = match &self.parser_state.state { + let needs_escape_mode = match self.content_builder.parser_state_mut() { crate::shared::State::Key(pos) if *pos > 0 && *pos < offset => Some((*pos, true)), // true = is_key crate::shared::State::String(pos) if *pos > 0 && *pos < offset => Some((*pos, false)), // false = is_string crate::shared::State::Number(pos) if *pos > 0 && *pos < offset => { @@ -312,7 +283,7 @@ impl StreamParser<'_, R, C> { } // Update positions - match &mut self.parser_state.state { + match self.content_builder.parser_state_mut() { crate::shared::State::None => { // No position-based state to update } @@ -398,262 +369,16 @@ impl StreamParser<'_, R, C> { /// Handle byte accumulation for strings/keys and Unicode escape sequences fn handle_byte_accumulation(&mut self, byte: u8) -> Result<(), ParseError> { - // Use shared literal byte append logic - self.append_literal_byte(byte) - } - - /// Start escape processing using StreamBuffer - fn start_escape_processing(&mut self) -> Result<(), ParseError> { - // Update escape state in enum - if let ProcessingState::Active { - ref mut in_escape_sequence, - .. - } = self.processing_state - { - *in_escape_sequence = true; - } - - // Initialize escape processing with StreamBuffer if not already started - if !self.stream_buffer.has_unescaped_content() { - if let crate::shared::State::String(start_pos) | crate::shared::State::Key(start_pos) = - self.parser_state.state - { - let current_pos = self.stream_buffer.current_position(); - - // With content tracking, start_pos is the content_start - let content_start = start_pos; - // Content to copy ends right before the escape character - let content_end = if self.unicode_escape_collector.has_pending_high_surrogate() { - // Skip copying high surrogate text when processing low surrogate - content_start - } else { - ContentRange::end_position_excluding_delimiter(current_pos) - }; - - // Estimate max length needed for unescaping (content so far + remaining buffer) - let content_len = content_end.wrapping_sub(content_start); - let max_escaped_len = self - .stream_buffer - .remaining_bytes() - .checked_add(content_len) - .ok_or(ParseError::NumericOverflow)?; - - // Start unescaping with StreamBuffer and copy existing content - self.stream_buffer.start_unescaping_with_copy( - max_escaped_len, - content_start, - content_end, - )?; - } - } - - Ok(()) - } - - /// Append a byte to the StreamBuffer's unescaped content - fn append_byte_to_escape_buffer(&mut self, byte: u8) -> Result<(), ParseError> { - self.stream_buffer - .append_unescaped_byte(byte) - .map_err(|e| e.into()) - } - - /// Queue a reset of unescaped content for the next next_event() call - fn queue_unescaped_reset(&mut self) { - // Set the reset flag in the Active state - if let ProcessingState::Active { - ref mut unescaped_reset_queued, - .. - } = self.processing_state - { - *unescaped_reset_queued = true; - } - } - - /// Apply queued unescaped content reset if flag is set - fn apply_unescaped_reset_if_queued(&mut self) { - // Check the enum field first - let should_reset = if let ProcessingState::Active { - ref mut unescaped_reset_queued, - .. - } = self.processing_state - { - let needs_reset = *unescaped_reset_queued; - *unescaped_reset_queued = false; // Clear the flag - needs_reset - } else { - false - }; - - if should_reset { - self.stream_buffer.clear_unescaped(); - } - } -} - -impl<'b, R: Reader, C: BitStackConfig> ContentExtractor for StreamParser<'b, R, C> { - fn parser_state_mut(&mut self) -> &mut crate::shared::State { - &mut self.parser_state.state - } - - fn current_position(&self) -> usize { - self.stream_buffer.current_position() - } - - fn begin_string_content(&mut self, _pos: usize) { - // StreamParser doesn't need explicit string begin processing - // as it handles content accumulation automatically - } - - fn unicode_escape_collector_mut( - &mut self, - ) -> &mut crate::escape_processor::UnicodeEscapeCollector { - &mut self.unicode_escape_collector - } - - fn extract_string_content(&mut self, start_pos: usize) -> Result, ParseError> { - if self.stream_buffer.has_unescaped_content() { - self.create_unescaped_string() - } else { - self.create_borrowed_string(start_pos) - } - } - - fn extract_key_content(&mut self, start_pos: usize) -> Result, ParseError> { - if self.stream_buffer.has_unescaped_content() { - self.create_unescaped_key() - } else { - self.create_borrowed_key(start_pos) - } - } - - fn extract_number_content( - &mut self, - start_pos: usize, - from_container_end: bool, - ) -> Result, ParseError> { - // Use shared number parsing with StreamParser-specific document end detection - let at_document_end = matches!(self.processing_state, ProcessingState::Finished); - crate::number_parser::parse_number_with_delimiter_logic( - &self.stream_buffer, - start_pos, - from_container_end, - at_document_end, - ) - } -} - -impl<'b, R: Reader, C: BitStackConfig> EscapeHandler for StreamParser<'b, R, C> { - fn parser_state(&self) -> &crate::shared::State { - &self.parser_state.state - } - - fn process_unicode_escape_with_collector(&mut self) -> Result<(), crate::ParseError> { - // Shared Unicode escape processing pattern - collect UTF-8 bytes first to avoid borrow conflicts - let utf8_bytes_result = { - let current_pos = self.stream_buffer.current_position(); - let hex_slice_provider = |start, end| { - self.stream_buffer - .get_string_slice(start, end) - .map_err(Into::into) - }; - - let mut utf8_buf = [0u8; 4]; - let (utf8_bytes_opt, _escape_start_pos) = - crate::escape_processor::process_unicode_escape_sequence( - current_pos, - &mut self.unicode_escape_collector, - hex_slice_provider, - &mut utf8_buf, - )?; - - // Copy UTF-8 bytes to avoid borrow conflicts - utf8_bytes_opt.map(|bytes| { - let mut copy = [0u8; 4]; - let len = bytes.len(); - if let Some(dest) = copy.get_mut(..len) { - dest.copy_from_slice(bytes); - } - (copy, len) - }) - }; - - // Handle UTF-8 bytes if we have them (not a high surrogate waiting for low surrogate) - if let Some((utf8_bytes, len)) = utf8_bytes_result { - // StreamParser handles all escape sequences the same way - append bytes to escape buffer - // Use safe slice access to avoid panic - if let Some(valid_bytes) = utf8_bytes.get(..len) { - for &byte in valid_bytes { - self.append_byte_to_escape_buffer(byte)?; - } - } - } - - // Update escape state in enum - Unicode escape processing is complete - if let ProcessingState::Active { - ref mut in_escape_sequence, - .. - } = self.processing_state - { - *in_escape_sequence = false; - } - - Ok(()) - } - - fn handle_simple_escape_char(&mut self, escape_char: u8) -> Result<(), crate::ParseError> { - // Update escape state in enum - if let ProcessingState::Active { - ref mut in_escape_sequence, - .. - } = self.processing_state - { - *in_escape_sequence = false; - } - - self.append_byte_to_escape_buffer(escape_char)?; - Ok(()) - } - - /// Begin escape sequence processing - calls StreamParser's start_escape_processing - fn begin_escape_sequence(&mut self) -> Result<(), crate::ParseError> { - self.start_escape_processing() - } - - /// Append a single literal byte - StreamParser's per-byte accumulation pattern - fn append_literal_byte(&mut self, byte: u8) -> Result<(), crate::ParseError> { - // Check if we're in a string or key state and should accumulate bytes - let in_string_mode = matches!( - self.parser_state.state, - crate::shared::State::String(_) | crate::shared::State::Key(_) - ); - - if in_string_mode { - // Access escape state from enum - let in_escape = if let ProcessingState::Active { - in_escape_sequence, .. - } = &self.processing_state - { - *in_escape_sequence - } else { - false - }; - - // Skip writing bytes to escape buffer when we have a pending high surrogate - // (prevents literal \uD801 text from being included in final string) - if !in_escape - && self.stream_buffer.has_unescaped_content() - && !self.unicode_escape_collector.has_pending_high_surrogate() - { - self.append_byte_to_escape_buffer(byte)?; - } - } - - Ok(()) + // Use ContentBuilder's literal byte append logic + use crate::content_builder::ContentBuilder; + ContentBuilder::append_literal_byte(&mut self.content_builder, byte) } } impl<'b, R: Reader, C: BitStackConfig> PullParser for StreamParser<'b, R, C> { fn next_event(&mut self) -> Result, ParseError> { + self.content_builder.apply_unescaped_reset_if_queued(); + self.next_event_impl() } } @@ -661,18 +386,18 @@ impl<'b, R: Reader, C: BitStackConfig> PullParser for StreamParser<'b, R, C> { impl<'b, R: Reader, C: BitStackConfig> crate::shared::ByteProvider for StreamParser<'b, R, C> { fn next_byte(&mut self) -> Result, ParseError> { // If buffer is empty, try to fill it first - if self.stream_buffer.is_empty() { + if self.content_builder.stream_buffer().is_empty() { self.fill_buffer_from_reader()?; } // If still empty after fill attempt, we're at EOF - if self.stream_buffer.is_empty() { + if self.content_builder.stream_buffer().is_empty() { return Ok(None); } // Get byte and advance - let byte = self.stream_buffer.current_byte()?; - self.stream_buffer.advance()?; + let byte = self.content_builder.stream_buffer().current_byte()?; + self.content_builder.stream_buffer_mut().advance()?; Ok(Some(byte)) } } @@ -759,7 +484,7 @@ mod tests { assert!(matches!(event, Event::EndDocument)); } - #[test] + #[test_log::test] fn test_direct_parser_simple_escape() { let json = b"\"hello\\nworld\""; let reader = SliceReader::new(json); From a74336d754d2ddceb784aa01054e16ab31a6ddeb Mon Sep 17 00:00:00 2001 From: Kaido Kert Date: Sun, 13 Jul 2025 23:04:36 -0700 Subject: [PATCH 02/10] Chunk update --- picojson/src/escape_processor.rs | 12 +++ picojson/src/event_processor.rs | 19 +++- picojson/src/slice_content_builder.rs | 36 ++++++-- picojson/src/slice_parser.rs | 19 ++++ picojson/src/stream_content_builder.rs | 98 +++++++++++---------- picojson/tests/stream_parser_stress_test.rs | 49 ++++++++++- 6 files changed, 178 insertions(+), 55 deletions(-) diff --git a/picojson/src/escape_processor.rs b/picojson/src/escape_processor.rs index 2b9e513..c69a2be 100644 --- a/picojson/src/escape_processor.rs +++ b/picojson/src/escape_processor.rs @@ -150,6 +150,9 @@ impl EscapeProcessor { let digit = Self::validate_hex_digit(byte)?; codepoint = (codepoint << 4) | digit; } + + log::debug!("NEW process_unicode_escape: hex_slice={:?}, codepoint=0x{:04X}, pending_high_surrogate={:?}", + hex_slice, codepoint, pending_high_surrogate); // Check if we have a pending high surrogate if let Some(high) = pending_high_surrogate { @@ -159,23 +162,29 @@ impl EscapeProcessor { let combined = Self::combine_surrogate_pair(high, codepoint)?; let ch = char::from_u32(combined).ok_or(ParseError::InvalidUnicodeCodepoint)?; let utf8_str = ch.encode_utf8(utf8_buffer); + log::debug!("NEW process_unicode_escape: Surrogate pair combined: high=0x{:04X}, low=0x{:04X}, combined=0x{:04X}, utf8={:?}", + high, codepoint, combined, utf8_str.as_bytes()); Ok((Some(utf8_str.as_bytes()), None)) } else { // Error: high surrogate not followed by low surrogate + log::debug!("NEW process_unicode_escape: Error - high surrogate not followed by low surrogate"); Err(ParseError::InvalidUnicodeCodepoint) } } else { // No pending high surrogate if Self::is_high_surrogate(codepoint) { // Save this high surrogate for the next escape + log::debug!("NEW process_unicode_escape: High surrogate saved: 0x{:04X}", codepoint); Ok((None, Some(codepoint))) } else if Self::is_low_surrogate(codepoint) { // Error: low surrogate without preceding high surrogate + log::debug!("NEW process_unicode_escape: Error - low surrogate without preceding high surrogate"); Err(ParseError::InvalidUnicodeCodepoint) } else { // Regular Unicode character let ch = char::from_u32(codepoint).ok_or(ParseError::InvalidUnicodeCodepoint)?; let utf8_str = ch.encode_utf8(utf8_buffer); + log::debug!("NEW process_unicode_escape: Regular Unicode character: 0x{:04X} -> {:?}", codepoint, utf8_str.as_bytes()); Ok((Some(utf8_str.as_bytes()), None)) } } @@ -670,6 +679,9 @@ where // Extract the 4 hex digits from the buffer using the provider let hex_slice = hex_slice_provider(hex_start, hex_end)?; + + log::debug!("NEW process_unicode_escape_sequence: current_pos={}, hex_start={}, hex_end={}, escape_start_pos={}, hex_slice={:?}", + current_pos, hex_start, hex_end, escape_start_pos, hex_slice); if hex_slice.len() != 4 { return Err(UnexpectedState::InvalidUnicodeEscape.into()); diff --git a/picojson/src/event_processor.rs b/picojson/src/event_processor.rs index 85a2a87..9fb445c 100644 --- a/picojson/src/event_processor.rs +++ b/picojson/src/event_processor.rs @@ -30,6 +30,12 @@ pub trait EscapeHandler { fn append_literal_byte(&mut self, _byte: u8) -> Result<(), crate::ParseError> { Ok(()) } + + /// Begin unicode escape sequence processing + /// Default implementation is no-op - suitable for parsers that don't need special handling + fn begin_unicode_escape(&mut self) -> Result<(), crate::ParseError> { + Ok(()) + } } /// Result of processing a tokenizer event @@ -267,23 +273,32 @@ pub fn process_unicode_escape_events( ) -> Result { match event { crate::ujson::Event::Begin(EventToken::UnicodeEscape) => { + log::debug!("NEW event_processor: Begin(UnicodeEscape) event received"); // Start Unicode escape collection - reset collector for new sequence // Only handle if we're inside a string or key match content_extractor.parser_state() { crate::shared::State::String(_) | crate::shared::State::Key(_) => { + log::debug!("NEW event_processor: Resetting unicode escape collector"); content_extractor.unicode_escape_collector_mut().reset(); + content_extractor.begin_unicode_escape()?; } - _ => {} // Ignore if not in string/key context + _ => { + log::debug!("NEW event_processor: Ignoring unicode escape outside string/key context"); + } // Ignore if not in string/key context } Ok(true) // Event was handled } crate::ujson::Event::End(EventToken::UnicodeEscape) => { + log::debug!("NEW event_processor: End(UnicodeEscape) event received"); // Handle end of Unicode escape sequence (\uXXXX) match content_extractor.parser_state() { crate::shared::State::String(_) | crate::shared::State::Key(_) => { + log::debug!("NEW event_processor: Processing unicode escape with collector"); content_extractor.process_unicode_escape_with_collector()?; } - _ => {} // Ignore if not in string/key context + _ => { + log::debug!("NEW event_processor: Ignoring unicode escape end outside string/key context"); + } // Ignore if not in string/key context } Ok(true) // Event was handled } diff --git a/picojson/src/slice_content_builder.rs b/picojson/src/slice_content_builder.rs index da60145..17b4720 100644 --- a/picojson/src/slice_content_builder.rs +++ b/picojson/src/slice_content_builder.rs @@ -21,6 +21,8 @@ pub struct SliceContentBuilder<'a, 'b> { parser_state: State, /// Unicode escape collector for \uXXXX sequences unicode_escape_collector: UnicodeEscapeCollector, + /// Flag to track when we're inside ANY escape sequence (like stream implementation) + in_escape_sequence: bool, } impl<'a, 'b> SliceContentBuilder<'a, 'b> { @@ -31,6 +33,7 @@ impl<'a, 'b> SliceContentBuilder<'a, 'b> { copy_on_escape: CopyOnEscape::new(input, scratch_buffer), parser_state: State::None, unicode_escape_collector: UnicodeEscapeCollector::new(), + in_escape_sequence: false, } } @@ -52,6 +55,10 @@ impl<'a, 'b> ContentBuilder for SliceContentBuilder<'a, 'b> { } fn handle_simple_escape(&mut self, escape_char: u8) -> Result<(), ParseError> { + // Clear the escape sequence flag when simple escape completes + self.in_escape_sequence = false; + log::debug!("[SLICE] handle_simple_escape: Clearing in_escape_sequence flag, escape_char={:02x}", escape_char); + self.copy_on_escape .handle_escape(self.buffer.current_pos(), escape_char)?; Ok(()) @@ -62,6 +69,8 @@ impl<'a, 'b> ContentBuilder for SliceContentBuilder<'a, 'b> { // The position calculation matches the existing SliceParser logic let current_pos = self.buffer.current_pos(); let (_, _, escape_start_pos) = ContentRange::unicode_escape_bounds(current_pos); + // Fix: escape_start_pos should point to the backslash, not the 'u' + let actual_escape_start_pos = escape_start_pos.saturating_sub(1); // Check if this is completing a surrogate pair let had_pending_high_surrogate = self.unicode_escape_collector.has_pending_high_surrogate(); @@ -70,14 +79,14 @@ impl<'a, 'b> ContentBuilder for SliceContentBuilder<'a, 'b> { // This is completing a surrogate pair - need to consume both escapes // First call: consume the high surrogate (6 bytes earlier) self.copy_on_escape - .handle_unicode_escape(escape_start_pos, &[])?; + .handle_unicode_escape(actual_escape_start_pos, &[])?; // Second call: consume the low surrogate and write UTF-8 self.copy_on_escape - .handle_unicode_escape(escape_start_pos + 6, utf8_bytes)?; + .handle_unicode_escape(actual_escape_start_pos + 6, utf8_bytes)?; } else { // Single Unicode escape - normal processing self.copy_on_escape - .handle_unicode_escape(escape_start_pos, utf8_bytes)?; + .handle_unicode_escape(actual_escape_start_pos, utf8_bytes)?; } Ok(()) @@ -90,7 +99,9 @@ impl<'a, 'b> ContentBuilder for SliceContentBuilder<'a, 'b> { } fn begin_escape_sequence(&mut self) -> Result<(), ParseError> { - // SliceParser doesn't need special escape sequence setup + // Set escape flag to prevent literal byte accumulation during escape processing + self.in_escape_sequence = true; + log::debug!("[SLICE] begin_escape_sequence: Setting in_escape_sequence=true"); Ok(()) } @@ -192,6 +203,9 @@ impl<'a, 'b> EscapeHandler for SliceContentBuilder<'a, 'b> { } fn process_unicode_escape_with_collector(&mut self) -> Result<(), ParseError> { + // Clear the escape sequence flag when unicode escape completes + self.in_escape_sequence = false; + log::debug!("[SLICE] process_unicode_escape_with_collector: Clearing in_escape_sequence flag"); let current_pos = self.buffer.current_pos(); let hex_slice_provider = |start, end| self.buffer.slice(start, end).map_err(Into::into); @@ -206,6 +220,10 @@ impl<'a, 'b> EscapeHandler for SliceContentBuilder<'a, 'b> { hex_slice_provider, &mut utf8_buf, )?; + // Fix: escape_start_pos should point to the backslash, not the 'u' + // But don't subtract if it's already pointing to the backslash + let actual_escape_start_pos = escape_start_pos; + log::debug!("SLICE CONTENT BUILDER process_unicode_escape_with_collector_impl: using actual_escape_start_pos={} (was {})", actual_escape_start_pos, escape_start_pos); // Handle UTF-8 bytes if we have them (not a high surrogate waiting for low surrogate) if let Some(utf8_bytes) = utf8_bytes_opt { @@ -213,14 +231,14 @@ impl<'a, 'b> EscapeHandler for SliceContentBuilder<'a, 'b> { // This is completing a surrogate pair - need to consume both escapes // First call: consume the high surrogate (6 bytes earlier) self.copy_on_escape - .handle_unicode_escape(escape_start_pos, &[])?; + .handle_unicode_escape(actual_escape_start_pos, &[])?; // Second call: consume the low surrogate and write UTF-8 self.copy_on_escape - .handle_unicode_escape(escape_start_pos + 6, utf8_bytes)?; + .handle_unicode_escape(actual_escape_start_pos + 6, utf8_bytes)?; } else { // Single Unicode escape - normal processing self.copy_on_escape - .handle_unicode_escape(escape_start_pos, utf8_bytes)?; + .handle_unicode_escape(actual_escape_start_pos, utf8_bytes)?; } } @@ -228,6 +246,10 @@ impl<'a, 'b> EscapeHandler for SliceContentBuilder<'a, 'b> { } fn handle_simple_escape_char(&mut self, escape_char: u8) -> Result<(), ParseError> { + // Clear the escape sequence flag when simple escape completes + self.in_escape_sequence = false; + log::debug!("[SLICE] handle_simple_escape_char: Clearing in_escape_sequence flag, escape_char={:02x}", escape_char); + self.copy_on_escape .handle_escape(self.buffer.current_pos(), escape_char)?; Ok(()) diff --git a/picojson/src/slice_parser.rs b/picojson/src/slice_parser.rs index 7bc29e3..033857e 100644 --- a/picojson/src/slice_parser.rs +++ b/picojson/src/slice_parser.rs @@ -589,6 +589,18 @@ mod tests { #[test] fn test_unicode_escape_integration() { let input = r#"{"key": "Hello\u0041World"}"#; // \u0041 = 'A' + println!("=== SLICE PARSER DEBUG ==="); + println!("Input: {}", input); + println!("Expected in string: 'HelloAWorld'"); + + // Print byte positions for the string part + let string_start = input.find("Hello").unwrap(); + let string_part = &input[string_start..]; + println!("String part: '{}'", string_part); + for (i, &byte) in string_part.as_bytes().iter().enumerate() { + println!(" pos {}: '{}' ({:02x})", string_start + i, byte as char, byte); + } + let mut scratch = [0u8; 1024]; let mut parser = SliceParser::with_buffer(input, &mut scratch); @@ -598,6 +610,13 @@ mod tests { // The string with Unicode escape should be unescaped match parser.next_event().unwrap() { Event::String(String::Unescaped(s)) => { + println!("=== RESULT ANALYSIS ==="); + println!("Expected: 'HelloAWorld'"); + println!("Got: '{}'", s); + println!("Character breakdown:"); + for (i, ch) in s.char_indices() { + println!(" [{}] '{}' = U+{:04X}", i, ch, ch as u32); + } assert_eq!(s, "HelloAWorld"); } other => panic!("Expected unescaped string value, got: {:?}", other), diff --git a/picojson/src/stream_content_builder.rs b/picojson/src/stream_content_builder.rs index 3b4ef7e..8372ce9 100644 --- a/picojson/src/stream_content_builder.rs +++ b/picojson/src/stream_content_builder.rs @@ -21,6 +21,8 @@ pub struct StreamContentBuilder<'b> { unescaped_reset_queued: bool, /// Flag to track when we're inside a Unicode escape sequence (collecting hex digits) in_unicode_escape: bool, + /// Flag to track when we're inside ANY escape sequence (like old implementation) + in_escape_sequence: bool, } impl<'b> StreamContentBuilder<'b> { @@ -32,6 +34,7 @@ impl<'b> StreamContentBuilder<'b> { unicode_escape_collector: UnicodeEscapeCollector::new(), unescaped_reset_queued: false, in_unicode_escape: false, + in_escape_sequence: false, } } @@ -136,12 +139,17 @@ impl<'b> ContentBuilder for StreamContentBuilder<'b> { } fn handle_simple_escape(&mut self, escape_char: u8) -> Result<(), ParseError> { + // Clear the escape sequence flag when simple escape completes + self.in_escape_sequence = false; + log::debug!("[NEW] handle_simple_escape: Clearing in_escape_sequence flag, escape_char={:02x}", escape_char); + self.stream_buffer .append_unescaped_byte(escape_char) .map_err(ParseError::from) } fn handle_unicode_escape(&mut self, utf8_bytes: &[u8]) -> Result<(), ParseError> { + log::debug!("STREAM CONTENT BUILDER handle_unicode_escape called with utf8_bytes={:?}", utf8_bytes); // StreamParser handles all escape sequences the same way - append bytes to escape buffer for &byte in utf8_bytes { self.stream_buffer @@ -156,51 +164,26 @@ impl<'b> ContentBuilder for StreamContentBuilder<'b> { let in_string_mode = matches!(self.parser_state, State::String(_) | State::Key(_)); if in_string_mode { - // CRITICAL FIX: If we're inside a Unicode escape sequence, route hex digits to the collector - if self.in_unicode_escape { - log::debug!( - "[NEW] Unicode escape hex digit: {:02x} ('{}')", - byte, - byte as char - ); - - // Try to add the hex digit to the collector - let is_complete = self.unicode_escape_collector.add_hex_digit(byte)?; - - if is_complete { - log::debug!("[NEW] Unicode escape sequence complete, processing to UTF-8"); - - // Process the complete sequence to UTF-8 - let mut utf8_buf = [0u8; 4]; - let (utf8_bytes_opt, _) = self - .unicode_escape_collector - .process_to_utf8(&mut utf8_buf)?; - - // Write UTF-8 bytes to escape buffer if we have them - if let Some(utf8_bytes) = utf8_bytes_opt { - for &utf8_byte in utf8_bytes { - self.stream_buffer - .append_unescaped_byte(utf8_byte) - .map_err(ParseError::from)?; - } - } - - // Clear the Unicode escape state - we'll get End(UnicodeEscape) event next - self.in_unicode_escape = false; - } - - return Ok(()); - } - - // Normal literal byte processing (not inside Unicode escape) - // Skip writing bytes to escape buffer when we have a pending high surrogate - // (prevents literal \uD801 text from being included in final string) - if self.stream_buffer.has_unescaped_content() + log::debug!("[NEW] append_literal_byte: byte={:02x} ('{}'), in_escape_sequence={}, in_unicode_escape={}, has_unescaped={}, pos={}", + byte, byte as char, self.in_escape_sequence, self.in_unicode_escape, + self.stream_buffer.has_unescaped_content(), self.stream_buffer.current_position()); + + // CRITICAL FIX: Follow old implementation pattern - do NOT write to escape buffer + // when inside ANY escape sequence (in_escape_sequence == true) + // This prevents hex digits from being accumulated as literal text + if !self.in_escape_sequence + && self.stream_buffer.has_unescaped_content() && !self.unicode_escape_collector.has_pending_high_surrogate() { + log::debug!("[NEW] Writing literal byte to escape buffer: {:02x}", byte); self.stream_buffer .append_unescaped_byte(byte) .map_err(ParseError::from)?; + } else { + log::debug!("[NEW] Skipping literal byte (in_escape_sequence={}, in_unicode_escape={}, has_unescaped={}, pending_surrogate={})", + self.in_escape_sequence, self.in_unicode_escape, + self.stream_buffer.has_unescaped_content(), + self.unicode_escape_collector.has_pending_high_surrogate()); } } @@ -209,6 +192,8 @@ impl<'b> ContentBuilder for StreamContentBuilder<'b> { fn begin_escape_sequence(&mut self) -> Result<(), ParseError> { log::debug!("[NEW] begin_escape_sequence() called"); + self.in_escape_sequence = true; + log::debug!("[NEW] begin_escape_sequence: Setting in_escape_sequence=true at pos {}", self.stream_buffer.current_position()); self.start_escape_processing() } @@ -322,8 +307,21 @@ impl<'b> EscapeHandler for StreamContentBuilder<'b> { fn parser_state(&self) -> &State { &self.parser_state } + + fn begin_unicode_escape(&mut self) -> Result<(), ParseError> { + // Called when Begin(UnicodeEscape) is received + self.in_unicode_escape = true; + self.in_escape_sequence = true; + log::debug!("[NEW] begin_unicode_escape: Setting in_unicode_escape=true and in_escape_sequence=true at pos {}", self.stream_buffer.current_position()); + Ok(()) + } fn process_unicode_escape_with_collector(&mut self) -> Result<(), ParseError> { + // Reset the escape flags + self.in_unicode_escape = false; + self.in_escape_sequence = false; + log::debug!("[NEW] process_unicode_escape_with_collector: Clearing escape flags at pos {}", self.stream_buffer.current_position()); + // Shared Unicode escape processing pattern - collect UTF-8 bytes first to avoid borrow conflicts let utf8_bytes_result = { let current_pos = self.stream_buffer.current_position(); @@ -334,13 +332,15 @@ impl<'b> EscapeHandler for StreamContentBuilder<'b> { }; let mut utf8_buf = [0u8; 4]; - let (utf8_bytes_opt, _escape_start_pos) = + let (utf8_bytes_opt, escape_start_pos) = crate::escape_processor::process_unicode_escape_sequence( current_pos, &mut self.unicode_escape_collector, hex_slice_provider, &mut utf8_buf, )?; + log::debug!("STREAM CONTENT BUILDER process_unicode_escape_with_collector: current_pos={}, escape_start_pos={}, utf8_bytes_opt={:?}", + current_pos, escape_start_pos, utf8_bytes_opt); // Copy UTF-8 bytes to avoid borrow conflicts utf8_bytes_opt.map(|bytes| { @@ -370,6 +370,10 @@ impl<'b> EscapeHandler for StreamContentBuilder<'b> { } fn handle_simple_escape_char(&mut self, escape_char: u8) -> Result<(), ParseError> { + // Clear the escape sequence flag when simple escape completes + self.in_escape_sequence = false; + log::debug!("[NEW] handle_simple_escape_char: Clearing in_escape_sequence flag, escape_char={:02x}", escape_char); + self.stream_buffer .append_unescaped_byte(escape_char) .map_err(ParseError::from) @@ -380,9 +384,11 @@ impl<'b> EscapeHandler for StreamContentBuilder<'b> { let in_string_mode = matches!(self.parser_state, State::String(_) | State::Key(_)); if in_string_mode { - // Skip writing bytes to escape buffer when we have a pending high surrogate - // (prevents literal \uD801 text from being included in final string) - if self.stream_buffer.has_unescaped_content() + // CRITICAL FIX: Follow old implementation pattern - do NOT write to escape buffer + // when inside ANY escape sequence (in_escape_sequence == true) + // This prevents hex digits from being accumulated as literal text + if !self.in_escape_sequence + && self.stream_buffer.has_unescaped_content() && !self.unicode_escape_collector.has_pending_high_surrogate() { self.stream_buffer @@ -396,7 +402,9 @@ impl<'b> EscapeHandler for StreamContentBuilder<'b> { fn begin_escape_sequence(&mut self) -> Result<(), ParseError> { log::debug!("[NEW] EscapeHandler::begin_escape_sequence() called"); - // Delegate to ContentBuilder implementation + self.in_escape_sequence = true; + log::debug!("[NEW] EscapeHandler::begin_escape_sequence: Setting in_escape_sequence=true at pos {}", self.stream_buffer.current_position()); + // Delegate to ContentBuilder implementation ContentBuilder::begin_escape_sequence(self) } } diff --git a/picojson/tests/stream_parser_stress_test.rs b/picojson/tests/stream_parser_stress_test.rs index 1c17637..6c85377 100644 --- a/picojson/tests/stream_parser_stress_test.rs +++ b/picojson/tests/stream_parser_stress_test.rs @@ -214,6 +214,53 @@ fn test_stress_buffer_sizes_with_full_reads() { } } +#[test] +fn debug_unicode_issue() { + let json = br#"["a\nb\t\"\\c\u1234d"]"#; + let reader = ChunkReader::new(json, 5); + let mut buffer = [0u8; 30]; + + let mut parser = StreamParser::new(reader, &mut buffer); + let mut event_count = 0; + + loop { + let event = match parser.next_event() { + Ok(event) => event, + Err(e) => { + println!("Parse error: {:?}", e); + break; + } + }; + + println!("Event[{}]: {:?}", event_count, event); + + // Check string content when we encounter it + if let Event::String(s) = &event { + match s { + picojson::String::Unescaped(content) => { + println!("String Content Analysis:"); + println!(" Expected: 'a\\nb\\t\\\"\\\\cሴd'"); + println!(" Got: '{}'", content); + println!(" Character codes:"); + for (i, ch) in content.char_indices() { + println!(" [{}] '{}' = U+{:04X}", i, ch, ch as u32); + } + }, + picojson::String::Borrowed(content) => { + println!("Borrowed String: '{}'", content); + } + } + } + + event_count += 1; + + match event { + Event::EndDocument => break, + _ => {} + } + } +} + #[test] fn test_stress_chunk_sizes_with_adequate_buffer() { let scenarios = get_test_scenarios(); @@ -283,7 +330,7 @@ fn test_stress_matrix_critical_sizes() { } } -#[test] +#[test_log::test] fn test_stress_variable_read_sizes() { let scenarios = get_test_scenarios(); let patterns: &[&[usize]] = &[&[1, 5, 2], &[7, 1, 1, 10], &[1]]; From 9f3cfb7e4eb6991694795cf855cd281f9eb9c70a Mon Sep 17 00:00:00 2001 From: Kaido Kert Date: Sun, 13 Jul 2025 23:34:12 -0700 Subject: [PATCH 03/10] Fixups --- picojson/src/escape_processor.rs | 21 +++-- picojson/src/event_processor.rs | 6 +- picojson/src/slice_content_builder.rs | 28 +++--- picojson/src/slice_parser.rs | 11 ++- picojson/src/stream_content_builder.rs | 56 +++++++----- picojson/tests/debug_root_numbers.rs | 94 ++++++++++++++++++++- picojson/tests/stream_parser_stress_test.rs | 14 +-- 7 files changed, 180 insertions(+), 50 deletions(-) diff --git a/picojson/src/escape_processor.rs b/picojson/src/escape_processor.rs index c69a2be..3fef114 100644 --- a/picojson/src/escape_processor.rs +++ b/picojson/src/escape_processor.rs @@ -150,8 +150,8 @@ impl EscapeProcessor { let digit = Self::validate_hex_digit(byte)?; codepoint = (codepoint << 4) | digit; } - - log::debug!("NEW process_unicode_escape: hex_slice={:?}, codepoint=0x{:04X}, pending_high_surrogate={:?}", + + log::debug!("NEW process_unicode_escape: hex_slice={:?}, codepoint=0x{:04X}, pending_high_surrogate={:?}", hex_slice, codepoint, pending_high_surrogate); // Check if we have a pending high surrogate @@ -162,7 +162,7 @@ impl EscapeProcessor { let combined = Self::combine_surrogate_pair(high, codepoint)?; let ch = char::from_u32(combined).ok_or(ParseError::InvalidUnicodeCodepoint)?; let utf8_str = ch.encode_utf8(utf8_buffer); - log::debug!("NEW process_unicode_escape: Surrogate pair combined: high=0x{:04X}, low=0x{:04X}, combined=0x{:04X}, utf8={:?}", + log::debug!("NEW process_unicode_escape: Surrogate pair combined: high=0x{:04X}, low=0x{:04X}, combined=0x{:04X}, utf8={:?}", high, codepoint, combined, utf8_str.as_bytes()); Ok((Some(utf8_str.as_bytes()), None)) } else { @@ -174,7 +174,10 @@ impl EscapeProcessor { // No pending high surrogate if Self::is_high_surrogate(codepoint) { // Save this high surrogate for the next escape - log::debug!("NEW process_unicode_escape: High surrogate saved: 0x{:04X}", codepoint); + log::debug!( + "NEW process_unicode_escape: High surrogate saved: 0x{:04X}", + codepoint + ); Ok((None, Some(codepoint))) } else if Self::is_low_surrogate(codepoint) { // Error: low surrogate without preceding high surrogate @@ -184,7 +187,11 @@ impl EscapeProcessor { // Regular Unicode character let ch = char::from_u32(codepoint).ok_or(ParseError::InvalidUnicodeCodepoint)?; let utf8_str = ch.encode_utf8(utf8_buffer); - log::debug!("NEW process_unicode_escape: Regular Unicode character: 0x{:04X} -> {:?}", codepoint, utf8_str.as_bytes()); + log::debug!( + "NEW process_unicode_escape: Regular Unicode character: 0x{:04X} -> {:?}", + codepoint, + utf8_str.as_bytes() + ); Ok((Some(utf8_str.as_bytes()), None)) } } @@ -679,8 +686,8 @@ where // Extract the 4 hex digits from the buffer using the provider let hex_slice = hex_slice_provider(hex_start, hex_end)?; - - log::debug!("NEW process_unicode_escape_sequence: current_pos={}, hex_start={}, hex_end={}, escape_start_pos={}, hex_slice={:?}", + + log::debug!("NEW process_unicode_escape_sequence: current_pos={}, hex_start={}, hex_end={}, escape_start_pos={}, hex_slice={:?}", current_pos, hex_start, hex_end, escape_start_pos, hex_slice); if hex_slice.len() != 4 { diff --git a/picojson/src/event_processor.rs b/picojson/src/event_processor.rs index 9fb445c..9a02886 100644 --- a/picojson/src/event_processor.rs +++ b/picojson/src/event_processor.rs @@ -30,7 +30,7 @@ pub trait EscapeHandler { fn append_literal_byte(&mut self, _byte: u8) -> Result<(), crate::ParseError> { Ok(()) } - + /// Begin unicode escape sequence processing /// Default implementation is no-op - suitable for parsers that don't need special handling fn begin_unicode_escape(&mut self) -> Result<(), crate::ParseError> { @@ -283,7 +283,9 @@ pub fn process_unicode_escape_events( content_extractor.begin_unicode_escape()?; } _ => { - log::debug!("NEW event_processor: Ignoring unicode escape outside string/key context"); + log::debug!( + "NEW event_processor: Ignoring unicode escape outside string/key context" + ); } // Ignore if not in string/key context } Ok(true) // Event was handled diff --git a/picojson/src/slice_content_builder.rs b/picojson/src/slice_content_builder.rs index 17b4720..e4691ae 100644 --- a/picojson/src/slice_content_builder.rs +++ b/picojson/src/slice_content_builder.rs @@ -57,8 +57,11 @@ impl<'a, 'b> ContentBuilder for SliceContentBuilder<'a, 'b> { fn handle_simple_escape(&mut self, escape_char: u8) -> Result<(), ParseError> { // Clear the escape sequence flag when simple escape completes self.in_escape_sequence = false; - log::debug!("[SLICE] handle_simple_escape: Clearing in_escape_sequence flag, escape_char={:02x}", escape_char); - + log::debug!( + "[SLICE] handle_simple_escape: Clearing in_escape_sequence flag, escape_char={:02x}", + escape_char + ); + self.copy_on_escape .handle_escape(self.buffer.current_pos(), escape_char)?; Ok(()) @@ -130,12 +133,13 @@ impl<'a, 'b> ContentBuilder for SliceContentBuilder<'a, 'b> { let at_document_end = self.buffer.is_empty(); log::debug!("[NEW] SliceContentBuilder extract_number: start_pos={}, from_container_end={}, finished={} (ignored), buffer_empty={}, at_document_end={}", start_pos, from_container_end, finished, self.buffer.is_empty(), at_document_end); - crate::number_parser::parse_number_with_delimiter_logic( - &self.buffer, - start_pos, - from_container_end, - at_document_end, - ) + + // SliceParser-specific fix: at document end, current_pos points past the input, + // so we need to adjust the logic to always exclude the delimiter position + let current_pos = self.buffer.current_position(); + let end_pos = current_pos.saturating_sub(1); + log::debug!("[NEW] SliceContentBuilder using SliceParser-specific end_pos: current_pos={}, end_pos={}", current_pos, end_pos); + crate::number_parser::parse_number_event(&self.buffer, start_pos, end_pos) } fn current_position(&self) -> usize { @@ -205,7 +209,9 @@ impl<'a, 'b> EscapeHandler for SliceContentBuilder<'a, 'b> { fn process_unicode_escape_with_collector(&mut self) -> Result<(), ParseError> { // Clear the escape sequence flag when unicode escape completes self.in_escape_sequence = false; - log::debug!("[SLICE] process_unicode_escape_with_collector: Clearing in_escape_sequence flag"); + log::debug!( + "[SLICE] process_unicode_escape_with_collector: Clearing in_escape_sequence flag" + ); let current_pos = self.buffer.current_pos(); let hex_slice_provider = |start, end| self.buffer.slice(start, end).map_err(Into::into); @@ -220,7 +226,7 @@ impl<'a, 'b> EscapeHandler for SliceContentBuilder<'a, 'b> { hex_slice_provider, &mut utf8_buf, )?; - // Fix: escape_start_pos should point to the backslash, not the 'u' + // Fix: escape_start_pos should point to the backslash, not the 'u' // But don't subtract if it's already pointing to the backslash let actual_escape_start_pos = escape_start_pos; log::debug!("SLICE CONTENT BUILDER process_unicode_escape_with_collector_impl: using actual_escape_start_pos={} (was {})", actual_escape_start_pos, escape_start_pos); @@ -249,7 +255,7 @@ impl<'a, 'b> EscapeHandler for SliceContentBuilder<'a, 'b> { // Clear the escape sequence flag when simple escape completes self.in_escape_sequence = false; log::debug!("[SLICE] handle_simple_escape_char: Clearing in_escape_sequence flag, escape_char={:02x}", escape_char); - + self.copy_on_escape .handle_escape(self.buffer.current_pos(), escape_char)?; Ok(()) diff --git a/picojson/src/slice_parser.rs b/picojson/src/slice_parser.rs index 033857e..3340ace 100644 --- a/picojson/src/slice_parser.rs +++ b/picojson/src/slice_parser.rs @@ -592,15 +592,20 @@ mod tests { println!("=== SLICE PARSER DEBUG ==="); println!("Input: {}", input); println!("Expected in string: 'HelloAWorld'"); - + // Print byte positions for the string part let string_start = input.find("Hello").unwrap(); let string_part = &input[string_start..]; println!("String part: '{}'", string_part); for (i, &byte) in string_part.as_bytes().iter().enumerate() { - println!(" pos {}: '{}' ({:02x})", string_start + i, byte as char, byte); + println!( + " pos {}: '{}' ({:02x})", + string_start + i, + byte as char, + byte + ); } - + let mut scratch = [0u8; 1024]; let mut parser = SliceParser::with_buffer(input, &mut scratch); diff --git a/picojson/src/stream_content_builder.rs b/picojson/src/stream_content_builder.rs index 8372ce9..eec8256 100644 --- a/picojson/src/stream_content_builder.rs +++ b/picojson/src/stream_content_builder.rs @@ -141,15 +141,21 @@ impl<'b> ContentBuilder for StreamContentBuilder<'b> { fn handle_simple_escape(&mut self, escape_char: u8) -> Result<(), ParseError> { // Clear the escape sequence flag when simple escape completes self.in_escape_sequence = false; - log::debug!("[NEW] handle_simple_escape: Clearing in_escape_sequence flag, escape_char={:02x}", escape_char); - + log::debug!( + "[NEW] handle_simple_escape: Clearing in_escape_sequence flag, escape_char={:02x}", + escape_char + ); + self.stream_buffer .append_unescaped_byte(escape_char) .map_err(ParseError::from) } fn handle_unicode_escape(&mut self, utf8_bytes: &[u8]) -> Result<(), ParseError> { - log::debug!("STREAM CONTENT BUILDER handle_unicode_escape called with utf8_bytes={:?}", utf8_bytes); + log::debug!( + "STREAM CONTENT BUILDER handle_unicode_escape called with utf8_bytes={:?}", + utf8_bytes + ); // StreamParser handles all escape sequences the same way - append bytes to escape buffer for &byte in utf8_bytes { self.stream_buffer @@ -164,14 +170,14 @@ impl<'b> ContentBuilder for StreamContentBuilder<'b> { let in_string_mode = matches!(self.parser_state, State::String(_) | State::Key(_)); if in_string_mode { - log::debug!("[NEW] append_literal_byte: byte={:02x} ('{}'), in_escape_sequence={}, in_unicode_escape={}, has_unescaped={}, pos={}", - byte, byte as char, self.in_escape_sequence, self.in_unicode_escape, + log::debug!("[NEW] append_literal_byte: byte={:02x} ('{}'), in_escape_sequence={}, in_unicode_escape={}, has_unescaped={}, pos={}", + byte, byte as char, self.in_escape_sequence, self.in_unicode_escape, self.stream_buffer.has_unescaped_content(), self.stream_buffer.current_position()); - + // CRITICAL FIX: Follow old implementation pattern - do NOT write to escape buffer // when inside ANY escape sequence (in_escape_sequence == true) // This prevents hex digits from being accumulated as literal text - if !self.in_escape_sequence + if !self.in_escape_sequence && self.stream_buffer.has_unescaped_content() && !self.unicode_escape_collector.has_pending_high_surrogate() { @@ -180,8 +186,8 @@ impl<'b> ContentBuilder for StreamContentBuilder<'b> { .append_unescaped_byte(byte) .map_err(ParseError::from)?; } else { - log::debug!("[NEW] Skipping literal byte (in_escape_sequence={}, in_unicode_escape={}, has_unescaped={}, pending_surrogate={})", - self.in_escape_sequence, self.in_unicode_escape, + log::debug!("[NEW] Skipping literal byte (in_escape_sequence={}, in_unicode_escape={}, has_unescaped={}, pending_surrogate={})", + self.in_escape_sequence, self.in_unicode_escape, self.stream_buffer.has_unescaped_content(), self.unicode_escape_collector.has_pending_high_surrogate()); } @@ -193,7 +199,10 @@ impl<'b> ContentBuilder for StreamContentBuilder<'b> { fn begin_escape_sequence(&mut self) -> Result<(), ParseError> { log::debug!("[NEW] begin_escape_sequence() called"); self.in_escape_sequence = true; - log::debug!("[NEW] begin_escape_sequence: Setting in_escape_sequence=true at pos {}", self.stream_buffer.current_position()); + log::debug!( + "[NEW] begin_escape_sequence: Setting in_escape_sequence=true at pos {}", + self.stream_buffer.current_position() + ); self.start_escape_processing() } @@ -307,7 +316,7 @@ impl<'b> EscapeHandler for StreamContentBuilder<'b> { fn parser_state(&self) -> &State { &self.parser_state } - + fn begin_unicode_escape(&mut self) -> Result<(), ParseError> { // Called when Begin(UnicodeEscape) is received self.in_unicode_escape = true; @@ -320,8 +329,11 @@ impl<'b> EscapeHandler for StreamContentBuilder<'b> { // Reset the escape flags self.in_unicode_escape = false; self.in_escape_sequence = false; - log::debug!("[NEW] process_unicode_escape_with_collector: Clearing escape flags at pos {}", self.stream_buffer.current_position()); - + log::debug!( + "[NEW] process_unicode_escape_with_collector: Clearing escape flags at pos {}", + self.stream_buffer.current_position() + ); + // Shared Unicode escape processing pattern - collect UTF-8 bytes first to avoid borrow conflicts let utf8_bytes_result = { let current_pos = self.stream_buffer.current_position(); @@ -339,7 +351,7 @@ impl<'b> EscapeHandler for StreamContentBuilder<'b> { hex_slice_provider, &mut utf8_buf, )?; - log::debug!("STREAM CONTENT BUILDER process_unicode_escape_with_collector: current_pos={}, escape_start_pos={}, utf8_bytes_opt={:?}", + log::debug!("STREAM CONTENT BUILDER process_unicode_escape_with_collector: current_pos={}, escape_start_pos={}, utf8_bytes_opt={:?}", current_pos, escape_start_pos, utf8_bytes_opt); // Copy UTF-8 bytes to avoid borrow conflicts @@ -372,8 +384,11 @@ impl<'b> EscapeHandler for StreamContentBuilder<'b> { fn handle_simple_escape_char(&mut self, escape_char: u8) -> Result<(), ParseError> { // Clear the escape sequence flag when simple escape completes self.in_escape_sequence = false; - log::debug!("[NEW] handle_simple_escape_char: Clearing in_escape_sequence flag, escape_char={:02x}", escape_char); - + log::debug!( + "[NEW] handle_simple_escape_char: Clearing in_escape_sequence flag, escape_char={:02x}", + escape_char + ); + self.stream_buffer .append_unescaped_byte(escape_char) .map_err(ParseError::from) @@ -387,7 +402,7 @@ impl<'b> EscapeHandler for StreamContentBuilder<'b> { // CRITICAL FIX: Follow old implementation pattern - do NOT write to escape buffer // when inside ANY escape sequence (in_escape_sequence == true) // This prevents hex digits from being accumulated as literal text - if !self.in_escape_sequence + if !self.in_escape_sequence && self.stream_buffer.has_unescaped_content() && !self.unicode_escape_collector.has_pending_high_surrogate() { @@ -403,8 +418,11 @@ impl<'b> EscapeHandler for StreamContentBuilder<'b> { fn begin_escape_sequence(&mut self) -> Result<(), ParseError> { log::debug!("[NEW] EscapeHandler::begin_escape_sequence() called"); self.in_escape_sequence = true; - log::debug!("[NEW] EscapeHandler::begin_escape_sequence: Setting in_escape_sequence=true at pos {}", self.stream_buffer.current_position()); - // Delegate to ContentBuilder implementation + log::debug!( + "[NEW] EscapeHandler::begin_escape_sequence: Setting in_escape_sequence=true at pos {}", + self.stream_buffer.current_position() + ); + // Delegate to ContentBuilder implementation ContentBuilder::begin_escape_sequence(self) } } diff --git a/picojson/tests/debug_root_numbers.rs b/picojson/tests/debug_root_numbers.rs index fa394b4..e19feed 100644 --- a/picojson/tests/debug_root_numbers.rs +++ b/picojson/tests/debug_root_numbers.rs @@ -1,5 +1,5 @@ // Debug root-level number parsing issue -use picojson::{Event, PullParser, SliceParser}; +use picojson::{ChunkReader, Event, PullParser, SliceParser, StreamParser}; fn test_json(input: &str, description: &str) { println!("\n=== Testing: {} ===", description); @@ -46,3 +46,95 @@ fn debug_root_level_numbers() { test_json("[42]", "Small number in array"); test_json("[9999999999]", "Large number in array"); } + +#[test] +fn debug_lonely_int() { + println!("=== DEBUGGING LONELY INT: '42' ==="); + + // Test with SliceParser with debug logging + println!("\n--- SliceParser with RUST_LOG=debug ---"); + let input = "42"; + let mut scratch = [0u8; 1024]; + let mut parser = SliceParser::with_buffer(input, &mut scratch); + + match parser.next_event() { + Ok(event) => { + println!("SliceParser event: {:?}", event); + if let Event::Number(num) = &event { + println!("Number str: '{}'", num.as_str()); + println!("Number as_int: {:?}", num.as_int()); + } + } + Err(e) => { + println!("SliceParser error: {:?}", e); + } + } + + // Test with StreamParser + println!("\n--- StreamParser ---"); + let input = "42"; + let mut scratch = [0u8; 1024]; + let reader = ChunkReader::new(input.as_bytes(), 1024); + let mut parser = StreamParser::new(reader, &mut scratch); + + match parser.next_event() { + Ok(event) => { + println!("StreamParser event: {:?}", event); + if let Event::Number(num) = &event { + println!("Number str: '{}'", num.as_str()); + println!("Number as_int: {:?}", num.as_int()); + } + } + Err(e) => { + println!("StreamParser error: {:?}", e); + } + } +} + +#[test] +fn debug_lonely_negative_real() { + println!("=== DEBUGGING LONELY NEGATIVE REAL: '-0.1' ==="); + + // Test with SliceParser + println!("\n--- SliceParser ---"); + let input = "-0.1"; + let mut scratch = [0u8; 1024]; + let mut parser = SliceParser::with_buffer(input, &mut scratch); + + match parser.next_event() { + Ok(event) => { + println!("SliceParser event: {:?}", event); + if let Event::Number(num) = &event { + println!("Number str: '{}'", num.as_str()); + println!("Number as_int: {:?}", num.as_int()); + #[cfg(feature = "float")] + println!("Number as_f64: {:?}", num.as_f64()); + } + } + Err(e) => { + println!("SliceParser error: {:?}", e); + } + } + + // Test with StreamParser + println!("\n--- StreamParser ---"); + let input = "-0.1"; + let mut scratch = [0u8; 1024]; + let reader = ChunkReader::new(input.as_bytes(), 1024); + let mut parser = StreamParser::new(reader, &mut scratch); + + match parser.next_event() { + Ok(event) => { + println!("StreamParser event: {:?}", event); + if let Event::Number(num) = &event { + println!("Number str: '{}'", num.as_str()); + println!("Number as_int: {:?}", num.as_int()); + #[cfg(feature = "float")] + println!("Number as_f64: {:?}", num.as_f64()); + } + } + Err(e) => { + println!("StreamParser error: {:?}", e); + } + } +} diff --git a/picojson/tests/stream_parser_stress_test.rs b/picojson/tests/stream_parser_stress_test.rs index 6c85377..d0586b0 100644 --- a/picojson/tests/stream_parser_stress_test.rs +++ b/picojson/tests/stream_parser_stress_test.rs @@ -219,10 +219,10 @@ fn debug_unicode_issue() { let json = br#"["a\nb\t\"\\c\u1234d"]"#; let reader = ChunkReader::new(json, 5); let mut buffer = [0u8; 30]; - + let mut parser = StreamParser::new(reader, &mut buffer); let mut event_count = 0; - + loop { let event = match parser.next_event() { Ok(event) => event, @@ -231,9 +231,9 @@ fn debug_unicode_issue() { break; } }; - + println!("Event[{}]: {:?}", event_count, event); - + // Check string content when we encounter it if let Event::String(s) = &event { match s { @@ -245,15 +245,15 @@ fn debug_unicode_issue() { for (i, ch) in content.char_indices() { println!(" [{}] '{}' = U+{:04X}", i, ch, ch as u32); } - }, + } picojson::String::Borrowed(content) => { println!("Borrowed String: '{}'", content); } } } - + event_count += 1; - + match event { Event::EndDocument => break, _ => {} From caf2d88bdf89da162587a4ff80cafcbbf44a83d4 Mon Sep 17 00:00:00 2001 From: Kaido Kert Date: Mon, 14 Jul 2025 20:07:33 -0700 Subject: [PATCH 04/10] Bigger refactor --- picojson/build.rs | 9 +- picojson/src/chunk_reader.rs | 2 +- picojson/src/content_builder.rs | 1 + picojson/src/event_processor.rs | 16 +++- picojson/src/json_number.rs | 4 +- picojson/src/number_parser.rs | 4 +- picojson/src/parse_error.rs | 6 +- picojson/src/parser_core.rs | 71 +++++++-------- picojson/src/shared.rs | 2 - picojson/src/slice_content_builder.rs | 72 ++++++++++----- picojson/src/slice_parser.rs | 117 ++---------------------- picojson/src/stream_content_builder.rs | 120 +++++++++++++------------ picojson/src/stream_parser.rs | 14 +-- 13 files changed, 187 insertions(+), 251 deletions(-) diff --git a/picojson/build.rs b/picojson/build.rs index 03506da..20bfee3 100644 --- a/picojson/build.rs +++ b/picojson/build.rs @@ -79,8 +79,9 @@ fn generate_conformance_tests() -> Result<(), Box> { let generated_code = format!( r#"// Generated by build.rs - DO NOT EDIT +#![allow(non_snake_case)] // Keep test names discoverable and matching original JSON test suite + #[cfg(feature = "remote-tests")] -#[allow(clippy::non_snake_case)] // Keep test names discoverable and matching original JSON test suite mod conformance_generated {{ use picojson::{{Event, ParseError, PullParser, SliceParser}}; @@ -129,6 +130,7 @@ mod conformance_generated {{ Ok(()) } +#[allow(dead_code)] fn sanitize_test_name( filename: &str, test_name_counts: &mut std::collections::HashMap, @@ -156,23 +158,26 @@ fn sanitize_test_name( *count += 1; if *count > 1 { - format!("{}_{}", result, count) + format!("{result}_{count}") } else { result } } // Configuration from Cargo.toml metadata +#[allow(dead_code)] fn get_jsontest_suite_url() -> String { std::env::var("CARGO_PKG_METADATA_CONFORMANCE_TESTS_JSONTEST_SUITE_URL") .unwrap_or_else(|_| "https://github.com/nst/JSONTestSuite/archive/{commit}.zip".to_string()) } +#[allow(dead_code)] fn get_jsontest_suite_commit() -> String { std::env::var("CARGO_PKG_METADATA_CONFORMANCE_TESTS_JSONTEST_SUITE_COMMIT") .unwrap_or_else(|_| "1ef36fa01286573e846ac449e8683f8833c5b26a".to_string()) } +#[allow(dead_code)] fn get_json_checker_url() -> String { std::env::var("CARGO_PKG_METADATA_CONFORMANCE_TESTS_JSON_CHECKER_URL") .unwrap_or_else(|_| "https://www.json.org/JSON_checker/test.zip".to_string()) diff --git a/picojson/src/chunk_reader.rs b/picojson/src/chunk_reader.rs index a72892b..d09200e 100644 --- a/picojson/src/chunk_reader.rs +++ b/picojson/src/chunk_reader.rs @@ -119,7 +119,7 @@ impl<'a> ChunkReader<'a> { } } -impl<'a> Reader for ChunkReader<'a> { +impl Reader for ChunkReader<'_> { type Error = (); fn read(&mut self, buf: &mut [u8]) -> Result { diff --git a/picojson/src/content_builder.rs b/picojson/src/content_builder.rs index 4be88c4..4698732 100644 --- a/picojson/src/content_builder.rs +++ b/picojson/src/content_builder.rs @@ -14,6 +14,7 @@ use crate::{Event, ParseError, String}; /// This trait combines the functionality of `ContentExtractor` and `EscapeHandler` into a /// unified interface that can be implemented by different parser backends while sharing /// the core event processing logic. +#[allow(dead_code)] // Methods are part of trait interface design pub trait ContentBuilder: ContentExtractor + EscapeHandler { /// Begin processing a new string or key at the given position /// diff --git a/picojson/src/event_processor.rs b/picojson/src/event_processor.rs index 85a2a87..fe1a23b 100644 --- a/picojson/src/event_processor.rs +++ b/picojson/src/event_processor.rs @@ -9,6 +9,7 @@ use crate::ujson::EventToken; use crate::{Event, ParseError}; /// Escape handling trait for abstracting escape sequence processing between parsers +#[allow(dead_code)] // Methods are part of trait interface design pub trait EscapeHandler { /// Get the current parser state for escape context checking fn parser_state(&self) -> &crate::shared::State; @@ -30,6 +31,12 @@ pub trait EscapeHandler { fn append_literal_byte(&mut self, _byte: u8) -> Result<(), crate::ParseError> { Ok(()) } + + /// Begin unicode escape sequence processing + /// Default implementation is no-op - suitable for parsers that don't need special handling + fn begin_unicode_escape(&mut self) -> Result<(), crate::ParseError> { + Ok(()) + } } /// Result of processing a tokenizer event @@ -65,7 +72,7 @@ pub fn process_begin_events( } crate::ujson::Event::Begin(EventToken::String) => { let pos = content_extractor.current_position(); - log::debug!("[NEW] Begin(String): storing pos={} in State::String", pos); + log::debug!("[NEW] Begin(String): storing pos={pos} in State::String"); *content_extractor.parser_state_mut() = State::String(pos); content_extractor.begin_string_content(pos); Some(EventResult::Continue) @@ -190,9 +197,9 @@ pub trait ContentExtractor: EscapeHandler { /// /// This callback stores tokenizer events in the parser's event array, filling the first /// available slot. This pattern is identical across both SliceParser and StreamParser. -pub fn create_tokenizer_callback<'a>( - event_storage: &'a mut [Option; 2], -) -> impl FnMut(crate::ujson::Event, usize) + 'a { +pub fn create_tokenizer_callback( + event_storage: &mut [Option; 2], +) -> impl FnMut(crate::ujson::Event, usize) + '_ { |event, _len| { for evt in event_storage.iter_mut() { if evt.is_none() { @@ -272,6 +279,7 @@ pub fn process_unicode_escape_events( match content_extractor.parser_state() { crate::shared::State::String(_) | crate::shared::State::Key(_) => { content_extractor.unicode_escape_collector_mut().reset(); + content_extractor.begin_unicode_escape()?; } _ => {} // Ignore if not in string/key context } diff --git a/picojson/src/json_number.rs b/picojson/src/json_number.rs index dcd024d..152b2ab 100644 --- a/picojson/src/json_number.rs +++ b/picojson/src/json_number.rs @@ -143,9 +143,9 @@ impl core::fmt::Display for JsonNumber<'_, '_> { JsonNumber::Copied { raw, parsed } => (raw, parsed), }; match parsed { - NumberResult::Integer(val) => write!(f, "{}", val), + NumberResult::Integer(val) => write!(f, "{val}"), #[cfg(feature = "float")] - NumberResult::Float(val) => write!(f, "{}", val), + NumberResult::Float(val) => write!(f, "{val}"), #[cfg(all(not(feature = "float"), feature = "float-truncate"))] NumberResult::FloatTruncated(val) => write!(f, "{}", val), // For overflow, disabled, or skipped cases, show the exact raw string diff --git a/picojson/src/number_parser.rs b/picojson/src/number_parser.rs index daafa0f..83637f7 100644 --- a/picojson/src/number_parser.rs +++ b/picojson/src/number_parser.rs @@ -51,8 +51,7 @@ pub fn parse_number_with_delimiter_logic( let use_full_span = !from_container_end && at_document_end; let end_pos = crate::shared::ContentRange::number_end_position(current_pos, use_full_span); - log::debug!("[NEW] parse_number_with_delimiter_logic: start_pos={}, current_pos={}, from_container_end={}, at_document_end={}, use_full_span={}, end_pos={}", - start_pos, current_pos, from_container_end, at_document_end, use_full_span, end_pos); + log::debug!("[NEW] parse_number_with_delimiter_logic: start_pos={start_pos}, current_pos={current_pos}, from_container_end={from_container_end}, at_document_end={at_document_end}, use_full_span={use_full_span}, end_pos={end_pos}"); parse_number_event(extractor, start_pos, end_pos) } @@ -64,6 +63,7 @@ pub fn parse_number_with_delimiter_logic( /// 2. Convert to UTF-8 string /// 3. Parse using shared number parsing logic /// 4. Create JsonNumber::Borrowed event +/// /// All position logic is handled by the calling parser. pub fn parse_number_event( extractor: &T, diff --git a/picojson/src/parse_error.rs b/picojson/src/parse_error.rs index 0a004c9..15801e5 100644 --- a/picojson/src/parse_error.rs +++ b/picojson/src/parse_error.rs @@ -76,9 +76,9 @@ impl From for ParseError { impl core::fmt::Display for ParseError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self { - ParseError::TokenizerError(e) => write!(f, "{}", e), - ParseError::InvalidUtf8(e) => write!(f, "Invalid UTF-8: {}", e), - _ => write!(f, "{:?}", self), + ParseError::TokenizerError(e) => write!(f, "{e}"), + ParseError::InvalidUtf8(e) => write!(f, "Invalid UTF-8: {e}"), + _ => write!(f, "{self:?}"), } } } diff --git a/picojson/src/parser_core.rs b/picojson/src/parser_core.rs index 2149579..7b3f914 100644 --- a/picojson/src/parser_core.rs +++ b/picojson/src/parser_core.rs @@ -16,6 +16,10 @@ use crate::shared::{ByteProvider, Event, ParserState, UnexpectedState}; use crate::ujson::{EventToken, Tokenizer}; use crate::{ujson, ParseError}; +/// Combined trait for parsers that provide both byte access and content building +pub trait ParserProvider: ByteProvider + ContentBuilder {} +impl ParserProvider for T {} + /// The core parser logic that handles the unified event processing loop. /// /// This struct contains all the shared state and logic that was previously @@ -37,25 +41,30 @@ impl ParserCore { } } - /// The unified event processing loop that was previously duplicated - /// between SliceParser and StreamParser. - /// - /// This method implements the common logic while delegating to traits - /// for the parser-specific differences. - pub fn next_event_impl<'a, B, CB>( + /// Unified implementation that works with a single combined provider. + /// This avoids borrowing conflicts by using a single object that implements both traits. + pub fn next_event_impl_unified<'a, P>( &mut self, - byte_provider: &mut B, - content_builder: &'a mut CB, + provider: &'a mut P, escape_timing: EscapeTiming, ) -> Result, ParseError> where - B: ByteProvider, - CB: ContentBuilder, + P: ParserProvider, { loop { while !have_events(&self.parser_state.evts) { - if !self.pull_tokenizer_events(byte_provider)? { - return Ok(Event::EndDocument); + if let Some(byte) = provider.next_byte()? { + process_byte_through_tokenizer( + byte, + &mut self.tokenizer, + &mut self.parser_state.evts, + )?; + } else { + finish_tokenizer(&mut self.tokenizer, &mut self.parser_state.evts)?; + + if !have_events(&self.parser_state.evts) { + return Ok(Event::EndDocument); + } } } @@ -65,17 +74,15 @@ impl ParserCore { }; // Try shared event processors first - if let Some(result) = process_simple_events(&taken) - .or_else(|| process_begin_events(&taken, content_builder)) + if let Some(result) = + process_simple_events(&taken).or_else(|| process_begin_events(&taken, provider)) { match result { EventResult::Complete(event) => return Ok(event), - EventResult::ExtractString => { - return content_builder.validate_and_extract_string() - } - EventResult::ExtractKey => return content_builder.validate_and_extract_key(), + EventResult::ExtractString => return provider.validate_and_extract_string(), + EventResult::ExtractKey => return provider.validate_and_extract_key(), EventResult::ExtractNumber(from_container_end) => { - return content_builder.validate_and_extract_number(from_container_end) + return provider.validate_and_extract_number(from_container_end) } EventResult::Continue => continue, } @@ -84,9 +91,9 @@ impl ParserCore { // Handle parser-specific events based on escape timing match taken { ujson::Event::Begin(EventToken::EscapeSequence) => { - process_begin_escape_sequence_event(content_builder)?; + process_begin_escape_sequence_event(provider)?; } - _ if process_unicode_escape_events(&taken, content_builder)? => { + _ if process_unicode_escape_events(&taken, provider)? => { // Unicode escape events handled by shared function } ujson::Event::Begin( @@ -102,7 +109,7 @@ impl ParserCore { // SliceParser-specific: Handle simple escape sequences on Begin events // because CopyOnEscape requires starting unescaping immediately when // the escape token begins to maintain zero-copy optimization - process_simple_escape_event(&escape_token, content_builder)?; + process_simple_escape_event(&escape_token, provider)?; } ujson::Event::End( escape_token @ (EventToken::EscapeQuote @@ -117,7 +124,7 @@ impl ParserCore { // StreamParser-specific: Handle simple escape sequences on End events // because StreamBuffer must wait until the token ends to accumulate // all bytes before processing the complete escape sequence - process_simple_escape_event(&escape_token, content_builder)?; + process_simple_escape_event(&escape_token, provider)?; } _ => { // All other events continue to next iteration @@ -125,24 +132,6 @@ impl ParserCore { } } } - - /// Pull events from tokenizer and return whether parsing should continue. - /// This implements the common tokenizer event pulling logic. - fn pull_tokenizer_events( - &mut self, - byte_provider: &mut B, - ) -> Result { - if let Some(byte) = byte_provider.next_byte()? { - process_byte_through_tokenizer(byte, &mut self.tokenizer, &mut self.parser_state.evts)?; - } else { - finish_tokenizer(&mut self.tokenizer, &mut self.parser_state.evts)?; - - if !have_events(&self.parser_state.evts) { - return Ok(false); // Signal end of parsing - } - } - Ok(true) // Continue parsing - } } /// Enum to specify when escape sequences should be processed diff --git a/picojson/src/shared.rs b/picojson/src/shared.rs index 58befe2..74de2d1 100644 --- a/picojson/src/shared.rs +++ b/picojson/src/shared.rs @@ -55,14 +55,12 @@ pub enum State { /// Parser state and event storage pub(super) struct ParserState { - pub state: State, pub evts: [Option; 2], } impl ParserState { pub fn new() -> Self { Self { - state: State::None, evts: core::array::from_fn(|_| None), } } diff --git a/picojson/src/slice_content_builder.rs b/picojson/src/slice_content_builder.rs index da60145..ac3033a 100644 --- a/picojson/src/slice_content_builder.rs +++ b/picojson/src/slice_content_builder.rs @@ -21,6 +21,8 @@ pub struct SliceContentBuilder<'a, 'b> { parser_state: State, /// Unicode escape collector for \uXXXX sequences unicode_escape_collector: UnicodeEscapeCollector, + /// Flag to track when we're inside ANY escape sequence (like stream implementation) + in_escape_sequence: bool, } impl<'a, 'b> SliceContentBuilder<'a, 'b> { @@ -31,6 +33,7 @@ impl<'a, 'b> SliceContentBuilder<'a, 'b> { copy_on_escape: CopyOnEscape::new(input, scratch_buffer), parser_state: State::None, unicode_escape_collector: UnicodeEscapeCollector::new(), + in_escape_sequence: false, } } @@ -45,13 +48,15 @@ impl<'a, 'b> SliceContentBuilder<'a, 'b> { } } -impl<'a, 'b> ContentBuilder for SliceContentBuilder<'a, 'b> { +impl ContentBuilder for SliceContentBuilder<'_, '_> { fn begin_content(&mut self, pos: usize, _is_key: bool) { // SliceParser uses CopyOnEscape for content management self.copy_on_escape.begin_string(pos); } fn handle_simple_escape(&mut self, escape_char: u8) -> Result<(), ParseError> { + // Clear the escape sequence flag when simple escape completes + self.in_escape_sequence = false; self.copy_on_escape .handle_escape(self.buffer.current_pos(), escape_char)?; Ok(()) @@ -62,6 +67,8 @@ impl<'a, 'b> ContentBuilder for SliceContentBuilder<'a, 'b> { // The position calculation matches the existing SliceParser logic let current_pos = self.buffer.current_pos(); let (_, _, escape_start_pos) = ContentRange::unicode_escape_bounds(current_pos); + // Fix: escape_start_pos should point to the backslash, not the 'u' + let actual_escape_start_pos = escape_start_pos.saturating_sub(1); // Check if this is completing a surrogate pair let had_pending_high_surrogate = self.unicode_escape_collector.has_pending_high_surrogate(); @@ -70,14 +77,14 @@ impl<'a, 'b> ContentBuilder for SliceContentBuilder<'a, 'b> { // This is completing a surrogate pair - need to consume both escapes // First call: consume the high surrogate (6 bytes earlier) self.copy_on_escape - .handle_unicode_escape(escape_start_pos, &[])?; + .handle_unicode_escape(actual_escape_start_pos, &[])?; // Second call: consume the low surrogate and write UTF-8 self.copy_on_escape - .handle_unicode_escape(escape_start_pos + 6, utf8_bytes)?; + .handle_unicode_escape(actual_escape_start_pos + 6, utf8_bytes)?; } else { // Single Unicode escape - normal processing self.copy_on_escape - .handle_unicode_escape(escape_start_pos, utf8_bytes)?; + .handle_unicode_escape(actual_escape_start_pos, utf8_bytes)?; } Ok(()) @@ -90,7 +97,8 @@ impl<'a, 'b> ContentBuilder for SliceContentBuilder<'a, 'b> { } fn begin_escape_sequence(&mut self) -> Result<(), ParseError> { - // SliceParser doesn't need special escape sequence setup + // Set escape flag to prevent literal byte accumulation during escape processing + self.in_escape_sequence = true; Ok(()) } @@ -119,12 +127,11 @@ impl<'a, 'b> ContentBuilder for SliceContentBuilder<'a, 'b> { let at_document_end = self.buffer.is_empty(); log::debug!("[NEW] SliceContentBuilder extract_number: start_pos={}, from_container_end={}, finished={} (ignored), buffer_empty={}, at_document_end={}", start_pos, from_container_end, finished, self.buffer.is_empty(), at_document_end); - crate::number_parser::parse_number_with_delimiter_logic( - &self.buffer, - start_pos, - from_container_end, - at_document_end, - ) + // SliceParser-specific fix: at document end, current_pos points past the input, + // so we need to adjust the logic to always exclude the delimiter position + let current_pos = self.buffer.current_position(); + let end_pos = current_pos.saturating_sub(1); + crate::number_parser::parse_number_event(&self.buffer, start_pos, end_pos) } fn current_position(&self) -> usize { @@ -136,7 +143,7 @@ impl<'a, 'b> ContentBuilder for SliceContentBuilder<'a, 'b> { } } -impl<'a, 'b> ContentExtractor for SliceContentBuilder<'a, 'b> { +impl ContentExtractor for SliceContentBuilder<'_, '_> { fn parser_state_mut(&mut self) -> &mut crate::shared::State { &mut self.parser_state } @@ -176,22 +183,38 @@ impl<'a, 'b> ContentExtractor for SliceContentBuilder<'a, 'b> { start_pos: usize, from_container_end: bool, ) -> Result, ParseError> { + // Use shared number parsing with SliceParser-specific document end detection + // SliceParser uses buffer-based detection: buffer empty indicates document end let at_document_end = self.buffer.is_empty(); - crate::number_parser::parse_number_with_delimiter_logic( - &self.buffer, - start_pos, - from_container_end, - at_document_end, - ) + log::debug!("[NEW] SliceContentBuilder extract_number_content: start_pos={}, from_container_end={}, buffer_empty={}, at_document_end={}", + start_pos, from_container_end, self.buffer.is_empty(), at_document_end); + // SliceParser-specific fix: at document end, current_pos points past the input, + // so we need to adjust the logic to always exclude the delimiter position + let current_pos = self.buffer.current_position(); + let end_pos = current_pos.saturating_sub(1); + crate::number_parser::parse_number_event(&self.buffer, start_pos, end_pos) + } +} + +impl crate::shared::ByteProvider for SliceContentBuilder<'_, '_> { + fn next_byte(&mut self) -> Result, crate::ParseError> { + use crate::slice_input_buffer::InputBuffer; + match self.buffer_mut().consume_byte() { + Ok(byte) => Ok(Some(byte)), + Err(crate::slice_input_buffer::Error::ReachedEnd) => Ok(None), + Err(err) => Err(err.into()), + } } } -impl<'a, 'b> EscapeHandler for SliceContentBuilder<'a, 'b> { +impl EscapeHandler for SliceContentBuilder<'_, '_> { fn parser_state(&self) -> &crate::shared::State { &self.parser_state } fn process_unicode_escape_with_collector(&mut self) -> Result<(), ParseError> { + // Clear the escape sequence flag when unicode escape completes + self.in_escape_sequence = false; let current_pos = self.buffer.current_pos(); let hex_slice_provider = |start, end| self.buffer.slice(start, end).map_err(Into::into); @@ -206,6 +229,9 @@ impl<'a, 'b> EscapeHandler for SliceContentBuilder<'a, 'b> { hex_slice_provider, &mut utf8_buf, )?; + // Fix: escape_start_pos should point to the backslash, not the 'u' + // But don't subtract if it's already pointing to the backslash + let actual_escape_start_pos = escape_start_pos; // Handle UTF-8 bytes if we have them (not a high surrogate waiting for low surrogate) if let Some(utf8_bytes) = utf8_bytes_opt { @@ -213,14 +239,14 @@ impl<'a, 'b> EscapeHandler for SliceContentBuilder<'a, 'b> { // This is completing a surrogate pair - need to consume both escapes // First call: consume the high surrogate (6 bytes earlier) self.copy_on_escape - .handle_unicode_escape(escape_start_pos, &[])?; + .handle_unicode_escape(actual_escape_start_pos, &[])?; // Second call: consume the low surrogate and write UTF-8 self.copy_on_escape - .handle_unicode_escape(escape_start_pos + 6, utf8_bytes)?; + .handle_unicode_escape(actual_escape_start_pos + 6, utf8_bytes)?; } else { // Single Unicode escape - normal processing self.copy_on_escape - .handle_unicode_escape(escape_start_pos, utf8_bytes)?; + .handle_unicode_escape(actual_escape_start_pos, utf8_bytes)?; } } @@ -228,6 +254,8 @@ impl<'a, 'b> EscapeHandler for SliceContentBuilder<'a, 'b> { } fn handle_simple_escape_char(&mut self, escape_char: u8) -> Result<(), ParseError> { + // Clear the escape sequence flag when simple escape completes + self.in_escape_sequence = false; self.copy_on_escape .handle_escape(self.buffer.current_pos(), escape_char)?; Ok(()) diff --git a/picojson/src/slice_parser.rs b/picojson/src/slice_parser.rs index 7bc29e3..89bcba1 100644 --- a/picojson/src/slice_parser.rs +++ b/picojson/src/slice_parser.rs @@ -1,9 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 -use crate::event_processor::{ContentExtractor, EscapeHandler}; use crate::parse_error::ParseError; use crate::parser_core::ParserCore; -use crate::shared::{ByteProvider, Event, PullParser}; +use crate::shared::{Event, PullParser}; use crate::slice_content_builder::SliceContentBuilder; use crate::slice_input_buffer::InputBuffer; use crate::ujson; @@ -146,115 +145,15 @@ impl<'a, 'b, C: BitStackConfig> SliceParser<'a, 'b, C> { /// Returns the next JSON event or an error if parsing fails. /// Parsing continues until `EndDocument` is returned or an error occurs. fn next_event_impl(&mut self) -> Result, ParseError> { - // We need to implement the unified loop locally to avoid borrowing conflicts - // This is essentially a copy of ParserCore::next_event_impl but accessing fields directly - loop { - while !crate::event_processor::have_events(&self.parser_core.parser_state.evts) { - if let Some(byte) = self.next_byte()? { - crate::event_processor::process_byte_through_tokenizer( - byte, - &mut self.parser_core.tokenizer, - &mut self.parser_core.parser_state.evts, - )?; - } else { - crate::event_processor::finish_tokenizer( - &mut self.parser_core.tokenizer, - &mut self.parser_core.parser_state.evts, - )?; - - if !crate::event_processor::have_events(&self.parser_core.parser_state.evts) { - return Ok(Event::EndDocument); - } - } - } - - let taken_event = - crate::event_processor::take_first_event(&mut self.parser_core.parser_state.evts); - let Some(taken) = taken_event else { - return Err(crate::shared::UnexpectedState::StateMismatch.into()); - }; - - // Try shared event processors first - if let Some(result) = - crate::event_processor::process_simple_events(&taken).or_else(|| { - crate::event_processor::process_begin_events(&taken, &mut self.content_builder) - }) - { - match result { - crate::event_processor::EventResult::Complete(event) => return Ok(event), - crate::event_processor::EventResult::ExtractString => { - return self.content_builder.validate_and_extract_string() - } - crate::event_processor::EventResult::ExtractKey => { - return self.content_builder.validate_and_extract_key() - } - crate::event_processor::EventResult::ExtractNumber(from_container_end) => { - return self.extract_number_with_finished_state(from_container_end) - } - crate::event_processor::EventResult::Continue => continue, - } - } - - // Handle parser-specific events (SliceParser uses OnBegin timing) - match taken { - ujson::Event::Begin(crate::ujson::EventToken::EscapeSequence) => { - crate::event_processor::process_begin_escape_sequence_event( - &mut self.content_builder, - )?; - } - _ if crate::event_processor::process_unicode_escape_events( - &taken, - &mut self.content_builder, - )? => - { - // Unicode escape events handled by shared function - } - ujson::Event::Begin( - escape_token @ (crate::ujson::EventToken::EscapeQuote - | crate::ujson::EventToken::EscapeBackslash - | crate::ujson::EventToken::EscapeSlash - | crate::ujson::EventToken::EscapeBackspace - | crate::ujson::EventToken::EscapeFormFeed - | crate::ujson::EventToken::EscapeNewline - | crate::ujson::EventToken::EscapeCarriageReturn - | crate::ujson::EventToken::EscapeTab), - ) => { - // SliceParser-specific: Handle simple escape sequences on Begin events - // because CopyOnEscape requires starting unescaping immediately when - // the escape token begins to maintain zero-copy optimization - crate::event_processor::process_simple_escape_event( - &escape_token, - &mut self.content_builder, - )?; - } - _ => { - // All other events continue to next iteration - } - } - } - } - - /// Extract number with proper SliceParser document end detection - fn extract_number_with_finished_state( - &mut self, - from_container_end: bool, - ) -> Result, ParseError> { - let start_pos = match *self.content_builder.parser_state() { - crate::shared::State::Number(pos) => pos, - _ => return Err(crate::shared::UnexpectedState::StateMismatch.into()), - }; - - *self.content_builder.parser_state_mut() = crate::shared::State::None; - - // SliceParser doesn't use the finished parameter - it uses buffer emptiness - // Pass finished=false since SliceParser handles document end differently - use crate::content_builder::ContentBuilder; - self.content_builder - .extract_number(start_pos, from_container_end, false) + // Use the unified ParserCore implementation with SliceParser-specific timing + self.parser_core.next_event_impl_unified( + &mut self.content_builder, + crate::parser_core::EscapeTiming::OnBegin, + ) } } -impl<'a, 'b, C: BitStackConfig> PullParser for SliceParser<'a, 'b, C> { +impl PullParser for SliceParser<'_, '_, C> { fn next_event(&mut self) -> Result, ParseError> { if self.content_builder.buffer().is_past_end() { return Ok(Event::EndDocument); @@ -263,7 +162,7 @@ impl<'a, 'b, C: BitStackConfig> PullParser for SliceParser<'a, 'b, C> { } } -impl<'a, 'b, C: BitStackConfig> crate::shared::ByteProvider for SliceParser<'a, 'b, C> { +impl crate::shared::ByteProvider for SliceParser<'_, '_, C> { fn next_byte(&mut self) -> Result, ParseError> { use crate::slice_input_buffer::InputBuffer; match self.content_builder.buffer_mut().consume_byte() { diff --git a/picojson/src/stream_content_builder.rs b/picojson/src/stream_content_builder.rs index 3b4ef7e..0f891ad 100644 --- a/picojson/src/stream_content_builder.rs +++ b/picojson/src/stream_content_builder.rs @@ -21,6 +21,10 @@ pub struct StreamContentBuilder<'b> { unescaped_reset_queued: bool, /// Flag to track when we're inside a Unicode escape sequence (collecting hex digits) in_unicode_escape: bool, + /// Flag to track when we're inside ANY escape sequence (like old implementation) + in_escape_sequence: bool, + /// Flag to track when the input stream has been finished (for number parsing) + finished: bool, } impl<'b> StreamContentBuilder<'b> { @@ -32,6 +36,8 @@ impl<'b> StreamContentBuilder<'b> { unicode_escape_collector: UnicodeEscapeCollector::new(), unescaped_reset_queued: false, in_unicode_escape: false, + in_escape_sequence: false, + finished: false, } } @@ -45,6 +51,11 @@ impl<'b> StreamContentBuilder<'b> { &mut self.stream_buffer } + /// Set the finished state (called by StreamParser when input is exhausted) + pub fn set_finished(&mut self, finished: bool) { + self.finished = finished; + } + /// Apply queued unescaped content reset if flag is set pub fn apply_unescaped_reset_if_queued(&mut self) { if self.unescaped_reset_queued { @@ -90,9 +101,7 @@ impl<'b> StreamContentBuilder<'b> { let current_pos = self.stream_buffer.current_position(); log::debug!( - "[NEW] start_escape_processing: start_pos={}, current_pos={}", - start_pos, - current_pos + "[NEW] start_escape_processing: start_pos={start_pos}, current_pos={current_pos}" ); // start_pos already points to content start position (not quote position) @@ -129,13 +138,16 @@ impl<'b> StreamContentBuilder<'b> { } } -impl<'b> ContentBuilder for StreamContentBuilder<'b> { +impl ContentBuilder for StreamContentBuilder<'_> { fn begin_content(&mut self, _pos: usize, _is_key: bool) { // StreamParser doesn't need explicit content begin processing // as it handles content accumulation automatically } fn handle_simple_escape(&mut self, escape_char: u8) -> Result<(), ParseError> { + // Clear the escape sequence flag when simple escape completes + self.in_escape_sequence = false; + self.stream_buffer .append_unescaped_byte(escape_char) .map_err(ParseError::from) @@ -156,46 +168,11 @@ impl<'b> ContentBuilder for StreamContentBuilder<'b> { let in_string_mode = matches!(self.parser_state, State::String(_) | State::Key(_)); if in_string_mode { - // CRITICAL FIX: If we're inside a Unicode escape sequence, route hex digits to the collector - if self.in_unicode_escape { - log::debug!( - "[NEW] Unicode escape hex digit: {:02x} ('{}')", - byte, - byte as char - ); - - // Try to add the hex digit to the collector - let is_complete = self.unicode_escape_collector.add_hex_digit(byte)?; - - if is_complete { - log::debug!("[NEW] Unicode escape sequence complete, processing to UTF-8"); - - // Process the complete sequence to UTF-8 - let mut utf8_buf = [0u8; 4]; - let (utf8_bytes_opt, _) = self - .unicode_escape_collector - .process_to_utf8(&mut utf8_buf)?; - - // Write UTF-8 bytes to escape buffer if we have them - if let Some(utf8_bytes) = utf8_bytes_opt { - for &utf8_byte in utf8_bytes { - self.stream_buffer - .append_unescaped_byte(utf8_byte) - .map_err(ParseError::from)?; - } - } - - // Clear the Unicode escape state - we'll get End(UnicodeEscape) event next - self.in_unicode_escape = false; - } - - return Ok(()); - } - - // Normal literal byte processing (not inside Unicode escape) - // Skip writing bytes to escape buffer when we have a pending high surrogate - // (prevents literal \uD801 text from being included in final string) - if self.stream_buffer.has_unescaped_content() + // CRITICAL FIX: Follow old implementation pattern - do NOT write to escape buffer + // when inside ANY escape sequence (in_escape_sequence == true) + // This prevents hex digits from being accumulated as literal text + if !self.in_escape_sequence + && self.stream_buffer.has_unescaped_content() && !self.unicode_escape_collector.has_pending_high_surrogate() { self.stream_buffer @@ -209,6 +186,7 @@ impl<'b> ContentBuilder for StreamContentBuilder<'b> { fn begin_escape_sequence(&mut self) -> Result<(), ParseError> { log::debug!("[NEW] begin_escape_sequence() called"); + self.in_escape_sequence = true; self.start_escape_processing() } @@ -240,8 +218,7 @@ impl<'b> ContentBuilder for StreamContentBuilder<'b> { // Use shared number parsing with StreamParser-specific document end detection // StreamParser uses state-based detection: finished flag indicates true document end let at_document_end = finished; - log::debug!("[NEW] extract_number: start_pos={}, from_container_end={}, finished={}, at_document_end={}", - start_pos, from_container_end, finished, at_document_end); + log::debug!("[NEW] extract_number: start_pos={start_pos}, from_container_end={from_container_end}, finished={finished}, at_document_end={at_document_end}"); crate::number_parser::parse_number_with_delimiter_logic( &self.stream_buffer, start_pos, @@ -259,7 +236,7 @@ impl<'b> ContentBuilder for StreamContentBuilder<'b> { } } -impl<'b> ContentExtractor for StreamContentBuilder<'b> { +impl ContentExtractor for StreamContentBuilder<'_> { fn parser_state_mut(&mut self) -> &mut State { &mut self.parser_state } @@ -303,12 +280,11 @@ impl<'b> ContentExtractor for StreamContentBuilder<'b> { start_pos: usize, from_container_end: bool, ) -> Result, ParseError> { - // LEGACY METHOD: This should be updated to use the new ContentBuilder::extract_number - // For now, use conservative approach - assume not finished for ContentExtractor calls - let finished = false; // Conservative: assume not at document end for legacy calls - let at_document_end = finished; - log::debug!("[NEW] extract_number_content LEGACY: start_pos={}, from_container_end={}, assuming finished={}, at_document_end={}", - start_pos, from_container_end, finished, at_document_end); + // Use shared number parsing with StreamParser-specific document end detection + // StreamParser uses state-based detection: finished flag indicates true document end + let at_document_end = self.finished; + log::debug!("[NEW] extract_number_content: start_pos={}, from_container_end={}, finished={}, at_document_end={}", + start_pos, from_container_end, self.finished, at_document_end); crate::number_parser::parse_number_with_delimiter_logic( &self.stream_buffer, start_pos, @@ -318,12 +294,36 @@ impl<'b> ContentExtractor for StreamContentBuilder<'b> { } } -impl<'b> EscapeHandler for StreamContentBuilder<'b> { +impl crate::shared::ByteProvider for StreamContentBuilder<'_> { + fn next_byte(&mut self) -> Result, crate::ParseError> { + // If buffer is empty, cannot provide bytes (let StreamParser handle filling) + if self.stream_buffer.is_empty() { + return Ok(None); + } + + // Get byte and advance + let byte = self.stream_buffer.current_byte()?; + self.stream_buffer.advance()?; + Ok(Some(byte)) + } +} + +impl EscapeHandler for StreamContentBuilder<'_> { fn parser_state(&self) -> &State { &self.parser_state } + fn begin_unicode_escape(&mut self) -> Result<(), ParseError> { + // Called when Begin(UnicodeEscape) is received + self.in_unicode_escape = true; + self.in_escape_sequence = true; + Ok(()) + } + fn process_unicode_escape_with_collector(&mut self) -> Result<(), ParseError> { + // Reset the escape flags + self.in_unicode_escape = false; + self.in_escape_sequence = false; // Shared Unicode escape processing pattern - collect UTF-8 bytes first to avoid borrow conflicts let utf8_bytes_result = { let current_pos = self.stream_buffer.current_position(); @@ -341,7 +341,6 @@ impl<'b> EscapeHandler for StreamContentBuilder<'b> { hex_slice_provider, &mut utf8_buf, )?; - // Copy UTF-8 bytes to avoid borrow conflicts utf8_bytes_opt.map(|bytes| { let mut copy = [0u8; 4]; @@ -370,6 +369,8 @@ impl<'b> EscapeHandler for StreamContentBuilder<'b> { } fn handle_simple_escape_char(&mut self, escape_char: u8) -> Result<(), ParseError> { + // Clear the escape sequence flag when simple escape completes + self.in_escape_sequence = false; self.stream_buffer .append_unescaped_byte(escape_char) .map_err(ParseError::from) @@ -380,9 +381,11 @@ impl<'b> EscapeHandler for StreamContentBuilder<'b> { let in_string_mode = matches!(self.parser_state, State::String(_) | State::Key(_)); if in_string_mode { - // Skip writing bytes to escape buffer when we have a pending high surrogate - // (prevents literal \uD801 text from being included in final string) - if self.stream_buffer.has_unescaped_content() + // CRITICAL FIX: Follow old implementation pattern - do NOT write to escape buffer + // when inside ANY escape sequence (in_escape_sequence == true) + // This prevents hex digits from being accumulated as literal text + if !self.in_escape_sequence + && self.stream_buffer.has_unescaped_content() && !self.unicode_escape_collector.has_pending_high_surrogate() { self.stream_buffer @@ -396,6 +399,7 @@ impl<'b> EscapeHandler for StreamContentBuilder<'b> { fn begin_escape_sequence(&mut self) -> Result<(), ParseError> { log::debug!("[NEW] EscapeHandler::begin_escape_sequence() called"); + self.in_escape_sequence = true; // Delegate to ContentBuilder implementation ContentBuilder::begin_escape_sequence(self) } diff --git a/picojson/src/stream_parser.rs b/picojson/src/stream_parser.rs index 7a52410..5fd910e 100644 --- a/picojson/src/stream_parser.rs +++ b/picojson/src/stream_parser.rs @@ -84,8 +84,11 @@ impl<'b, R: Reader, C: BitStackConfig> StreamParser<'b, R, C> { impl StreamParser<'_, R, C> { /// Get the next JSON event from the stream fn next_event_impl(&mut self) -> Result, ParseError> { - // We need to implement the unified loop locally to avoid borrowing conflicts - // This is essentially a copy of ParserCore::next_event_impl but accessing fields directly + // StreamParser has special requirements that prevent using ParserCore directly: + // 1. Byte accumulation logic when no events are generated + // 2. Buffer filling and compaction logic + // 3. Complex state management across buffer boundaries + // For now, use a custom loop that handles these StreamParser-specific needs loop { while !crate::event_processor::have_events(&self.parser_core.parser_state.evts) { if let Some(byte) = self.next_byte()? { @@ -105,7 +108,7 @@ impl StreamParser<'_, R, C> { let has_events = crate::event_processor::have_events(&self.parser_core.parser_state.evts); - log::trace!("[NEW] After tokenizer: has_events={}", has_events); + log::trace!("[NEW] After tokenizer: has_events={has_events}"); // Handle byte accumulation if no event was generated (StreamParser-specific) if !has_events { @@ -116,6 +119,7 @@ impl StreamParser<'_, R, C> { // Handle end of data with tokenizer finish if !self.finished { self.finished = true; + self.content_builder.set_finished(true); crate::event_processor::finish_tokenizer( &mut self.parser_core.tokenizer, &mut self.parser_core.parser_state.evts, @@ -375,7 +379,7 @@ impl StreamParser<'_, R, C> { } } -impl<'b, R: Reader, C: BitStackConfig> PullParser for StreamParser<'b, R, C> { +impl PullParser for StreamParser<'_, R, C> { fn next_event(&mut self) -> Result, ParseError> { self.content_builder.apply_unescaped_reset_if_queued(); @@ -383,7 +387,7 @@ impl<'b, R: Reader, C: BitStackConfig> PullParser for StreamParser<'b, R, C> { } } -impl<'b, R: Reader, C: BitStackConfig> crate::shared::ByteProvider for StreamParser<'b, R, C> { +impl crate::shared::ByteProvider for StreamParser<'_, R, C> { fn next_byte(&mut self) -> Result, ParseError> { // If buffer is empty, try to fill it first if self.content_builder.stream_buffer().is_empty() { From e10bec29d14fde42151694578bf107b580d26090 Mon Sep 17 00:00:00 2001 From: Kaido Kert Date: Mon, 14 Jul 2025 20:07:33 -0700 Subject: [PATCH 05/10] Bigger refactor --- picojson/build.rs | 9 +- picojson/src/chunk_reader.rs | 2 +- picojson/src/content_builder.rs | 1 + picojson/src/event_processor.rs | 16 +++- picojson/src/json_number.rs | 4 +- picojson/src/number_parser.rs | 4 +- picojson/src/parse_error.rs | 6 +- picojson/src/parser_core.rs | 71 +++++++-------- picojson/src/shared.rs | 2 - picojson/src/slice_content_builder.rs | 72 ++++++++++----- picojson/src/slice_parser.rs | 117 ++---------------------- picojson/src/stream_content_builder.rs | 120 +++++++++++++------------ picojson/src/stream_parser.rs | 14 +-- 13 files changed, 187 insertions(+), 251 deletions(-) diff --git a/picojson/build.rs b/picojson/build.rs index 03506da..20bfee3 100644 --- a/picojson/build.rs +++ b/picojson/build.rs @@ -79,8 +79,9 @@ fn generate_conformance_tests() -> Result<(), Box> { let generated_code = format!( r#"// Generated by build.rs - DO NOT EDIT +#![allow(non_snake_case)] // Keep test names discoverable and matching original JSON test suite + #[cfg(feature = "remote-tests")] -#[allow(clippy::non_snake_case)] // Keep test names discoverable and matching original JSON test suite mod conformance_generated {{ use picojson::{{Event, ParseError, PullParser, SliceParser}}; @@ -129,6 +130,7 @@ mod conformance_generated {{ Ok(()) } +#[allow(dead_code)] fn sanitize_test_name( filename: &str, test_name_counts: &mut std::collections::HashMap, @@ -156,23 +158,26 @@ fn sanitize_test_name( *count += 1; if *count > 1 { - format!("{}_{}", result, count) + format!("{result}_{count}") } else { result } } // Configuration from Cargo.toml metadata +#[allow(dead_code)] fn get_jsontest_suite_url() -> String { std::env::var("CARGO_PKG_METADATA_CONFORMANCE_TESTS_JSONTEST_SUITE_URL") .unwrap_or_else(|_| "https://github.com/nst/JSONTestSuite/archive/{commit}.zip".to_string()) } +#[allow(dead_code)] fn get_jsontest_suite_commit() -> String { std::env::var("CARGO_PKG_METADATA_CONFORMANCE_TESTS_JSONTEST_SUITE_COMMIT") .unwrap_or_else(|_| "1ef36fa01286573e846ac449e8683f8833c5b26a".to_string()) } +#[allow(dead_code)] fn get_json_checker_url() -> String { std::env::var("CARGO_PKG_METADATA_CONFORMANCE_TESTS_JSON_CHECKER_URL") .unwrap_or_else(|_| "https://www.json.org/JSON_checker/test.zip".to_string()) diff --git a/picojson/src/chunk_reader.rs b/picojson/src/chunk_reader.rs index a72892b..d09200e 100644 --- a/picojson/src/chunk_reader.rs +++ b/picojson/src/chunk_reader.rs @@ -119,7 +119,7 @@ impl<'a> ChunkReader<'a> { } } -impl<'a> Reader for ChunkReader<'a> { +impl Reader for ChunkReader<'_> { type Error = (); fn read(&mut self, buf: &mut [u8]) -> Result { diff --git a/picojson/src/content_builder.rs b/picojson/src/content_builder.rs index 4be88c4..4698732 100644 --- a/picojson/src/content_builder.rs +++ b/picojson/src/content_builder.rs @@ -14,6 +14,7 @@ use crate::{Event, ParseError, String}; /// This trait combines the functionality of `ContentExtractor` and `EscapeHandler` into a /// unified interface that can be implemented by different parser backends while sharing /// the core event processing logic. +#[allow(dead_code)] // Methods are part of trait interface design pub trait ContentBuilder: ContentExtractor + EscapeHandler { /// Begin processing a new string or key at the given position /// diff --git a/picojson/src/event_processor.rs b/picojson/src/event_processor.rs index 85a2a87..fe1a23b 100644 --- a/picojson/src/event_processor.rs +++ b/picojson/src/event_processor.rs @@ -9,6 +9,7 @@ use crate::ujson::EventToken; use crate::{Event, ParseError}; /// Escape handling trait for abstracting escape sequence processing between parsers +#[allow(dead_code)] // Methods are part of trait interface design pub trait EscapeHandler { /// Get the current parser state for escape context checking fn parser_state(&self) -> &crate::shared::State; @@ -30,6 +31,12 @@ pub trait EscapeHandler { fn append_literal_byte(&mut self, _byte: u8) -> Result<(), crate::ParseError> { Ok(()) } + + /// Begin unicode escape sequence processing + /// Default implementation is no-op - suitable for parsers that don't need special handling + fn begin_unicode_escape(&mut self) -> Result<(), crate::ParseError> { + Ok(()) + } } /// Result of processing a tokenizer event @@ -65,7 +72,7 @@ pub fn process_begin_events( } crate::ujson::Event::Begin(EventToken::String) => { let pos = content_extractor.current_position(); - log::debug!("[NEW] Begin(String): storing pos={} in State::String", pos); + log::debug!("[NEW] Begin(String): storing pos={pos} in State::String"); *content_extractor.parser_state_mut() = State::String(pos); content_extractor.begin_string_content(pos); Some(EventResult::Continue) @@ -190,9 +197,9 @@ pub trait ContentExtractor: EscapeHandler { /// /// This callback stores tokenizer events in the parser's event array, filling the first /// available slot. This pattern is identical across both SliceParser and StreamParser. -pub fn create_tokenizer_callback<'a>( - event_storage: &'a mut [Option; 2], -) -> impl FnMut(crate::ujson::Event, usize) + 'a { +pub fn create_tokenizer_callback( + event_storage: &mut [Option; 2], +) -> impl FnMut(crate::ujson::Event, usize) + '_ { |event, _len| { for evt in event_storage.iter_mut() { if evt.is_none() { @@ -272,6 +279,7 @@ pub fn process_unicode_escape_events( match content_extractor.parser_state() { crate::shared::State::String(_) | crate::shared::State::Key(_) => { content_extractor.unicode_escape_collector_mut().reset(); + content_extractor.begin_unicode_escape()?; } _ => {} // Ignore if not in string/key context } diff --git a/picojson/src/json_number.rs b/picojson/src/json_number.rs index dcd024d..152b2ab 100644 --- a/picojson/src/json_number.rs +++ b/picojson/src/json_number.rs @@ -143,9 +143,9 @@ impl core::fmt::Display for JsonNumber<'_, '_> { JsonNumber::Copied { raw, parsed } => (raw, parsed), }; match parsed { - NumberResult::Integer(val) => write!(f, "{}", val), + NumberResult::Integer(val) => write!(f, "{val}"), #[cfg(feature = "float")] - NumberResult::Float(val) => write!(f, "{}", val), + NumberResult::Float(val) => write!(f, "{val}"), #[cfg(all(not(feature = "float"), feature = "float-truncate"))] NumberResult::FloatTruncated(val) => write!(f, "{}", val), // For overflow, disabled, or skipped cases, show the exact raw string diff --git a/picojson/src/number_parser.rs b/picojson/src/number_parser.rs index daafa0f..83637f7 100644 --- a/picojson/src/number_parser.rs +++ b/picojson/src/number_parser.rs @@ -51,8 +51,7 @@ pub fn parse_number_with_delimiter_logic( let use_full_span = !from_container_end && at_document_end; let end_pos = crate::shared::ContentRange::number_end_position(current_pos, use_full_span); - log::debug!("[NEW] parse_number_with_delimiter_logic: start_pos={}, current_pos={}, from_container_end={}, at_document_end={}, use_full_span={}, end_pos={}", - start_pos, current_pos, from_container_end, at_document_end, use_full_span, end_pos); + log::debug!("[NEW] parse_number_with_delimiter_logic: start_pos={start_pos}, current_pos={current_pos}, from_container_end={from_container_end}, at_document_end={at_document_end}, use_full_span={use_full_span}, end_pos={end_pos}"); parse_number_event(extractor, start_pos, end_pos) } @@ -64,6 +63,7 @@ pub fn parse_number_with_delimiter_logic( /// 2. Convert to UTF-8 string /// 3. Parse using shared number parsing logic /// 4. Create JsonNumber::Borrowed event +/// /// All position logic is handled by the calling parser. pub fn parse_number_event( extractor: &T, diff --git a/picojson/src/parse_error.rs b/picojson/src/parse_error.rs index 0a004c9..15801e5 100644 --- a/picojson/src/parse_error.rs +++ b/picojson/src/parse_error.rs @@ -76,9 +76,9 @@ impl From for ParseError { impl core::fmt::Display for ParseError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self { - ParseError::TokenizerError(e) => write!(f, "{}", e), - ParseError::InvalidUtf8(e) => write!(f, "Invalid UTF-8: {}", e), - _ => write!(f, "{:?}", self), + ParseError::TokenizerError(e) => write!(f, "{e}"), + ParseError::InvalidUtf8(e) => write!(f, "Invalid UTF-8: {e}"), + _ => write!(f, "{self:?}"), } } } diff --git a/picojson/src/parser_core.rs b/picojson/src/parser_core.rs index 2149579..7b3f914 100644 --- a/picojson/src/parser_core.rs +++ b/picojson/src/parser_core.rs @@ -16,6 +16,10 @@ use crate::shared::{ByteProvider, Event, ParserState, UnexpectedState}; use crate::ujson::{EventToken, Tokenizer}; use crate::{ujson, ParseError}; +/// Combined trait for parsers that provide both byte access and content building +pub trait ParserProvider: ByteProvider + ContentBuilder {} +impl ParserProvider for T {} + /// The core parser logic that handles the unified event processing loop. /// /// This struct contains all the shared state and logic that was previously @@ -37,25 +41,30 @@ impl ParserCore { } } - /// The unified event processing loop that was previously duplicated - /// between SliceParser and StreamParser. - /// - /// This method implements the common logic while delegating to traits - /// for the parser-specific differences. - pub fn next_event_impl<'a, B, CB>( + /// Unified implementation that works with a single combined provider. + /// This avoids borrowing conflicts by using a single object that implements both traits. + pub fn next_event_impl_unified<'a, P>( &mut self, - byte_provider: &mut B, - content_builder: &'a mut CB, + provider: &'a mut P, escape_timing: EscapeTiming, ) -> Result, ParseError> where - B: ByteProvider, - CB: ContentBuilder, + P: ParserProvider, { loop { while !have_events(&self.parser_state.evts) { - if !self.pull_tokenizer_events(byte_provider)? { - return Ok(Event::EndDocument); + if let Some(byte) = provider.next_byte()? { + process_byte_through_tokenizer( + byte, + &mut self.tokenizer, + &mut self.parser_state.evts, + )?; + } else { + finish_tokenizer(&mut self.tokenizer, &mut self.parser_state.evts)?; + + if !have_events(&self.parser_state.evts) { + return Ok(Event::EndDocument); + } } } @@ -65,17 +74,15 @@ impl ParserCore { }; // Try shared event processors first - if let Some(result) = process_simple_events(&taken) - .or_else(|| process_begin_events(&taken, content_builder)) + if let Some(result) = + process_simple_events(&taken).or_else(|| process_begin_events(&taken, provider)) { match result { EventResult::Complete(event) => return Ok(event), - EventResult::ExtractString => { - return content_builder.validate_and_extract_string() - } - EventResult::ExtractKey => return content_builder.validate_and_extract_key(), + EventResult::ExtractString => return provider.validate_and_extract_string(), + EventResult::ExtractKey => return provider.validate_and_extract_key(), EventResult::ExtractNumber(from_container_end) => { - return content_builder.validate_and_extract_number(from_container_end) + return provider.validate_and_extract_number(from_container_end) } EventResult::Continue => continue, } @@ -84,9 +91,9 @@ impl ParserCore { // Handle parser-specific events based on escape timing match taken { ujson::Event::Begin(EventToken::EscapeSequence) => { - process_begin_escape_sequence_event(content_builder)?; + process_begin_escape_sequence_event(provider)?; } - _ if process_unicode_escape_events(&taken, content_builder)? => { + _ if process_unicode_escape_events(&taken, provider)? => { // Unicode escape events handled by shared function } ujson::Event::Begin( @@ -102,7 +109,7 @@ impl ParserCore { // SliceParser-specific: Handle simple escape sequences on Begin events // because CopyOnEscape requires starting unescaping immediately when // the escape token begins to maintain zero-copy optimization - process_simple_escape_event(&escape_token, content_builder)?; + process_simple_escape_event(&escape_token, provider)?; } ujson::Event::End( escape_token @ (EventToken::EscapeQuote @@ -117,7 +124,7 @@ impl ParserCore { // StreamParser-specific: Handle simple escape sequences on End events // because StreamBuffer must wait until the token ends to accumulate // all bytes before processing the complete escape sequence - process_simple_escape_event(&escape_token, content_builder)?; + process_simple_escape_event(&escape_token, provider)?; } _ => { // All other events continue to next iteration @@ -125,24 +132,6 @@ impl ParserCore { } } } - - /// Pull events from tokenizer and return whether parsing should continue. - /// This implements the common tokenizer event pulling logic. - fn pull_tokenizer_events( - &mut self, - byte_provider: &mut B, - ) -> Result { - if let Some(byte) = byte_provider.next_byte()? { - process_byte_through_tokenizer(byte, &mut self.tokenizer, &mut self.parser_state.evts)?; - } else { - finish_tokenizer(&mut self.tokenizer, &mut self.parser_state.evts)?; - - if !have_events(&self.parser_state.evts) { - return Ok(false); // Signal end of parsing - } - } - Ok(true) // Continue parsing - } } /// Enum to specify when escape sequences should be processed diff --git a/picojson/src/shared.rs b/picojson/src/shared.rs index 58befe2..74de2d1 100644 --- a/picojson/src/shared.rs +++ b/picojson/src/shared.rs @@ -55,14 +55,12 @@ pub enum State { /// Parser state and event storage pub(super) struct ParserState { - pub state: State, pub evts: [Option; 2], } impl ParserState { pub fn new() -> Self { Self { - state: State::None, evts: core::array::from_fn(|_| None), } } diff --git a/picojson/src/slice_content_builder.rs b/picojson/src/slice_content_builder.rs index da60145..ac3033a 100644 --- a/picojson/src/slice_content_builder.rs +++ b/picojson/src/slice_content_builder.rs @@ -21,6 +21,8 @@ pub struct SliceContentBuilder<'a, 'b> { parser_state: State, /// Unicode escape collector for \uXXXX sequences unicode_escape_collector: UnicodeEscapeCollector, + /// Flag to track when we're inside ANY escape sequence (like stream implementation) + in_escape_sequence: bool, } impl<'a, 'b> SliceContentBuilder<'a, 'b> { @@ -31,6 +33,7 @@ impl<'a, 'b> SliceContentBuilder<'a, 'b> { copy_on_escape: CopyOnEscape::new(input, scratch_buffer), parser_state: State::None, unicode_escape_collector: UnicodeEscapeCollector::new(), + in_escape_sequence: false, } } @@ -45,13 +48,15 @@ impl<'a, 'b> SliceContentBuilder<'a, 'b> { } } -impl<'a, 'b> ContentBuilder for SliceContentBuilder<'a, 'b> { +impl ContentBuilder for SliceContentBuilder<'_, '_> { fn begin_content(&mut self, pos: usize, _is_key: bool) { // SliceParser uses CopyOnEscape for content management self.copy_on_escape.begin_string(pos); } fn handle_simple_escape(&mut self, escape_char: u8) -> Result<(), ParseError> { + // Clear the escape sequence flag when simple escape completes + self.in_escape_sequence = false; self.copy_on_escape .handle_escape(self.buffer.current_pos(), escape_char)?; Ok(()) @@ -62,6 +67,8 @@ impl<'a, 'b> ContentBuilder for SliceContentBuilder<'a, 'b> { // The position calculation matches the existing SliceParser logic let current_pos = self.buffer.current_pos(); let (_, _, escape_start_pos) = ContentRange::unicode_escape_bounds(current_pos); + // Fix: escape_start_pos should point to the backslash, not the 'u' + let actual_escape_start_pos = escape_start_pos.saturating_sub(1); // Check if this is completing a surrogate pair let had_pending_high_surrogate = self.unicode_escape_collector.has_pending_high_surrogate(); @@ -70,14 +77,14 @@ impl<'a, 'b> ContentBuilder for SliceContentBuilder<'a, 'b> { // This is completing a surrogate pair - need to consume both escapes // First call: consume the high surrogate (6 bytes earlier) self.copy_on_escape - .handle_unicode_escape(escape_start_pos, &[])?; + .handle_unicode_escape(actual_escape_start_pos, &[])?; // Second call: consume the low surrogate and write UTF-8 self.copy_on_escape - .handle_unicode_escape(escape_start_pos + 6, utf8_bytes)?; + .handle_unicode_escape(actual_escape_start_pos + 6, utf8_bytes)?; } else { // Single Unicode escape - normal processing self.copy_on_escape - .handle_unicode_escape(escape_start_pos, utf8_bytes)?; + .handle_unicode_escape(actual_escape_start_pos, utf8_bytes)?; } Ok(()) @@ -90,7 +97,8 @@ impl<'a, 'b> ContentBuilder for SliceContentBuilder<'a, 'b> { } fn begin_escape_sequence(&mut self) -> Result<(), ParseError> { - // SliceParser doesn't need special escape sequence setup + // Set escape flag to prevent literal byte accumulation during escape processing + self.in_escape_sequence = true; Ok(()) } @@ -119,12 +127,11 @@ impl<'a, 'b> ContentBuilder for SliceContentBuilder<'a, 'b> { let at_document_end = self.buffer.is_empty(); log::debug!("[NEW] SliceContentBuilder extract_number: start_pos={}, from_container_end={}, finished={} (ignored), buffer_empty={}, at_document_end={}", start_pos, from_container_end, finished, self.buffer.is_empty(), at_document_end); - crate::number_parser::parse_number_with_delimiter_logic( - &self.buffer, - start_pos, - from_container_end, - at_document_end, - ) + // SliceParser-specific fix: at document end, current_pos points past the input, + // so we need to adjust the logic to always exclude the delimiter position + let current_pos = self.buffer.current_position(); + let end_pos = current_pos.saturating_sub(1); + crate::number_parser::parse_number_event(&self.buffer, start_pos, end_pos) } fn current_position(&self) -> usize { @@ -136,7 +143,7 @@ impl<'a, 'b> ContentBuilder for SliceContentBuilder<'a, 'b> { } } -impl<'a, 'b> ContentExtractor for SliceContentBuilder<'a, 'b> { +impl ContentExtractor for SliceContentBuilder<'_, '_> { fn parser_state_mut(&mut self) -> &mut crate::shared::State { &mut self.parser_state } @@ -176,22 +183,38 @@ impl<'a, 'b> ContentExtractor for SliceContentBuilder<'a, 'b> { start_pos: usize, from_container_end: bool, ) -> Result, ParseError> { + // Use shared number parsing with SliceParser-specific document end detection + // SliceParser uses buffer-based detection: buffer empty indicates document end let at_document_end = self.buffer.is_empty(); - crate::number_parser::parse_number_with_delimiter_logic( - &self.buffer, - start_pos, - from_container_end, - at_document_end, - ) + log::debug!("[NEW] SliceContentBuilder extract_number_content: start_pos={}, from_container_end={}, buffer_empty={}, at_document_end={}", + start_pos, from_container_end, self.buffer.is_empty(), at_document_end); + // SliceParser-specific fix: at document end, current_pos points past the input, + // so we need to adjust the logic to always exclude the delimiter position + let current_pos = self.buffer.current_position(); + let end_pos = current_pos.saturating_sub(1); + crate::number_parser::parse_number_event(&self.buffer, start_pos, end_pos) + } +} + +impl crate::shared::ByteProvider for SliceContentBuilder<'_, '_> { + fn next_byte(&mut self) -> Result, crate::ParseError> { + use crate::slice_input_buffer::InputBuffer; + match self.buffer_mut().consume_byte() { + Ok(byte) => Ok(Some(byte)), + Err(crate::slice_input_buffer::Error::ReachedEnd) => Ok(None), + Err(err) => Err(err.into()), + } } } -impl<'a, 'b> EscapeHandler for SliceContentBuilder<'a, 'b> { +impl EscapeHandler for SliceContentBuilder<'_, '_> { fn parser_state(&self) -> &crate::shared::State { &self.parser_state } fn process_unicode_escape_with_collector(&mut self) -> Result<(), ParseError> { + // Clear the escape sequence flag when unicode escape completes + self.in_escape_sequence = false; let current_pos = self.buffer.current_pos(); let hex_slice_provider = |start, end| self.buffer.slice(start, end).map_err(Into::into); @@ -206,6 +229,9 @@ impl<'a, 'b> EscapeHandler for SliceContentBuilder<'a, 'b> { hex_slice_provider, &mut utf8_buf, )?; + // Fix: escape_start_pos should point to the backslash, not the 'u' + // But don't subtract if it's already pointing to the backslash + let actual_escape_start_pos = escape_start_pos; // Handle UTF-8 bytes if we have them (not a high surrogate waiting for low surrogate) if let Some(utf8_bytes) = utf8_bytes_opt { @@ -213,14 +239,14 @@ impl<'a, 'b> EscapeHandler for SliceContentBuilder<'a, 'b> { // This is completing a surrogate pair - need to consume both escapes // First call: consume the high surrogate (6 bytes earlier) self.copy_on_escape - .handle_unicode_escape(escape_start_pos, &[])?; + .handle_unicode_escape(actual_escape_start_pos, &[])?; // Second call: consume the low surrogate and write UTF-8 self.copy_on_escape - .handle_unicode_escape(escape_start_pos + 6, utf8_bytes)?; + .handle_unicode_escape(actual_escape_start_pos + 6, utf8_bytes)?; } else { // Single Unicode escape - normal processing self.copy_on_escape - .handle_unicode_escape(escape_start_pos, utf8_bytes)?; + .handle_unicode_escape(actual_escape_start_pos, utf8_bytes)?; } } @@ -228,6 +254,8 @@ impl<'a, 'b> EscapeHandler for SliceContentBuilder<'a, 'b> { } fn handle_simple_escape_char(&mut self, escape_char: u8) -> Result<(), ParseError> { + // Clear the escape sequence flag when simple escape completes + self.in_escape_sequence = false; self.copy_on_escape .handle_escape(self.buffer.current_pos(), escape_char)?; Ok(()) diff --git a/picojson/src/slice_parser.rs b/picojson/src/slice_parser.rs index 7bc29e3..89bcba1 100644 --- a/picojson/src/slice_parser.rs +++ b/picojson/src/slice_parser.rs @@ -1,9 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 -use crate::event_processor::{ContentExtractor, EscapeHandler}; use crate::parse_error::ParseError; use crate::parser_core::ParserCore; -use crate::shared::{ByteProvider, Event, PullParser}; +use crate::shared::{Event, PullParser}; use crate::slice_content_builder::SliceContentBuilder; use crate::slice_input_buffer::InputBuffer; use crate::ujson; @@ -146,115 +145,15 @@ impl<'a, 'b, C: BitStackConfig> SliceParser<'a, 'b, C> { /// Returns the next JSON event or an error if parsing fails. /// Parsing continues until `EndDocument` is returned or an error occurs. fn next_event_impl(&mut self) -> Result, ParseError> { - // We need to implement the unified loop locally to avoid borrowing conflicts - // This is essentially a copy of ParserCore::next_event_impl but accessing fields directly - loop { - while !crate::event_processor::have_events(&self.parser_core.parser_state.evts) { - if let Some(byte) = self.next_byte()? { - crate::event_processor::process_byte_through_tokenizer( - byte, - &mut self.parser_core.tokenizer, - &mut self.parser_core.parser_state.evts, - )?; - } else { - crate::event_processor::finish_tokenizer( - &mut self.parser_core.tokenizer, - &mut self.parser_core.parser_state.evts, - )?; - - if !crate::event_processor::have_events(&self.parser_core.parser_state.evts) { - return Ok(Event::EndDocument); - } - } - } - - let taken_event = - crate::event_processor::take_first_event(&mut self.parser_core.parser_state.evts); - let Some(taken) = taken_event else { - return Err(crate::shared::UnexpectedState::StateMismatch.into()); - }; - - // Try shared event processors first - if let Some(result) = - crate::event_processor::process_simple_events(&taken).or_else(|| { - crate::event_processor::process_begin_events(&taken, &mut self.content_builder) - }) - { - match result { - crate::event_processor::EventResult::Complete(event) => return Ok(event), - crate::event_processor::EventResult::ExtractString => { - return self.content_builder.validate_and_extract_string() - } - crate::event_processor::EventResult::ExtractKey => { - return self.content_builder.validate_and_extract_key() - } - crate::event_processor::EventResult::ExtractNumber(from_container_end) => { - return self.extract_number_with_finished_state(from_container_end) - } - crate::event_processor::EventResult::Continue => continue, - } - } - - // Handle parser-specific events (SliceParser uses OnBegin timing) - match taken { - ujson::Event::Begin(crate::ujson::EventToken::EscapeSequence) => { - crate::event_processor::process_begin_escape_sequence_event( - &mut self.content_builder, - )?; - } - _ if crate::event_processor::process_unicode_escape_events( - &taken, - &mut self.content_builder, - )? => - { - // Unicode escape events handled by shared function - } - ujson::Event::Begin( - escape_token @ (crate::ujson::EventToken::EscapeQuote - | crate::ujson::EventToken::EscapeBackslash - | crate::ujson::EventToken::EscapeSlash - | crate::ujson::EventToken::EscapeBackspace - | crate::ujson::EventToken::EscapeFormFeed - | crate::ujson::EventToken::EscapeNewline - | crate::ujson::EventToken::EscapeCarriageReturn - | crate::ujson::EventToken::EscapeTab), - ) => { - // SliceParser-specific: Handle simple escape sequences on Begin events - // because CopyOnEscape requires starting unescaping immediately when - // the escape token begins to maintain zero-copy optimization - crate::event_processor::process_simple_escape_event( - &escape_token, - &mut self.content_builder, - )?; - } - _ => { - // All other events continue to next iteration - } - } - } - } - - /// Extract number with proper SliceParser document end detection - fn extract_number_with_finished_state( - &mut self, - from_container_end: bool, - ) -> Result, ParseError> { - let start_pos = match *self.content_builder.parser_state() { - crate::shared::State::Number(pos) => pos, - _ => return Err(crate::shared::UnexpectedState::StateMismatch.into()), - }; - - *self.content_builder.parser_state_mut() = crate::shared::State::None; - - // SliceParser doesn't use the finished parameter - it uses buffer emptiness - // Pass finished=false since SliceParser handles document end differently - use crate::content_builder::ContentBuilder; - self.content_builder - .extract_number(start_pos, from_container_end, false) + // Use the unified ParserCore implementation with SliceParser-specific timing + self.parser_core.next_event_impl_unified( + &mut self.content_builder, + crate::parser_core::EscapeTiming::OnBegin, + ) } } -impl<'a, 'b, C: BitStackConfig> PullParser for SliceParser<'a, 'b, C> { +impl PullParser for SliceParser<'_, '_, C> { fn next_event(&mut self) -> Result, ParseError> { if self.content_builder.buffer().is_past_end() { return Ok(Event::EndDocument); @@ -263,7 +162,7 @@ impl<'a, 'b, C: BitStackConfig> PullParser for SliceParser<'a, 'b, C> { } } -impl<'a, 'b, C: BitStackConfig> crate::shared::ByteProvider for SliceParser<'a, 'b, C> { +impl crate::shared::ByteProvider for SliceParser<'_, '_, C> { fn next_byte(&mut self) -> Result, ParseError> { use crate::slice_input_buffer::InputBuffer; match self.content_builder.buffer_mut().consume_byte() { diff --git a/picojson/src/stream_content_builder.rs b/picojson/src/stream_content_builder.rs index 3b4ef7e..0f891ad 100644 --- a/picojson/src/stream_content_builder.rs +++ b/picojson/src/stream_content_builder.rs @@ -21,6 +21,10 @@ pub struct StreamContentBuilder<'b> { unescaped_reset_queued: bool, /// Flag to track when we're inside a Unicode escape sequence (collecting hex digits) in_unicode_escape: bool, + /// Flag to track when we're inside ANY escape sequence (like old implementation) + in_escape_sequence: bool, + /// Flag to track when the input stream has been finished (for number parsing) + finished: bool, } impl<'b> StreamContentBuilder<'b> { @@ -32,6 +36,8 @@ impl<'b> StreamContentBuilder<'b> { unicode_escape_collector: UnicodeEscapeCollector::new(), unescaped_reset_queued: false, in_unicode_escape: false, + in_escape_sequence: false, + finished: false, } } @@ -45,6 +51,11 @@ impl<'b> StreamContentBuilder<'b> { &mut self.stream_buffer } + /// Set the finished state (called by StreamParser when input is exhausted) + pub fn set_finished(&mut self, finished: bool) { + self.finished = finished; + } + /// Apply queued unescaped content reset if flag is set pub fn apply_unescaped_reset_if_queued(&mut self) { if self.unescaped_reset_queued { @@ -90,9 +101,7 @@ impl<'b> StreamContentBuilder<'b> { let current_pos = self.stream_buffer.current_position(); log::debug!( - "[NEW] start_escape_processing: start_pos={}, current_pos={}", - start_pos, - current_pos + "[NEW] start_escape_processing: start_pos={start_pos}, current_pos={current_pos}" ); // start_pos already points to content start position (not quote position) @@ -129,13 +138,16 @@ impl<'b> StreamContentBuilder<'b> { } } -impl<'b> ContentBuilder for StreamContentBuilder<'b> { +impl ContentBuilder for StreamContentBuilder<'_> { fn begin_content(&mut self, _pos: usize, _is_key: bool) { // StreamParser doesn't need explicit content begin processing // as it handles content accumulation automatically } fn handle_simple_escape(&mut self, escape_char: u8) -> Result<(), ParseError> { + // Clear the escape sequence flag when simple escape completes + self.in_escape_sequence = false; + self.stream_buffer .append_unescaped_byte(escape_char) .map_err(ParseError::from) @@ -156,46 +168,11 @@ impl<'b> ContentBuilder for StreamContentBuilder<'b> { let in_string_mode = matches!(self.parser_state, State::String(_) | State::Key(_)); if in_string_mode { - // CRITICAL FIX: If we're inside a Unicode escape sequence, route hex digits to the collector - if self.in_unicode_escape { - log::debug!( - "[NEW] Unicode escape hex digit: {:02x} ('{}')", - byte, - byte as char - ); - - // Try to add the hex digit to the collector - let is_complete = self.unicode_escape_collector.add_hex_digit(byte)?; - - if is_complete { - log::debug!("[NEW] Unicode escape sequence complete, processing to UTF-8"); - - // Process the complete sequence to UTF-8 - let mut utf8_buf = [0u8; 4]; - let (utf8_bytes_opt, _) = self - .unicode_escape_collector - .process_to_utf8(&mut utf8_buf)?; - - // Write UTF-8 bytes to escape buffer if we have them - if let Some(utf8_bytes) = utf8_bytes_opt { - for &utf8_byte in utf8_bytes { - self.stream_buffer - .append_unescaped_byte(utf8_byte) - .map_err(ParseError::from)?; - } - } - - // Clear the Unicode escape state - we'll get End(UnicodeEscape) event next - self.in_unicode_escape = false; - } - - return Ok(()); - } - - // Normal literal byte processing (not inside Unicode escape) - // Skip writing bytes to escape buffer when we have a pending high surrogate - // (prevents literal \uD801 text from being included in final string) - if self.stream_buffer.has_unescaped_content() + // CRITICAL FIX: Follow old implementation pattern - do NOT write to escape buffer + // when inside ANY escape sequence (in_escape_sequence == true) + // This prevents hex digits from being accumulated as literal text + if !self.in_escape_sequence + && self.stream_buffer.has_unescaped_content() && !self.unicode_escape_collector.has_pending_high_surrogate() { self.stream_buffer @@ -209,6 +186,7 @@ impl<'b> ContentBuilder for StreamContentBuilder<'b> { fn begin_escape_sequence(&mut self) -> Result<(), ParseError> { log::debug!("[NEW] begin_escape_sequence() called"); + self.in_escape_sequence = true; self.start_escape_processing() } @@ -240,8 +218,7 @@ impl<'b> ContentBuilder for StreamContentBuilder<'b> { // Use shared number parsing with StreamParser-specific document end detection // StreamParser uses state-based detection: finished flag indicates true document end let at_document_end = finished; - log::debug!("[NEW] extract_number: start_pos={}, from_container_end={}, finished={}, at_document_end={}", - start_pos, from_container_end, finished, at_document_end); + log::debug!("[NEW] extract_number: start_pos={start_pos}, from_container_end={from_container_end}, finished={finished}, at_document_end={at_document_end}"); crate::number_parser::parse_number_with_delimiter_logic( &self.stream_buffer, start_pos, @@ -259,7 +236,7 @@ impl<'b> ContentBuilder for StreamContentBuilder<'b> { } } -impl<'b> ContentExtractor for StreamContentBuilder<'b> { +impl ContentExtractor for StreamContentBuilder<'_> { fn parser_state_mut(&mut self) -> &mut State { &mut self.parser_state } @@ -303,12 +280,11 @@ impl<'b> ContentExtractor for StreamContentBuilder<'b> { start_pos: usize, from_container_end: bool, ) -> Result, ParseError> { - // LEGACY METHOD: This should be updated to use the new ContentBuilder::extract_number - // For now, use conservative approach - assume not finished for ContentExtractor calls - let finished = false; // Conservative: assume not at document end for legacy calls - let at_document_end = finished; - log::debug!("[NEW] extract_number_content LEGACY: start_pos={}, from_container_end={}, assuming finished={}, at_document_end={}", - start_pos, from_container_end, finished, at_document_end); + // Use shared number parsing with StreamParser-specific document end detection + // StreamParser uses state-based detection: finished flag indicates true document end + let at_document_end = self.finished; + log::debug!("[NEW] extract_number_content: start_pos={}, from_container_end={}, finished={}, at_document_end={}", + start_pos, from_container_end, self.finished, at_document_end); crate::number_parser::parse_number_with_delimiter_logic( &self.stream_buffer, start_pos, @@ -318,12 +294,36 @@ impl<'b> ContentExtractor for StreamContentBuilder<'b> { } } -impl<'b> EscapeHandler for StreamContentBuilder<'b> { +impl crate::shared::ByteProvider for StreamContentBuilder<'_> { + fn next_byte(&mut self) -> Result, crate::ParseError> { + // If buffer is empty, cannot provide bytes (let StreamParser handle filling) + if self.stream_buffer.is_empty() { + return Ok(None); + } + + // Get byte and advance + let byte = self.stream_buffer.current_byte()?; + self.stream_buffer.advance()?; + Ok(Some(byte)) + } +} + +impl EscapeHandler for StreamContentBuilder<'_> { fn parser_state(&self) -> &State { &self.parser_state } + fn begin_unicode_escape(&mut self) -> Result<(), ParseError> { + // Called when Begin(UnicodeEscape) is received + self.in_unicode_escape = true; + self.in_escape_sequence = true; + Ok(()) + } + fn process_unicode_escape_with_collector(&mut self) -> Result<(), ParseError> { + // Reset the escape flags + self.in_unicode_escape = false; + self.in_escape_sequence = false; // Shared Unicode escape processing pattern - collect UTF-8 bytes first to avoid borrow conflicts let utf8_bytes_result = { let current_pos = self.stream_buffer.current_position(); @@ -341,7 +341,6 @@ impl<'b> EscapeHandler for StreamContentBuilder<'b> { hex_slice_provider, &mut utf8_buf, )?; - // Copy UTF-8 bytes to avoid borrow conflicts utf8_bytes_opt.map(|bytes| { let mut copy = [0u8; 4]; @@ -370,6 +369,8 @@ impl<'b> EscapeHandler for StreamContentBuilder<'b> { } fn handle_simple_escape_char(&mut self, escape_char: u8) -> Result<(), ParseError> { + // Clear the escape sequence flag when simple escape completes + self.in_escape_sequence = false; self.stream_buffer .append_unescaped_byte(escape_char) .map_err(ParseError::from) @@ -380,9 +381,11 @@ impl<'b> EscapeHandler for StreamContentBuilder<'b> { let in_string_mode = matches!(self.parser_state, State::String(_) | State::Key(_)); if in_string_mode { - // Skip writing bytes to escape buffer when we have a pending high surrogate - // (prevents literal \uD801 text from being included in final string) - if self.stream_buffer.has_unescaped_content() + // CRITICAL FIX: Follow old implementation pattern - do NOT write to escape buffer + // when inside ANY escape sequence (in_escape_sequence == true) + // This prevents hex digits from being accumulated as literal text + if !self.in_escape_sequence + && self.stream_buffer.has_unescaped_content() && !self.unicode_escape_collector.has_pending_high_surrogate() { self.stream_buffer @@ -396,6 +399,7 @@ impl<'b> EscapeHandler for StreamContentBuilder<'b> { fn begin_escape_sequence(&mut self) -> Result<(), ParseError> { log::debug!("[NEW] EscapeHandler::begin_escape_sequence() called"); + self.in_escape_sequence = true; // Delegate to ContentBuilder implementation ContentBuilder::begin_escape_sequence(self) } diff --git a/picojson/src/stream_parser.rs b/picojson/src/stream_parser.rs index 7a52410..5fd910e 100644 --- a/picojson/src/stream_parser.rs +++ b/picojson/src/stream_parser.rs @@ -84,8 +84,11 @@ impl<'b, R: Reader, C: BitStackConfig> StreamParser<'b, R, C> { impl StreamParser<'_, R, C> { /// Get the next JSON event from the stream fn next_event_impl(&mut self) -> Result, ParseError> { - // We need to implement the unified loop locally to avoid borrowing conflicts - // This is essentially a copy of ParserCore::next_event_impl but accessing fields directly + // StreamParser has special requirements that prevent using ParserCore directly: + // 1. Byte accumulation logic when no events are generated + // 2. Buffer filling and compaction logic + // 3. Complex state management across buffer boundaries + // For now, use a custom loop that handles these StreamParser-specific needs loop { while !crate::event_processor::have_events(&self.parser_core.parser_state.evts) { if let Some(byte) = self.next_byte()? { @@ -105,7 +108,7 @@ impl StreamParser<'_, R, C> { let has_events = crate::event_processor::have_events(&self.parser_core.parser_state.evts); - log::trace!("[NEW] After tokenizer: has_events={}", has_events); + log::trace!("[NEW] After tokenizer: has_events={has_events}"); // Handle byte accumulation if no event was generated (StreamParser-specific) if !has_events { @@ -116,6 +119,7 @@ impl StreamParser<'_, R, C> { // Handle end of data with tokenizer finish if !self.finished { self.finished = true; + self.content_builder.set_finished(true); crate::event_processor::finish_tokenizer( &mut self.parser_core.tokenizer, &mut self.parser_core.parser_state.evts, @@ -375,7 +379,7 @@ impl StreamParser<'_, R, C> { } } -impl<'b, R: Reader, C: BitStackConfig> PullParser for StreamParser<'b, R, C> { +impl PullParser for StreamParser<'_, R, C> { fn next_event(&mut self) -> Result, ParseError> { self.content_builder.apply_unescaped_reset_if_queued(); @@ -383,7 +387,7 @@ impl<'b, R: Reader, C: BitStackConfig> PullParser for StreamParser<'b, R, C> { } } -impl<'b, R: Reader, C: BitStackConfig> crate::shared::ByteProvider for StreamParser<'b, R, C> { +impl crate::shared::ByteProvider for StreamParser<'_, R, C> { fn next_byte(&mut self) -> Result, ParseError> { // If buffer is empty, try to fill it first if self.content_builder.stream_buffer().is_empty() { From 60339b3a41b732d0c52a16b3bda17a00cf5ac70b Mon Sep 17 00:00:00 2001 From: Kaido Kert Date: Mon, 14 Jul 2025 21:29:04 -0700 Subject: [PATCH 06/10] Clean refactor near end --- picojson/Cargo.toml | 4 - picojson/src/chunk_reader.rs | 2 +- picojson/src/content_builder.rs | 78 --------- picojson/src/debug_test.rs | 25 --- picojson/src/escape_processor.rs | 19 --- picojson/src/event_processor.rs | 53 +++--- picojson/src/lib.rs | 8 - picojson/src/number_parser.rs | 2 - picojson/src/parser_core.rs | 9 +- picojson/src/simple_debug.rs | 35 ---- picojson/src/slice_content_builder.rs | 149 +++++------------ picojson/src/slice_input_buffer.rs | 5 + picojson/src/stream_buffer.rs | 9 +- picojson/src/stream_content_builder.rs | 175 ++++---------------- picojson/src/stream_parser.rs | 45 +++-- picojson/tests/debug_root_numbers.rs | 140 ---------------- picojson/tests/debug_stress_case.rs | 31 ---- picojson/tests/stream_parser_stress_test.rs | 2 +- 18 files changed, 136 insertions(+), 655 deletions(-) delete mode 100644 picojson/src/content_builder.rs delete mode 100644 picojson/src/debug_test.rs delete mode 100644 picojson/src/simple_debug.rs delete mode 100644 picojson/tests/debug_root_numbers.rs delete mode 100644 picojson/tests/debug_stress_case.rs diff --git a/picojson/Cargo.toml b/picojson/Cargo.toml index a4adbe6..fb920a3 100644 --- a/picojson/Cargo.toml +++ b/picojson/Cargo.toml @@ -51,8 +51,4 @@ ureq = { version = "2.0", optional = true } zip = { version = "0.6", optional = true } [dev-dependencies] -test-log = "0.2.18" paste = "1.0" - -[dependencies] -log = "0.4.27" diff --git a/picojson/src/chunk_reader.rs b/picojson/src/chunk_reader.rs index d09200e..638760c 100644 --- a/picojson/src/chunk_reader.rs +++ b/picojson/src/chunk_reader.rs @@ -356,7 +356,7 @@ mod integration_tests { test_delimiter_bug_helper(2, 128); } - #[test_log::test] + #[test] fn test_delimiter_bug_small_chunks_small_buffer() { // Test if larger chunks fix it: 3-byte chunks + 64-byte buffer test_delimiter_bug_helper(3, 64); diff --git a/picojson/src/content_builder.rs b/picojson/src/content_builder.rs deleted file mode 100644 index 4698732..0000000 --- a/picojson/src/content_builder.rs +++ /dev/null @@ -1,78 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -//! Content building and extraction trait for unifying parser content handling. -//! -//! This module provides the `ContentBuilder` trait that consolidates content extraction -//! and escape handling logic between SliceParser and StreamParser, eliminating duplication -//! while preserving each parser's performance characteristics. - -use crate::event_processor::{ContentExtractor, EscapeHandler}; -use crate::{Event, ParseError, String}; - -/// Trait for building and extracting content (strings, keys, numbers) with escape handling. -/// -/// This trait combines the functionality of `ContentExtractor` and `EscapeHandler` into a -/// unified interface that can be implemented by different parser backends while sharing -/// the core event processing logic. -#[allow(dead_code)] // Methods are part of trait interface design -pub trait ContentBuilder: ContentExtractor + EscapeHandler { - /// Begin processing a new string or key at the given position - /// - /// # Arguments - /// * `pos` - Position where the content starts - /// * `is_key` - True if this is a key, false if it's a string value - fn begin_content(&mut self, pos: usize, is_key: bool); - - /// Handle a simple escape character (after EscapeProcessor conversion) - /// - /// # Arguments - /// * `escape_char` - The unescaped character (e.g., b'\n' for "\\n") - fn handle_simple_escape(&mut self, escape_char: u8) -> Result<(), ParseError>; - - /// Handle a Unicode escape sequence, providing the resulting UTF-8 bytes - /// - /// # Arguments - /// * `utf8_bytes` - The UTF-8 encoded bytes for the Unicode codepoint - fn handle_unicode_escape(&mut self, utf8_bytes: &[u8]) -> Result<(), ParseError>; - - /// Append a literal (non-escape) byte during content accumulation - /// - /// # Arguments - /// * `byte` - The literal byte to append - fn append_literal_byte(&mut self, byte: u8) -> Result<(), ParseError>; - - /// Begin an escape sequence (lifecycle hook) - /// Called when escape sequence processing begins (e.g., on Begin(EscapeSequence)) - fn begin_escape_sequence(&mut self) -> Result<(), ParseError>; - - /// Extract a completed string value - /// - /// # Arguments - /// * `start_pos` - Position where the string content started - fn extract_string(&mut self, start_pos: usize) -> Result, ParseError>; - - /// Extract a completed key - /// - /// # Arguments - /// * `start_pos` - Position where the key content started - fn extract_key(&mut self, start_pos: usize) -> Result, ParseError>; - - /// Extract a completed number using shared number parsing logic - /// - /// # Arguments - /// * `start_pos` - Position where the number started - /// * `from_container_end` - True if number was terminated by container delimiter - /// * `finished` - True if the parser has finished processing input (StreamParser-specific) - fn extract_number( - &mut self, - start_pos: usize, - from_container_end: bool, - finished: bool, - ) -> Result, ParseError>; - - /// Get the current position in the input - fn current_position(&self) -> usize; - - /// Check if input is exhausted (for number delimiter logic) - fn is_exhausted(&self) -> bool; -} diff --git a/picojson/src/debug_test.rs b/picojson/src/debug_test.rs deleted file mode 100644 index d8da888..0000000 --- a/picojson/src/debug_test.rs +++ /dev/null @@ -1,25 +0,0 @@ -#[cfg(test)] -mod debug_position_test { - use crate::{ChunkReader, Event, PullParser, StreamParser}; - - #[test_log::test] - fn debug_simple_escape_position() { - // Very simple case: just track positions - let json = b"\"a\\nb\""; - println!("JSON bytes: {:?}", json); - for (i, &b) in json.iter().enumerate() { - println!(" pos {}: '{}' ({})", i, b as char, b); - } - - let reader = ChunkReader::new(json, 8); - let mut buffer = [0u8; 32]; - let mut parser = StreamParser::<_, crate::ujson::DefaultConfig>::new(reader, &mut buffer); - - if let Event::String(s) = parser.next_event().unwrap() { - println!("Result: {:?}", s.as_str()); - assert_eq!(s.as_str(), "a\nb"); - } else { - panic!("Expected String event"); - } - } -} diff --git a/picojson/src/escape_processor.rs b/picojson/src/escape_processor.rs index 3fef114..2b9e513 100644 --- a/picojson/src/escape_processor.rs +++ b/picojson/src/escape_processor.rs @@ -151,9 +151,6 @@ impl EscapeProcessor { codepoint = (codepoint << 4) | digit; } - log::debug!("NEW process_unicode_escape: hex_slice={:?}, codepoint=0x{:04X}, pending_high_surrogate={:?}", - hex_slice, codepoint, pending_high_surrogate); - // Check if we have a pending high surrogate if let Some(high) = pending_high_surrogate { // We should have a low surrogate now @@ -162,36 +159,23 @@ impl EscapeProcessor { let combined = Self::combine_surrogate_pair(high, codepoint)?; let ch = char::from_u32(combined).ok_or(ParseError::InvalidUnicodeCodepoint)?; let utf8_str = ch.encode_utf8(utf8_buffer); - log::debug!("NEW process_unicode_escape: Surrogate pair combined: high=0x{:04X}, low=0x{:04X}, combined=0x{:04X}, utf8={:?}", - high, codepoint, combined, utf8_str.as_bytes()); Ok((Some(utf8_str.as_bytes()), None)) } else { // Error: high surrogate not followed by low surrogate - log::debug!("NEW process_unicode_escape: Error - high surrogate not followed by low surrogate"); Err(ParseError::InvalidUnicodeCodepoint) } } else { // No pending high surrogate if Self::is_high_surrogate(codepoint) { // Save this high surrogate for the next escape - log::debug!( - "NEW process_unicode_escape: High surrogate saved: 0x{:04X}", - codepoint - ); Ok((None, Some(codepoint))) } else if Self::is_low_surrogate(codepoint) { // Error: low surrogate without preceding high surrogate - log::debug!("NEW process_unicode_escape: Error - low surrogate without preceding high surrogate"); Err(ParseError::InvalidUnicodeCodepoint) } else { // Regular Unicode character let ch = char::from_u32(codepoint).ok_or(ParseError::InvalidUnicodeCodepoint)?; let utf8_str = ch.encode_utf8(utf8_buffer); - log::debug!( - "NEW process_unicode_escape: Regular Unicode character: 0x{:04X} -> {:?}", - codepoint, - utf8_str.as_bytes() - ); Ok((Some(utf8_str.as_bytes()), None)) } } @@ -687,9 +671,6 @@ where // Extract the 4 hex digits from the buffer using the provider let hex_slice = hex_slice_provider(hex_start, hex_end)?; - log::debug!("NEW process_unicode_escape_sequence: current_pos={}, hex_start={}, hex_end={}, escape_start_pos={}, hex_slice={:?}", - current_pos, hex_start, hex_end, escape_start_pos, hex_slice); - if hex_slice.len() != 4 { return Err(UnexpectedState::InvalidUnicodeEscape.into()); } diff --git a/picojson/src/event_processor.rs b/picojson/src/event_processor.rs index 19950a0..84065c1 100644 --- a/picojson/src/event_processor.rs +++ b/picojson/src/event_processor.rs @@ -9,7 +9,6 @@ use crate::ujson::EventToken; use crate::{Event, ParseError}; /// Escape handling trait for abstracting escape sequence processing between parsers -#[allow(dead_code)] // Methods are part of trait interface design pub trait EscapeHandler { /// Get the current parser state for escape context checking fn parser_state(&self) -> &crate::shared::State; @@ -26,12 +25,6 @@ pub trait EscapeHandler { Ok(()) } - /// Append a single literal byte (for per-byte accumulation patterns) - /// Default implementation is no-op - suitable for parsers that don't need per-byte processing - fn append_literal_byte(&mut self, _byte: u8) -> Result<(), crate::ParseError> { - Ok(()) - } - /// Begin unicode escape sequence processing /// Default implementation is no-op - suitable for parsers that don't need special handling fn begin_unicode_escape(&mut self) -> Result<(), crate::ParseError> { @@ -72,7 +65,6 @@ pub fn process_begin_events( } crate::ujson::Event::Begin(EventToken::String) => { let pos = content_extractor.current_position(); - log::debug!("[NEW] Begin(String): storing pos={pos} in State::String"); *content_extractor.parser_state_mut() = State::String(pos); content_extractor.begin_string_content(pos); Some(EventResult::Continue) @@ -140,6 +132,19 @@ pub trait ContentExtractor: EscapeHandler { from_container_end: bool, ) -> Result, crate::ParseError>; + /// Extract a completed number using shared number parsing logic + /// + /// # Arguments + /// * `start_pos` - Position where the number started + /// * `from_container_end` - True if number was terminated by container delimiter + /// * `finished` - True if the parser has finished processing input (StreamParser-specific) + fn extract_number( + &mut self, + start_pos: usize, + from_container_end: bool, + finished: bool, + ) -> Result, crate::ParseError>; + /// Shared validation and extraction for string content fn validate_and_extract_string(&mut self) -> Result, crate::ParseError> { let start_pos = match *self.parser_state() { @@ -227,18 +232,11 @@ pub fn process_begin_escape_sequence_event( handler: &mut H, ) -> Result<(), crate::ParseError> { // Only process if we're inside a string or key - log::debug!( - "[NEW] process_begin_escape_sequence_event called, state: {:?}", - handler.parser_state() - ); match handler.parser_state() { crate::shared::State::String(_) | crate::shared::State::Key(_) => { - log::debug!("[NEW] calling begin_escape_sequence()"); handler.begin_escape_sequence()?; } - _ => { - log::debug!("[NEW] ignoring escape sequence - not in string/key context"); - } // Ignore if not in string/key context + _ => {} // Ignore if not in string/key context } Ok(()) } @@ -274,34 +272,24 @@ pub fn process_unicode_escape_events( ) -> Result { match event { crate::ujson::Event::Begin(EventToken::UnicodeEscape) => { - log::debug!("NEW event_processor: Begin(UnicodeEscape) event received"); // Start Unicode escape collection - reset collector for new sequence // Only handle if we're inside a string or key match content_extractor.parser_state() { crate::shared::State::String(_) | crate::shared::State::Key(_) => { - log::debug!("NEW event_processor: Resetting unicode escape collector"); content_extractor.unicode_escape_collector_mut().reset(); content_extractor.begin_unicode_escape()?; } - _ => { - log::debug!( - "NEW event_processor: Ignoring unicode escape outside string/key context" - ); - } // Ignore if not in string/key context + _ => {} // Ignore if not in string/key context } Ok(true) // Event was handled } crate::ujson::Event::End(EventToken::UnicodeEscape) => { - log::debug!("NEW event_processor: End(UnicodeEscape) event received"); // Handle end of Unicode escape sequence (\uXXXX) match content_extractor.parser_state() { crate::shared::State::String(_) | crate::shared::State::Key(_) => { - log::debug!("NEW event_processor: Processing unicode escape with collector"); content_extractor.process_unicode_escape_with_collector()?; } - _ => { - log::debug!("NEW event_processor: Ignoring unicode escape end outside string/key context"); - } // Ignore if not in string/key context + _ => {} // Ignore if not in string/key context } Ok(true) // Event was handled } @@ -497,6 +485,15 @@ mod tests { ) -> Result, crate::ParseError> { unimplemented!("Mock doesn't need extraction") } + + fn extract_number( + &mut self, + _start_pos: usize, + _from_container_end: bool, + _finished: bool, + ) -> Result, crate::ParseError> { + unimplemented!("Mock doesn't need extraction") + } } #[test] diff --git a/picojson/src/lib.rs b/picojson/src/lib.rs index 51e78bb..f149eac 100644 --- a/picojson/src/lib.rs +++ b/picojson/src/lib.rs @@ -63,8 +63,6 @@ mod copy_on_escape; mod escape_processor; -mod content_builder; - mod parser_core; mod stream_buffer; @@ -73,12 +71,6 @@ mod stream_content_builder; mod stream_parser; -#[cfg(test)] -mod debug_test; - -#[cfg(test)] -mod simple_debug; - mod slice_content_builder; mod slice_parser; diff --git a/picojson/src/number_parser.rs b/picojson/src/number_parser.rs index 83637f7..29e7d4f 100644 --- a/picojson/src/number_parser.rs +++ b/picojson/src/number_parser.rs @@ -51,8 +51,6 @@ pub fn parse_number_with_delimiter_logic( let use_full_span = !from_container_end && at_document_end; let end_pos = crate::shared::ContentRange::number_end_position(current_pos, use_full_span); - log::debug!("[NEW] parse_number_with_delimiter_logic: start_pos={start_pos}, current_pos={current_pos}, from_container_end={from_container_end}, at_document_end={at_document_end}, use_full_span={use_full_span}, end_pos={end_pos}"); - parse_number_event(extractor, start_pos, end_pos) } diff --git a/picojson/src/parser_core.rs b/picojson/src/parser_core.rs index 7b3f914..8bc3ffb 100644 --- a/picojson/src/parser_core.rs +++ b/picojson/src/parser_core.rs @@ -6,19 +6,18 @@ //! event processing logic between SliceParser and StreamParser, eliminating //! the duplication in their `next_event_impl` methods. -use crate::content_builder::ContentBuilder; use crate::event_processor::{ finish_tokenizer, have_events, process_begin_escape_sequence_event, process_begin_events, process_byte_through_tokenizer, process_simple_escape_event, process_simple_events, - process_unicode_escape_events, take_first_event, EventResult, + process_unicode_escape_events, take_first_event, ContentExtractor, EventResult, }; use crate::shared::{ByteProvider, Event, ParserState, UnexpectedState}; use crate::ujson::{EventToken, Tokenizer}; use crate::{ujson, ParseError}; -/// Combined trait for parsers that provide both byte access and content building -pub trait ParserProvider: ByteProvider + ContentBuilder {} -impl ParserProvider for T {} +/// Combined trait for parsers that provide both byte access and content extraction +pub trait ParserProvider: ByteProvider + ContentExtractor {} +impl ParserProvider for T {} /// The core parser logic that handles the unified event processing loop. /// diff --git a/picojson/src/simple_debug.rs b/picojson/src/simple_debug.rs deleted file mode 100644 index f9b8abd..0000000 --- a/picojson/src/simple_debug.rs +++ /dev/null @@ -1,35 +0,0 @@ -#[cfg(test)] -mod simple_debug { - use crate::chunk_reader::ChunkReader; - use crate::*; - - #[test_log::test] - fn trace_hello_escape() { - // Just trace "hello\n" - first 7 bytes - let json = b"\"hello\\n\""; - println!("=== NEW IMPLEMENTATION ==="); - println!("Input bytes: {:?}", json); - for (i, &b) in json.iter().enumerate() { - println!(" Position {}: '{}' (byte {})", i, b as char, b); - } - - let reader = ChunkReader::new(json, 16); - let mut buffer = [0u8; 64]; - let mut parser = - stream_parser::StreamParser::<_, crate::ujson::DefaultConfig>::new(reader, &mut buffer); - - match parser.next_event() { - Ok(Event::String(s)) => { - println!("SUCCESS: Got string '{}'", s.as_str()); - println!("Expected: 'hello\\n', Got: '{}'", s.as_str()); - if s.as_str() == "hello\n" { - println!("✅ Test PASSED"); - } else { - println!("❌ Test FAILED"); - } - } - Ok(other) => println!("ERROR: Expected String, got {:?}", other), - Err(e) => println!("ERROR: Parse failed: {:?}", e), - } - } -} diff --git a/picojson/src/slice_content_builder.rs b/picojson/src/slice_content_builder.rs index ac3033a..db65d5f 100644 --- a/picojson/src/slice_content_builder.rs +++ b/picojson/src/slice_content_builder.rs @@ -2,14 +2,13 @@ //! ContentBuilder implementation for SliceParser using CopyOnEscape optimization. -use crate::content_builder::ContentBuilder; use crate::copy_on_escape::CopyOnEscape; use crate::escape_processor::UnicodeEscapeCollector; use crate::event_processor::{ContentExtractor, EscapeHandler}; use crate::number_parser::NumberExtractor; use crate::shared::{ContentRange, State}; use crate::slice_input_buffer::SliceInputBuffer; -use crate::{Event, ParseError, String}; +use crate::ParseError; /// ContentBuilder implementation for SliceParser that uses CopyOnEscape for zero-copy optimization pub struct SliceContentBuilder<'a, 'b> { @@ -48,101 +47,6 @@ impl<'a, 'b> SliceContentBuilder<'a, 'b> { } } -impl ContentBuilder for SliceContentBuilder<'_, '_> { - fn begin_content(&mut self, pos: usize, _is_key: bool) { - // SliceParser uses CopyOnEscape for content management - self.copy_on_escape.begin_string(pos); - } - - fn handle_simple_escape(&mut self, escape_char: u8) -> Result<(), ParseError> { - // Clear the escape sequence flag when simple escape completes - self.in_escape_sequence = false; - self.copy_on_escape - .handle_escape(self.buffer.current_pos(), escape_char)?; - Ok(()) - } - - fn handle_unicode_escape(&mut self, utf8_bytes: &[u8]) -> Result<(), ParseError> { - // SliceParser handles Unicode escapes through CopyOnEscape - // The position calculation matches the existing SliceParser logic - let current_pos = self.buffer.current_pos(); - let (_, _, escape_start_pos) = ContentRange::unicode_escape_bounds(current_pos); - // Fix: escape_start_pos should point to the backslash, not the 'u' - let actual_escape_start_pos = escape_start_pos.saturating_sub(1); - - // Check if this is completing a surrogate pair - let had_pending_high_surrogate = self.unicode_escape_collector.has_pending_high_surrogate(); - - if had_pending_high_surrogate { - // This is completing a surrogate pair - need to consume both escapes - // First call: consume the high surrogate (6 bytes earlier) - self.copy_on_escape - .handle_unicode_escape(actual_escape_start_pos, &[])?; - // Second call: consume the low surrogate and write UTF-8 - self.copy_on_escape - .handle_unicode_escape(actual_escape_start_pos + 6, utf8_bytes)?; - } else { - // Single Unicode escape - normal processing - self.copy_on_escape - .handle_unicode_escape(actual_escape_start_pos, utf8_bytes)?; - } - - Ok(()) - } - - fn append_literal_byte(&mut self, _byte: u8) -> Result<(), ParseError> { - // SliceParser doesn't typically need per-byte processing since it works with ranges - // This could be implemented as a single-byte range if needed, but for now it's a no-op - Ok(()) - } - - fn begin_escape_sequence(&mut self) -> Result<(), ParseError> { - // Set escape flag to prevent literal byte accumulation during escape processing - self.in_escape_sequence = true; - Ok(()) - } - - fn extract_string(&mut self, _start_pos: usize) -> Result, ParseError> { - // Use CopyOnEscape to get the final string result - let end_pos = ContentRange::end_position_excluding_delimiter(self.buffer.current_pos()); - let value_result = self.copy_on_escape.end_string(end_pos)?; - Ok(value_result) - } - - fn extract_key(&mut self, _start_pos: usize) -> Result, ParseError> { - // Use CopyOnEscape to get the final key result - let end_pos = ContentRange::end_position_excluding_delimiter(self.buffer.current_pos()); - let key_result = self.copy_on_escape.end_string(end_pos)?; - Ok(key_result) - } - - fn extract_number( - &mut self, - start_pos: usize, - from_container_end: bool, - finished: bool, - ) -> Result, ParseError> { - // Use shared number parsing with SliceParser-specific document end detection - // SliceParser uses buffer-based detection: buffer empty indicates document end - let at_document_end = self.buffer.is_empty(); - log::debug!("[NEW] SliceContentBuilder extract_number: start_pos={}, from_container_end={}, finished={} (ignored), buffer_empty={}, at_document_end={}", - start_pos, from_container_end, finished, self.buffer.is_empty(), at_document_end); - // SliceParser-specific fix: at document end, current_pos points past the input, - // so we need to adjust the logic to always exclude the delimiter position - let current_pos = self.buffer.current_position(); - let end_pos = current_pos.saturating_sub(1); - crate::number_parser::parse_number_event(&self.buffer, start_pos, end_pos) - } - - fn current_position(&self) -> usize { - self.buffer.current_pos() - } - - fn is_exhausted(&self) -> bool { - self.buffer.is_empty() - } -} - impl ContentExtractor for SliceContentBuilder<'_, '_> { fn parser_state_mut(&mut self) -> &mut crate::shared::State { &mut self.parser_state @@ -183,15 +87,44 @@ impl ContentExtractor for SliceContentBuilder<'_, '_> { start_pos: usize, from_container_end: bool, ) -> Result, ParseError> { - // Use shared number parsing with SliceParser-specific document end detection - // SliceParser uses buffer-based detection: buffer empty indicates document end + // For standalone numbers at document end, clamp end position to buffer bounds let at_document_end = self.buffer.is_empty(); - log::debug!("[NEW] SliceContentBuilder extract_number_content: start_pos={}, from_container_end={}, buffer_empty={}, at_document_end={}", - start_pos, from_container_end, self.buffer.is_empty(), at_document_end); - // SliceParser-specific fix: at document end, current_pos points past the input, - // so we need to adjust the logic to always exclude the delimiter position - let current_pos = self.buffer.current_position(); - let end_pos = current_pos.saturating_sub(1); + let current_pos = self.buffer.current_pos(); + let use_full_span = !from_container_end && at_document_end; + + let end_pos = if use_full_span { + // Standalone number: clamp to buffer length to prevent slice bounds errors + core::cmp::min(current_pos, self.buffer.data_len()) + } else { + // Container number: exclude delimiter + current_pos.saturating_sub(1) + }; + + crate::number_parser::parse_number_event(&self.buffer, start_pos, end_pos) + } + + fn extract_number( + &mut self, + start_pos: usize, + from_container_end: bool, + finished: bool, + ) -> Result, ParseError> { + // For SliceParser, use buffer-based document end detection + // The finished parameter should always be true for complete slices, but we don't rely on it + let at_document_end = self.buffer.is_empty(); + let current_pos = self.buffer.current_pos(); + let use_full_span = !from_container_end && at_document_end; + + let end_pos = if use_full_span { + // Standalone number: clamp to buffer length to prevent slice bounds errors + core::cmp::min(current_pos, self.buffer.data_len()) + } else { + // Container number: exclude delimiter + current_pos.saturating_sub(1) + }; + + // Note: finished parameter is ignored for SliceParser as slices are always complete + let _ = finished; // Explicitly acknowledge parameter while explaining why it's unused crate::number_parser::parse_number_event(&self.buffer, start_pos, end_pos) } } @@ -261,9 +194,9 @@ impl EscapeHandler for SliceContentBuilder<'_, '_> { Ok(()) } - fn append_literal_byte(&mut self, _byte: u8) -> Result<(), ParseError> { - // SliceParser doesn't typically need per-byte processing since it works with ranges - // This could be implemented as a single-byte range if needed, but for now it's a no-op + fn begin_escape_sequence(&mut self) -> Result<(), ParseError> { + // Set escape flag to prevent literal byte accumulation during escape processing + self.in_escape_sequence = true; Ok(()) } } diff --git a/picojson/src/slice_input_buffer.rs b/picojson/src/slice_input_buffer.rs index 0b343f4..4a4ee9c 100644 --- a/picojson/src/slice_input_buffer.rs +++ b/picojson/src/slice_input_buffer.rs @@ -54,6 +54,11 @@ impl<'a> SliceInputBuffer<'a> { pub fn slice(&self, start: usize, end: usize) -> Result<&'a [u8], Error> { self.data.get(start..end).ok_or(Error::InvalidSliceBounds) } + + /// Gets the length of the underlying data for bounds checking. + pub fn data_len(&self) -> usize { + self.data.len() + } } impl crate::number_parser::NumberExtractor for SliceInputBuffer<'_> { diff --git a/picojson/src/stream_buffer.rs b/picojson/src/stream_buffer.rs index 23929b0..ead407d 100644 --- a/picojson/src/stream_buffer.rs +++ b/picojson/src/stream_buffer.rs @@ -1005,13 +1005,6 @@ impl crate::number_parser::NumberExtractor for StreamBuffer<'_> { } fn is_empty(&self) -> bool { - let result = self.tokenize_pos >= self.data_end; - log::debug!( - "[NEW] StreamBuffer::is_empty(): tokenize_pos={}, data_end={}, result={}", - self.tokenize_pos, - self.data_end, - result - ); - result + self.tokenize_pos >= self.data_end } } diff --git a/picojson/src/stream_content_builder.rs b/picojson/src/stream_content_builder.rs index cf8888a..c3f511d 100644 --- a/picojson/src/stream_content_builder.rs +++ b/picojson/src/stream_content_builder.rs @@ -2,7 +2,6 @@ //! ContentBuilder implementation for StreamParser using StreamBuffer. -use crate::content_builder::ContentBuilder; use crate::escape_processor::UnicodeEscapeCollector; use crate::event_processor::{ContentExtractor, EscapeHandler}; use crate::shared::{ContentRange, State}; @@ -100,10 +99,6 @@ impl<'b> StreamContentBuilder<'b> { if let State::String(start_pos) | State::Key(start_pos) = self.parser_state { let current_pos = self.stream_buffer.current_position(); - log::debug!( - "[NEW] start_escape_processing: start_pos={start_pos}, current_pos={current_pos}" - ); - // start_pos already points to content start position (not quote position) let content_start = start_pos; // Content to copy ends right before the escape character @@ -122,9 +117,6 @@ impl<'b> StreamContentBuilder<'b> { .checked_add(content_len) .ok_or(ParseError::NumericOverflow)?; - log::debug!("[NEW] start_escape_processing: content_start={}, content_end={}, copying {} bytes", - content_start, content_end, content_end.wrapping_sub(content_start)); - // Start unescaping with StreamBuffer and copy existing content self.stream_buffer.start_unescaping_with_copy( max_escaped_len, @@ -136,113 +128,15 @@ impl<'b> StreamContentBuilder<'b> { Ok(()) } -} - -impl ContentBuilder for StreamContentBuilder<'_> { - fn begin_content(&mut self, _pos: usize, _is_key: bool) { - // StreamParser doesn't need explicit content begin processing - // as it handles content accumulation automatically - } - - fn handle_simple_escape(&mut self, escape_char: u8) -> Result<(), ParseError> { - // Clear the escape sequence flag when simple escape completes - self.in_escape_sequence = false; - - self.stream_buffer - .append_unescaped_byte(escape_char) - .map_err(ParseError::from) - } - - fn handle_unicode_escape(&mut self, utf8_bytes: &[u8]) -> Result<(), ParseError> { - log::debug!( - "STREAM CONTENT BUILDER handle_unicode_escape called with utf8_bytes={:?}", - utf8_bytes - ); - // StreamParser handles all escape sequences the same way - append bytes to escape buffer - for &byte in utf8_bytes { - self.stream_buffer - .append_unescaped_byte(byte) - .map_err(ParseError::from)?; - } - Ok(()) - } - - fn append_literal_byte(&mut self, byte: u8) -> Result<(), ParseError> { - // Check if we're in a string or key state and should accumulate bytes - let in_string_mode = matches!(self.parser_state, State::String(_) | State::Key(_)); - - if in_string_mode { - // CRITICAL FIX: Follow old implementation pattern - do NOT write to escape buffer - // when inside ANY escape sequence (in_escape_sequence == true) - // This prevents hex digits from being accumulated as literal text - if !self.in_escape_sequence - && self.stream_buffer.has_unescaped_content() - && !self.unicode_escape_collector.has_pending_high_surrogate() - { - log::debug!("[NEW] Writing literal byte to escape buffer: {:02x}", byte); - self.stream_buffer - .append_unescaped_byte(byte) - .map_err(ParseError::from)?; - } else { - log::debug!("[NEW] Skipping literal byte (in_escape_sequence={}, in_unicode_escape={}, has_unescaped={}, pending_surrogate={})", - self.in_escape_sequence, self.in_unicode_escape, - self.stream_buffer.has_unescaped_content(), - self.unicode_escape_collector.has_pending_high_surrogate()); - } - } - - Ok(()) - } - - fn begin_escape_sequence(&mut self) -> Result<(), ParseError> { - log::debug!("[NEW] begin_escape_sequence() called"); - self.in_escape_sequence = true; - self.start_escape_processing() - } - - fn extract_string(&mut self, start_pos: usize) -> Result, ParseError> { - if self.stream_buffer.has_unescaped_content() { - self.create_unescaped_string() - } else { - self.create_borrowed_string(start_pos) - } - } - - fn extract_key(&mut self, start_pos: usize) -> Result, ParseError> { - if self.stream_buffer.has_unescaped_content() { - self.queue_unescaped_reset(); - let unescaped_slice = self.stream_buffer.get_unescaped_slice()?; - let str_content = crate::shared::from_utf8(unescaped_slice)?; - Ok(String::Unescaped(str_content)) - } else { - self.create_borrowed_string(start_pos) - } - } - - fn extract_number( - &mut self, - start_pos: usize, - from_container_end: bool, - finished: bool, - ) -> Result, ParseError> { - // Use shared number parsing with StreamParser-specific document end detection - // StreamParser uses state-based detection: finished flag indicates true document end - let at_document_end = finished; - log::debug!("[NEW] extract_number: start_pos={start_pos}, from_container_end={from_container_end}, finished={finished}, at_document_end={at_document_end}"); - crate::number_parser::parse_number_with_delimiter_logic( - &self.stream_buffer, - start_pos, - from_container_end, - at_document_end, - ) - } - fn current_position(&self) -> usize { - self.stream_buffer.current_position() + /// Get read-only access to in_escape_sequence flag + pub fn in_escape_sequence(&self) -> bool { + self.in_escape_sequence } - fn is_exhausted(&self) -> bool { - self.stream_buffer.is_empty() + /// Get read-only access to unicode_escape_collector + pub fn unicode_escape_collector(&self) -> &UnicodeEscapeCollector { + &self.unicode_escape_collector } } @@ -293,8 +187,23 @@ impl ContentExtractor for StreamContentBuilder<'_> { // Use shared number parsing with StreamParser-specific document end detection // StreamParser uses state-based detection: finished flag indicates true document end let at_document_end = self.finished; - log::debug!("[NEW] extract_number_content: start_pos={}, from_container_end={}, finished={}, at_document_end={}", - start_pos, from_container_end, self.finished, at_document_end); + crate::number_parser::parse_number_with_delimiter_logic( + &self.stream_buffer, + start_pos, + from_container_end, + at_document_end, + ) + } + + fn extract_number( + &mut self, + start_pos: usize, + from_container_end: bool, + finished: bool, + ) -> Result, ParseError> { + // Use shared number parsing with StreamParser-specific document end detection + // StreamParser uses state-based detection: finished flag indicates true document end + let at_document_end = finished; crate::number_parser::parse_number_with_delimiter_logic( &self.stream_buffer, start_pos, @@ -344,13 +253,12 @@ impl EscapeHandler for StreamContentBuilder<'_> { }; let mut utf8_buf = [0u8; 4]; - let (utf8_bytes_opt, escape_start_pos) = - crate::escape_processor::process_unicode_escape_sequence( - current_pos, - &mut self.unicode_escape_collector, - hex_slice_provider, - &mut utf8_buf, - )?; + let (utf8_bytes_opt, _) = crate::escape_processor::process_unicode_escape_sequence( + current_pos, + &mut self.unicode_escape_collector, + hex_slice_provider, + &mut utf8_buf, + )?; // Copy UTF-8 bytes to avoid borrow conflicts utf8_bytes_opt.map(|bytes| { let mut copy = [0u8; 4]; @@ -386,31 +294,8 @@ impl EscapeHandler for StreamContentBuilder<'_> { .map_err(ParseError::from) } - fn append_literal_byte(&mut self, byte: u8) -> Result<(), ParseError> { - // Check if we're in a string or key state and should accumulate bytes - let in_string_mode = matches!(self.parser_state, State::String(_) | State::Key(_)); - - if in_string_mode { - // CRITICAL FIX: Follow old implementation pattern - do NOT write to escape buffer - // when inside ANY escape sequence (in_escape_sequence == true) - // This prevents hex digits from being accumulated as literal text - if !self.in_escape_sequence - && self.stream_buffer.has_unescaped_content() - && !self.unicode_escape_collector.has_pending_high_surrogate() - { - self.stream_buffer - .append_unescaped_byte(byte) - .map_err(ParseError::from)?; - } - } - - Ok(()) - } - fn begin_escape_sequence(&mut self) -> Result<(), ParseError> { - log::debug!("[NEW] EscapeHandler::begin_escape_sequence() called"); self.in_escape_sequence = true; - // Delegate to ContentBuilder implementation - ContentBuilder::begin_escape_sequence(self) + self.start_escape_processing() } } diff --git a/picojson/src/stream_parser.rs b/picojson/src/stream_parser.rs index 5fd910e..79fefd6 100644 --- a/picojson/src/stream_parser.rs +++ b/picojson/src/stream_parser.rs @@ -3,7 +3,7 @@ use crate::event_processor::{ContentExtractor, EscapeHandler}; use crate::parse_error::ParseError; use crate::parser_core::ParserCore; -use crate::shared::{ByteProvider, Event}; +use crate::shared::{ByteProvider, Event, State}; use crate::stream_content_builder::StreamContentBuilder; use crate::{ujson, PullParser}; @@ -92,14 +92,6 @@ impl StreamParser<'_, R, C> { loop { while !crate::event_processor::have_events(&self.parser_core.parser_state.evts) { if let Some(byte) = self.next_byte()? { - let pos_before = self.content_builder.current_position(); - log::trace!( - "[NEW] Processing byte '{}' ({}) at pos {}", - byte as char, - byte, - pos_before - ); - crate::event_processor::process_byte_through_tokenizer( byte, &mut self.parser_core.tokenizer, @@ -108,11 +100,9 @@ impl StreamParser<'_, R, C> { let has_events = crate::event_processor::have_events(&self.parser_core.parser_state.evts); - log::trace!("[NEW] After tokenizer: has_events={has_events}"); // Handle byte accumulation if no event was generated (StreamParser-specific) if !has_events { - log::trace!("[NEW] No events generated, calling handle_byte_accumulation"); self.handle_byte_accumulation(byte)?; } } else { @@ -210,8 +200,8 @@ impl StreamParser<'_, R, C> { *self.content_builder.parser_state_mut() = crate::shared::State::None; - // Use the new ContentBuilder::extract_number method with finished state - use crate::content_builder::ContentBuilder; + // Use the ContentExtractor::extract_number method with finished state + use crate::event_processor::ContentExtractor; self.content_builder .extract_number(start_pos, from_container_end, self.finished) } @@ -373,9 +363,30 @@ impl StreamParser<'_, R, C> { /// Handle byte accumulation for strings/keys and Unicode escape sequences fn handle_byte_accumulation(&mut self, byte: u8) -> Result<(), ParseError> { - // Use ContentBuilder's literal byte append logic - use crate::content_builder::ContentBuilder; - ContentBuilder::append_literal_byte(&mut self.content_builder, byte) + // Check if we're in a string or key state and should accumulate bytes + let in_string_mode = matches!( + *self.content_builder.parser_state(), + State::String(_) | State::Key(_) + ); + + if in_string_mode { + // Follow old implementation pattern - do NOT write to escape buffer + // when inside ANY escape sequence (in_escape_sequence == true) + // This prevents hex digits from being accumulated as literal text + if !self.content_builder.in_escape_sequence() + && self.content_builder.stream_buffer().has_unescaped_content() + && !self + .content_builder + .unicode_escape_collector() + .has_pending_high_surrogate() + { + self.content_builder + .stream_buffer_mut() + .append_unescaped_byte(byte) + .map_err(ParseError::from)?; + } + } + Ok(()) } } @@ -488,7 +499,7 @@ mod tests { assert!(matches!(event, Event::EndDocument)); } - #[test_log::test] + #[test] fn test_direct_parser_simple_escape() { let json = b"\"hello\\nworld\""; let reader = SliceReader::new(json); diff --git a/picojson/tests/debug_root_numbers.rs b/picojson/tests/debug_root_numbers.rs deleted file mode 100644 index e19feed..0000000 --- a/picojson/tests/debug_root_numbers.rs +++ /dev/null @@ -1,140 +0,0 @@ -// Debug root-level number parsing issue -use picojson::{ChunkReader, Event, PullParser, SliceParser, StreamParser}; - -fn test_json(input: &str, description: &str) { - println!("\n=== Testing: {} ===", description); - println!("Input: '{}'", input); - - let mut scratch = [0u8; 1024]; - let mut parser = SliceParser::with_buffer(input, &mut scratch); - - let mut event_count = 0; - loop { - match parser.next_event() { - Ok(event) => { - event_count += 1; - println!("Event {}: {:?}", event_count, event); - if matches!(event, Event::EndDocument) { - break; - } - if event_count > 10 { - println!("Too many events, stopping..."); - break; - } - } - Err(e) => { - println!("Error: {:?}", e); - break; - } - } - } - println!("Total events: {}", event_count); -} - -#[test] -fn debug_root_level_numbers() { - // Test root-level primitives - test_json("42", "Root number"); - test_json(r#""hello""#, "Root string"); - test_json("true", "Root boolean true"); - test_json("false", "Root boolean false"); - test_json("null", "Root null"); - - // Compare with structured JSON - test_json(r#"{"value": 42}"#, "Small number in object"); - test_json(r#"{"value": 9999999999}"#, "Large number in object"); - test_json("[42]", "Small number in array"); - test_json("[9999999999]", "Large number in array"); -} - -#[test] -fn debug_lonely_int() { - println!("=== DEBUGGING LONELY INT: '42' ==="); - - // Test with SliceParser with debug logging - println!("\n--- SliceParser with RUST_LOG=debug ---"); - let input = "42"; - let mut scratch = [0u8; 1024]; - let mut parser = SliceParser::with_buffer(input, &mut scratch); - - match parser.next_event() { - Ok(event) => { - println!("SliceParser event: {:?}", event); - if let Event::Number(num) = &event { - println!("Number str: '{}'", num.as_str()); - println!("Number as_int: {:?}", num.as_int()); - } - } - Err(e) => { - println!("SliceParser error: {:?}", e); - } - } - - // Test with StreamParser - println!("\n--- StreamParser ---"); - let input = "42"; - let mut scratch = [0u8; 1024]; - let reader = ChunkReader::new(input.as_bytes(), 1024); - let mut parser = StreamParser::new(reader, &mut scratch); - - match parser.next_event() { - Ok(event) => { - println!("StreamParser event: {:?}", event); - if let Event::Number(num) = &event { - println!("Number str: '{}'", num.as_str()); - println!("Number as_int: {:?}", num.as_int()); - } - } - Err(e) => { - println!("StreamParser error: {:?}", e); - } - } -} - -#[test] -fn debug_lonely_negative_real() { - println!("=== DEBUGGING LONELY NEGATIVE REAL: '-0.1' ==="); - - // Test with SliceParser - println!("\n--- SliceParser ---"); - let input = "-0.1"; - let mut scratch = [0u8; 1024]; - let mut parser = SliceParser::with_buffer(input, &mut scratch); - - match parser.next_event() { - Ok(event) => { - println!("SliceParser event: {:?}", event); - if let Event::Number(num) = &event { - println!("Number str: '{}'", num.as_str()); - println!("Number as_int: {:?}", num.as_int()); - #[cfg(feature = "float")] - println!("Number as_f64: {:?}", num.as_f64()); - } - } - Err(e) => { - println!("SliceParser error: {:?}", e); - } - } - - // Test with StreamParser - println!("\n--- StreamParser ---"); - let input = "-0.1"; - let mut scratch = [0u8; 1024]; - let reader = ChunkReader::new(input.as_bytes(), 1024); - let mut parser = StreamParser::new(reader, &mut scratch); - - match parser.next_event() { - Ok(event) => { - println!("StreamParser event: {:?}", event); - if let Event::Number(num) = &event { - println!("Number str: '{}'", num.as_str()); - println!("Number as_int: {:?}", num.as_int()); - #[cfg(feature = "float")] - println!("Number as_f64: {:?}", num.as_f64()); - } - } - Err(e) => { - println!("StreamParser error: {:?}", e); - } - } -} diff --git a/picojson/tests/debug_stress_case.rs b/picojson/tests/debug_stress_case.rs deleted file mode 100644 index c50dd96..0000000 --- a/picojson/tests/debug_stress_case.rs +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -use picojson::{ChunkReader, Event, PullParser, StreamParser}; - -#[test] -fn debug_specific_failure_case() { - // Test the specific failing case: buffer=20, chunk=1 - let json = br#"{"hello": "world", "count": 42}"#; - println!("JSON: {:?}", std::str::from_utf8(json).unwrap()); - println!("JSON length: {} bytes", json.len()); - - let reader = ChunkReader::new(json, 1); // 1-byte chunks - let mut buffer = vec![0u8; 20]; // 20-byte buffer - let mut parser = StreamParser::new(reader, &mut buffer); - - println!("\nParsing events:"); - for i in 0..10 { - match parser.next_event() { - Ok(event) => { - println!("Event {}: {:?}", i, event); - if matches!(event, Event::EndDocument) { - break; - } - } - Err(e) => { - println!("Error at event {}: {:?}", i, e); - break; - } - } - } -} diff --git a/picojson/tests/stream_parser_stress_test.rs b/picojson/tests/stream_parser_stress_test.rs index d0586b0..f3365d8 100644 --- a/picojson/tests/stream_parser_stress_test.rs +++ b/picojson/tests/stream_parser_stress_test.rs @@ -330,7 +330,7 @@ fn test_stress_matrix_critical_sizes() { } } -#[test_log::test] +#[test] fn test_stress_variable_read_sizes() { let scenarios = get_test_scenarios(); let patterns: &[&[usize]] = &[&[1, 5, 2], &[7, 1, 1, 10], &[1]]; From a9286bee56bb2d1e5e2796bd7c2e99dc6d912df3 Mon Sep 17 00:00:00 2001 From: Kaido Kert Date: Tue, 15 Jul 2025 00:30:28 -0700 Subject: [PATCH 07/10] More cleaning --- picojson/src/parser_core.rs | 28 ++ picojson/src/stream_buffer.rs | 9 +- picojson/src/stream_content_builder.rs | 280 +++++++++++++++- picojson/src/stream_parser.rs | 437 ++++++++----------------- 4 files changed, 436 insertions(+), 318 deletions(-) diff --git a/picojson/src/parser_core.rs b/picojson/src/parser_core.rs index 8bc3ffb..8e9442c 100644 --- a/picojson/src/parser_core.rs +++ b/picojson/src/parser_core.rs @@ -49,6 +49,21 @@ impl ParserCore { ) -> Result, ParseError> where P: ParserProvider, + { + self.next_event_impl_unified_with_accumulator(provider, escape_timing, |_, _| Ok(())) + } + + /// Unified implementation with optional byte accumulation callback. + /// This supports StreamParser-specific byte accumulation when no events are generated. + pub fn next_event_impl_unified_with_accumulator<'a, P, F>( + &mut self, + provider: &'a mut P, + escape_timing: EscapeTiming, + mut byte_accumulator: F, + ) -> Result, ParseError> + where + P: ParserProvider, + F: FnMut(&mut P, u8) -> Result<(), ParseError>, { loop { while !have_events(&self.parser_state.evts) { @@ -58,7 +73,14 @@ impl ParserCore { &mut self.tokenizer, &mut self.parser_state.evts, )?; + + // Call byte accumulator if no events were generated (StreamParser-specific) + if !have_events(&self.parser_state.evts) { + byte_accumulator(provider, byte)?; + } } else { + // Handle end of stream - let the provider handle any cleanup + // For StreamParser, this is where finished flag gets set finish_tokenizer(&mut self.tokenizer, &mut self.parser_state.evts)?; if !have_events(&self.parser_state.evts) { @@ -133,6 +155,12 @@ impl ParserCore { } } +impl Default for ParserCore { + fn default() -> Self { + Self::new() + } +} + /// Enum to specify when escape sequences should be processed #[derive(Debug, Clone, Copy, PartialEq)] pub enum EscapeTiming { diff --git a/picojson/src/stream_buffer.rs b/picojson/src/stream_buffer.rs index ead407d..577ff66 100644 --- a/picojson/src/stream_buffer.rs +++ b/picojson/src/stream_buffer.rs @@ -146,10 +146,14 @@ impl<'a> StreamBuffer<'a> { return Err(StreamBufferError::BufferFull); } - let src_range = start_offset..start_offset.wrapping_add(span_len); - if src_range.end > self.buffer.len() { + let src_range_end = start_offset + .checked_add(span_len) + .ok_or(StreamBufferError::InvalidSliceBounds)?; + + if src_range_end > self.buffer.len() { return Err(StreamBufferError::InvalidSliceBounds); } + let src_range = start_offset..src_range_end; // Copy within the same buffer: move data from [start_offset..end] to [0..span_len] // Use our panic-free copy implementation @@ -157,6 +161,7 @@ impl<'a> StreamBuffer<'a> { } // Update positions + let _old_tokenize_pos = self.tokenize_pos; self.tokenize_pos = self.tokenize_pos.saturating_sub(offset); self.data_end = remaining_data; diff --git a/picojson/src/stream_content_builder.rs b/picojson/src/stream_content_builder.rs index c3f511d..42946f0 100644 --- a/picojson/src/stream_content_builder.rs +++ b/picojson/src/stream_content_builder.rs @@ -40,6 +40,85 @@ impl<'b> StreamContentBuilder<'b> { } } + /// Fill the buffer from a reader + pub fn fill_buffer_from_reader( + &mut self, + reader: &mut R, + ) -> Result<(), crate::ParseError> { + // If buffer is full, try to compact it first (original compaction logic) + if self.stream_buffer.get_fill_slice().is_none() { + // Buffer is full - ALWAYS attempt compaction + let compact_start_pos = match self.parser_state { + crate::shared::State::Number(start_pos) => start_pos, + crate::shared::State::Key(start_pos) => start_pos, + crate::shared::State::String(start_pos) => start_pos, + _ => self.stream_buffer.current_position(), + }; + + let compaction_offset = self + .stream_buffer + .compact_from(compact_start_pos) + .map_err(crate::ParseError::from)?; + + if compaction_offset == 0 { + // SOL: Buffer too small for current token + return Err(crate::ParseError::ScratchBufferFull); + } + + // Update parser state positions after compaction (original logic) + self.update_positions_after_compaction(compaction_offset)?; + } + + if let Some(fill_slice) = self.stream_buffer.get_fill_slice() { + let bytes_read = reader + .read(fill_slice) + .map_err(|_| crate::ParseError::ReaderError)?; + self.stream_buffer + .mark_filled(bytes_read) + .map_err(crate::ParseError::from)?; + } + Ok(()) + } + + /// Update parser state positions after compaction (original logic) + fn update_positions_after_compaction( + &mut self, + compaction_offset: usize, + ) -> Result<(), crate::ParseError> { + // Update positions - since we compact from the token start position, + // positions should not be discarded in normal operation + match &mut self.parser_state { + crate::shared::State::None => { + // No position-based state to update + } + crate::shared::State::Key(pos) => { + if *pos >= compaction_offset { + *pos = pos.checked_sub(compaction_offset).unwrap_or(0); + } else { + // This shouldn't happen since we compact from the token start + *pos = 0; + } + } + crate::shared::State::String(pos) => { + if *pos >= compaction_offset { + *pos = pos.checked_sub(compaction_offset).unwrap_or(0); + } else { + // This shouldn't happen since we compact from the token start + *pos = 0; + } + } + crate::shared::State::Number(pos) => { + if *pos >= compaction_offset { + *pos = pos.checked_sub(compaction_offset).unwrap_or(0); + } else { + // This shouldn't happen since we compact from the token start + *pos = 0; + } + } + } + Ok(()) + } + /// Get access to the stream buffer for byte operations pub fn stream_buffer(&self) -> &StreamBuffer<'b> { &self.stream_buffer @@ -128,16 +207,6 @@ impl<'b> StreamContentBuilder<'b> { Ok(()) } - - /// Get read-only access to in_escape_sequence flag - pub fn in_escape_sequence(&self) -> bool { - self.in_escape_sequence - } - - /// Get read-only access to unicode_escape_collector - pub fn unicode_escape_collector(&self) -> &UnicodeEscapeCollector { - &self.unicode_escape_collector - } } impl ContentExtractor for StreamContentBuilder<'_> { @@ -215,7 +284,10 @@ impl ContentExtractor for StreamContentBuilder<'_> { impl crate::shared::ByteProvider for StreamContentBuilder<'_> { fn next_byte(&mut self) -> Result, crate::ParseError> { - // If buffer is empty, cannot provide bytes (let StreamParser handle filling) + // This implementation doesn't have access to the reader + // It relies on StreamParser to fill the buffer before calling the unified method + + // If buffer is empty, cannot provide bytes if self.stream_buffer.is_empty() { return Ok(None); } @@ -223,10 +295,196 @@ impl crate::shared::ByteProvider for StreamContentBuilder<'_> { // Get byte and advance let byte = self.stream_buffer.current_byte()?; self.stream_buffer.advance()?; + Ok(Some(byte)) } } +/// Custom ByteProvider that can handle reader filling for StreamParser +pub struct StreamContentBuilderWithFiller<'a, 'b, R: crate::stream_parser::Reader> { + content_builder: &'a mut StreamContentBuilder<'b>, + reader: &'a mut R, + finished: &'a mut bool, +} + +impl StreamContentBuilderWithFiller<'_, '_, R> {} + +impl crate::shared::ByteProvider + for StreamContentBuilderWithFiller<'_, '_, R> +{ + fn next_byte(&mut self) -> Result, crate::ParseError> { + // If buffer is empty, try to fill it first + if self.content_builder.stream_buffer.is_empty() { + self.content_builder.fill_buffer_from_reader(self.reader)?; + } + + // If still empty after fill attempt, we're at EOF + if self.content_builder.stream_buffer.is_empty() { + // Set finished flag when we reach end of stream + if !*self.finished { + *self.finished = true; + self.content_builder.set_finished(true); + } + return Ok(None); + } + + // Get byte and advance + let byte = self.content_builder.stream_buffer.current_byte()?; + self.content_builder.stream_buffer.advance()?; + Ok(Some(byte)) + } +} + +/// Provider that can fill the buffer from a reader +pub struct StreamContentBuilderWithReader<'a, 'b, R: crate::stream_parser::Reader> { + pub content_builder: &'a mut StreamContentBuilder<'b>, + reader: &'a mut R, + finished: &'a mut bool, +} + +impl StreamContentBuilderWithReader<'_, '_, R> {} + +impl crate::shared::ByteProvider + for StreamContentBuilderWithReader<'_, '_, R> +{ + fn next_byte(&mut self) -> Result, crate::ParseError> { + // If buffer is empty, try to fill it first + if self.content_builder.stream_buffer.is_empty() { + if let Some(fill_slice) = self.content_builder.stream_buffer.get_fill_slice() { + let bytes_read = self + .reader + .read(fill_slice) + .map_err(|_| crate::ParseError::ReaderError)?; + self.content_builder + .stream_buffer + .mark_filled(bytes_read) + .map_err(crate::ParseError::from)?; + } + } + + // If still empty after fill attempt, we're at EOF + if self.content_builder.stream_buffer.is_empty() { + // Set finished flag when we reach end of stream + if !*self.finished { + *self.finished = true; + self.content_builder.set_finished(true); + } + return Ok(None); + } + + // Get byte and advance + let byte = self.content_builder.stream_buffer.current_byte()?; + self.content_builder.stream_buffer.advance()?; + Ok(Some(byte)) + } +} + +impl crate::event_processor::ContentExtractor + for StreamContentBuilderWithReader<'_, '_, R> +{ + fn parser_state_mut(&mut self) -> &mut crate::shared::State { + self.content_builder.parser_state_mut() + } + + fn current_position(&self) -> usize { + self.content_builder.current_position() + } + + fn begin_string_content(&mut self, pos: usize) { + self.content_builder.begin_string_content(pos); + } + + fn unicode_escape_collector_mut( + &mut self, + ) -> &mut crate::escape_processor::UnicodeEscapeCollector { + self.content_builder.unicode_escape_collector_mut() + } + + fn extract_string_content( + &mut self, + start_pos: usize, + ) -> Result, crate::ParseError> { + self.content_builder.extract_string_content(start_pos) + } + + fn extract_key_content( + &mut self, + start_pos: usize, + ) -> Result, crate::ParseError> { + self.content_builder.extract_key_content(start_pos) + } + + fn extract_number_content( + &mut self, + start_pos: usize, + from_container_end: bool, + ) -> Result, crate::ParseError> { + self.content_builder + .extract_number_content(start_pos, from_container_end) + } + + fn extract_number( + &mut self, + start_pos: usize, + from_container_end: bool, + finished: bool, + ) -> Result, crate::ParseError> { + self.content_builder + .extract_number(start_pos, from_container_end, finished) + } +} + +impl crate::event_processor::EscapeHandler + for StreamContentBuilderWithReader<'_, '_, R> +{ + fn parser_state(&self) -> &crate::shared::State { + self.content_builder.parser_state() + } + + fn process_unicode_escape_with_collector(&mut self) -> Result<(), crate::ParseError> { + self.content_builder.process_unicode_escape_with_collector() + } + + fn handle_simple_escape_char(&mut self, escape_char: u8) -> Result<(), crate::ParseError> { + self.content_builder.handle_simple_escape_char(escape_char) + } + + fn begin_escape_sequence(&mut self) -> Result<(), crate::ParseError> { + self.content_builder.begin_escape_sequence() + } + + fn begin_unicode_escape(&mut self) -> Result<(), crate::ParseError> { + self.content_builder.begin_unicode_escape() + } +} + +impl StreamContentBuilder<'_> { + /// Handle byte accumulation for StreamParser-specific requirements + /// This method is called when a byte doesn't generate any events + pub fn handle_byte_accumulation(&mut self, byte: u8) -> Result<(), crate::ParseError> { + // Check if we're in a string or key state and should accumulate bytes + let in_string_mode = matches!(self.parser_state, State::String(_) | State::Key(_)); + + if in_string_mode { + // When unescaped content is active, we need to accumulate ALL string content + // This includes both regular characters and content after escape sequences + if self.stream_buffer.has_unescaped_content() { + // Follow old implementation pattern - do NOT write to escape buffer + // when inside ANY escape sequence (in_escape_sequence == true) + // This prevents hex digits from being accumulated as literal text + if !self.in_escape_sequence + && !self.unicode_escape_collector.has_pending_high_surrogate() + { + self.stream_buffer + .append_unescaped_byte(byte) + .map_err(crate::ParseError::from)?; + } + } + } + Ok(()) + } +} + impl EscapeHandler for StreamContentBuilder<'_> { fn parser_state(&self) -> &State { &self.parser_state diff --git a/picojson/src/stream_parser.rs b/picojson/src/stream_parser.rs index 79fefd6..9cb82dd 100644 --- a/picojson/src/stream_parser.rs +++ b/picojson/src/stream_parser.rs @@ -32,12 +32,9 @@ pub trait Reader { pub struct StreamParser<'b, R: Reader, C: BitStackConfig = DefaultConfig> { /// The shared parser core that handles the unified event processing loop parser_core: ParserCore, - /// The content builder that handles StreamParser-specific content extraction - content_builder: StreamContentBuilder<'b>, - /// Reader for streaming input - reader: R, - /// Flag to track if input is finished - finished: bool, + /// The unified provider that handles both content building and reader access + /// This allows us to use the same unified pattern as SliceParser + provider: StreamParserProvider<'b, R>, } /// Methods for StreamParser using DefaultConfig @@ -73,6 +70,26 @@ impl<'b, R: Reader, C: BitStackConfig> StreamParser<'b, R, C> { pub fn with_config(reader: R, buffer: &'b mut [u8]) -> Self { Self { parser_core: ParserCore::new(), + provider: StreamParserProvider::new(reader, buffer), + } + } +} + +/// Implement the required traits for StreamParser to work with unified ParserCore +/// This provider handles the StreamParser-specific operations needed by the unified parser core +/// It bridges the gap between the generic ParserCore and the StreamParser's specific requirements +/// for streaming input and buffer management +/// The provider contains mutable references to the StreamParser's internal state +/// which allows the unified parser core to control the parsing process +pub struct StreamParserProvider<'b, R: Reader> { + content_builder: StreamContentBuilder<'b>, + reader: R, + finished: bool, +} + +impl<'b, R: Reader> StreamParserProvider<'b, R> { + pub fn new(reader: R, buffer: &'b mut [u8]) -> Self { + Self { content_builder: StreamContentBuilder::new(buffer), reader, finished: false, @@ -80,340 +97,150 @@ impl<'b, R: Reader, C: BitStackConfig> StreamParser<'b, R, C> { } } -/// Shared methods for StreamParser with any BitStackConfig -impl StreamParser<'_, R, C> { - /// Get the next JSON event from the stream - fn next_event_impl(&mut self) -> Result, ParseError> { - // StreamParser has special requirements that prevent using ParserCore directly: - // 1. Byte accumulation logic when no events are generated - // 2. Buffer filling and compaction logic - // 3. Complex state management across buffer boundaries - // For now, use a custom loop that handles these StreamParser-specific needs - loop { - while !crate::event_processor::have_events(&self.parser_core.parser_state.evts) { - if let Some(byte) = self.next_byte()? { - crate::event_processor::process_byte_through_tokenizer( - byte, - &mut self.parser_core.tokenizer, - &mut self.parser_core.parser_state.evts, - )?; - - let has_events = - crate::event_processor::have_events(&self.parser_core.parser_state.evts); - - // Handle byte accumulation if no event was generated (StreamParser-specific) - if !has_events { - self.handle_byte_accumulation(byte)?; - } - } else { - // Handle end of data with tokenizer finish - if !self.finished { - self.finished = true; - self.content_builder.set_finished(true); - crate::event_processor::finish_tokenizer( - &mut self.parser_core.tokenizer, - &mut self.parser_core.parser_state.evts, - )?; - } - - if !crate::event_processor::have_events(&self.parser_core.parser_state.evts) { - return Ok(Event::EndDocument); - } - } - } - - let taken_event = - crate::event_processor::take_first_event(&mut self.parser_core.parser_state.evts); - let Some(taken) = taken_event else { - return Err(crate::shared::UnexpectedState::StateMismatch.into()); - }; - - // Try shared event processors first - if let Some(result) = - crate::event_processor::process_simple_events(&taken).or_else(|| { - crate::event_processor::process_begin_events(&taken, &mut self.content_builder) - }) - { - match result { - crate::event_processor::EventResult::Complete(event) => return Ok(event), - crate::event_processor::EventResult::ExtractString => { - return self.content_builder.validate_and_extract_string() - } - crate::event_processor::EventResult::ExtractKey => { - return self.content_builder.validate_and_extract_key() - } - crate::event_processor::EventResult::ExtractNumber(from_container_end) => { - return self.extract_number_with_finished_state(from_container_end) - } - crate::event_processor::EventResult::Continue => continue, - } - } +impl ByteProvider for StreamParserProvider<'_, R> { + fn next_byte(&mut self) -> Result, ParseError> { + // If buffer is empty, try to fill it first + if self.content_builder.stream_buffer().is_empty() { + self.content_builder + .fill_buffer_from_reader(&mut self.reader)?; + } - // Handle parser-specific events (StreamParser uses OnEnd timing) - match taken { - ujson::Event::Begin(crate::ujson::EventToken::EscapeSequence) => { - crate::event_processor::process_begin_escape_sequence_event( - &mut self.content_builder, - )?; - } - _ if crate::event_processor::process_unicode_escape_events( - &taken, - &mut self.content_builder, - )? => - { - // Unicode escape events handled by shared function - } - ujson::Event::End( - escape_token @ (crate::ujson::EventToken::EscapeQuote - | crate::ujson::EventToken::EscapeBackslash - | crate::ujson::EventToken::EscapeSlash - | crate::ujson::EventToken::EscapeBackspace - | crate::ujson::EventToken::EscapeFormFeed - | crate::ujson::EventToken::EscapeNewline - | crate::ujson::EventToken::EscapeCarriageReturn - | crate::ujson::EventToken::EscapeTab), - ) => { - // StreamParser-specific: Handle simple escape sequences on End events - // because StreamBuffer must wait until the token ends to accumulate - // all bytes before processing the complete escape sequence - crate::event_processor::process_simple_escape_event( - &escape_token, - &mut self.content_builder, - )?; - } - _ => { - // All other events continue to next iteration - } + // If still empty after fill attempt, we're at EOF + if self.content_builder.stream_buffer().is_empty() { + if !self.finished { + self.finished = true; + self.content_builder.set_finished(true); } + return Ok(None); } - } - /// Extract number with proper StreamParser document end detection - fn extract_number_with_finished_state( - &mut self, - from_container_end: bool, - ) -> Result, ParseError> { - let start_pos = match *self.content_builder.parser_state() { - crate::shared::State::Number(pos) => pos, - _ => return Err(crate::shared::UnexpectedState::StateMismatch.into()), - }; + // Get byte and advance + let byte = self.content_builder.stream_buffer().current_byte()?; + let _current_pos = self.content_builder.stream_buffer().current_position(); + self.content_builder.stream_buffer_mut().advance()?; + Ok(Some(byte)) + } +} - *self.content_builder.parser_state_mut() = crate::shared::State::None; +impl EscapeHandler for StreamParserProvider<'_, R> { + fn parser_state(&self) -> &State { + self.content_builder.parser_state() + } - // Use the ContentExtractor::extract_number method with finished state - use crate::event_processor::ContentExtractor; - self.content_builder - .extract_number(start_pos, from_container_end, self.finished) + fn process_unicode_escape_with_collector(&mut self) -> Result<(), ParseError> { + self.content_builder.process_unicode_escape_with_collector() } - /// Fill buffer from reader - fn fill_buffer_from_reader(&mut self) -> Result<(), ParseError> { - if let Some(fill_slice) = self.content_builder.stream_buffer_mut().get_fill_slice() { - let bytes_read = self - .reader - .read(fill_slice) - .map_err(|_| ParseError::ReaderError)?; + fn handle_simple_escape_char(&mut self, escape_char: u8) -> Result<(), ParseError> { + self.content_builder.handle_simple_escape_char(escape_char) + } - self.content_builder - .stream_buffer_mut() - .mark_filled(bytes_read)?; - } else { - // Buffer is full - ALWAYS attempt compaction - let compact_start_pos = match *self.content_builder.parser_state_mut() { - crate::shared::State::Number(start_pos) => start_pos, - crate::shared::State::Key(start_pos) => start_pos, - crate::shared::State::String(start_pos) => start_pos, - _ => self.content_builder.stream_buffer().current_position(), - }; - - let offset = self - .content_builder - .stream_buffer_mut() - .compact_from(compact_start_pos)?; - - if offset == 0 { - // SOL: Buffer too small for current token - return Err(ParseError::ScratchBufferFull); - } + fn begin_escape_sequence(&mut self) -> Result<(), ParseError> { + self.content_builder.begin_escape_sequence() + } - // Update parser state positions - self.update_positions_after_compaction(offset)?; + fn begin_unicode_escape(&mut self) -> Result<(), ParseError> { + self.content_builder.begin_unicode_escape() + } +} - // Try to fill again after compaction - if let Some(fill_slice) = self.content_builder.stream_buffer_mut().get_fill_slice() { - let bytes_read = self - .reader - .read(fill_slice) - .map_err(|_| ParseError::ReaderError)?; +impl ContentExtractor for StreamParserProvider<'_, R> { + fn parser_state_mut(&mut self) -> &mut State { + self.content_builder.parser_state_mut() + } - self.content_builder - .stream_buffer_mut() - .mark_filled(bytes_read)?; - } - } - Ok(()) + fn current_position(&self) -> usize { + self.content_builder.current_position() } - /// Update parser state positions after buffer compaction - fn update_positions_after_compaction(&mut self, offset: usize) -> Result<(), ParseError> { - // Check for positions that would be discarded and need escape mode - // CRITICAL: Position 0 is never discarded, regardless of offset - let needs_escape_mode = match self.content_builder.parser_state_mut() { - crate::shared::State::Key(pos) if *pos > 0 && *pos < offset => Some((*pos, true)), // true = is_key - crate::shared::State::String(pos) if *pos > 0 && *pos < offset => Some((*pos, false)), // false = is_string - crate::shared::State::Number(pos) if *pos > 0 && *pos < offset => { - return Err(ParseError::ScratchBufferFull); - } - _ => None, - }; + fn begin_string_content(&mut self, pos: usize) { + self.content_builder.begin_string_content(pos); + } - // Handle escape mode transition if needed - if let Some((original_pos, is_key)) = needs_escape_mode { - if is_key { - self.switch_key_to_escape_mode(original_pos, offset)?; - } else { - self.switch_string_to_escape_mode(original_pos, offset)?; - } - } + fn unicode_escape_collector_mut( + &mut self, + ) -> &mut crate::escape_processor::UnicodeEscapeCollector { + self.content_builder.unicode_escape_collector_mut() + } - // Update positions - match self.content_builder.parser_state_mut() { - crate::shared::State::None => { - // No position-based state to update - } - crate::shared::State::Key(pos) => { - if *pos > 0 && *pos < offset { - *pos = 0; // Reset for escape mode - } else if *pos >= offset { - *pos = pos.checked_sub(offset).unwrap_or(0); // Safe position adjustment - } - // else: *pos == 0 or *pos < offset with pos == 0, keep as-is - } - crate::shared::State::String(pos) => { - if *pos > 0 && *pos < offset { - *pos = 0; // Reset for escape mode - } else if *pos >= offset { - *pos = pos.checked_sub(offset).unwrap_or(0); // Safe position adjustment - } - // else: *pos == 0 or *pos < offset with pos == 0, keep as-is - } - crate::shared::State::Number(pos) => { - if *pos >= offset { - *pos = pos.checked_sub(offset).unwrap_or(0); // Safe position adjustment - } else { - *pos = 0; // Reset for discarded number start - } - } - } + fn extract_string_content(&mut self, start_pos: usize) -> Result, ParseError> { + self.content_builder.extract_string_content(start_pos) + } - Ok(()) + fn extract_key_content(&mut self, start_pos: usize) -> Result, ParseError> { + self.content_builder.extract_key_content(start_pos) } - /// Switch key processing to escape/copy mode when original position was discarded - fn switch_key_to_escape_mode( + fn extract_number_content( &mut self, - original_pos: usize, - offset: usize, - ) -> Result<(), ParseError> { - // The key start position was in the discarded portion of the buffer - - // For keys, the original_pos now points to the content start (after opening quote) - // If offset > original_pos, it means some actual content was discarded - - // Calculate how much actual key content was discarded - let content_start = original_pos; // Key content starts at original_pos (now tracks content directly) - let discarded_content = offset.saturating_sub(content_start); - - if discarded_content > 0 { - // We lost some actual key content - this would require content recovery - // For now, this is unsupported - return Err(ParseError::ScratchBufferFull); - } - - // No actual content was discarded, we can continue parsing - // We can continue parsing the key from the current position - Ok(()) + start_pos: usize, + from_container_end: bool, + ) -> Result, ParseError> { + self.content_builder + .extract_number_content(start_pos, from_container_end) } - /// Switch string processing to escape/copy mode when original position was discarded - fn switch_string_to_escape_mode( + fn extract_number( &mut self, - original_pos: usize, - offset: usize, - ) -> Result<(), ParseError> { - // The string start position was in the discarded portion of the buffer - - // For strings, the original_pos now points to the content start (after opening quote) - // If offset > original_pos, it means some actual content was discarded - - // Calculate how much actual string content was discarded - let content_start = original_pos; // String content starts at original_pos (now tracks content directly) - let discarded_content = offset.saturating_sub(content_start); - - if discarded_content > 0 { - // We lost some actual string content - this would require content recovery - // For now, this is unsupported - return Err(ParseError::ScratchBufferFull); - } - - // No actual content was discarded, we can continue parsing - // We can continue parsing the string from the current position - Ok(()) + start_pos: usize, + from_container_end: bool, + finished: bool, + ) -> Result, ParseError> { + self.content_builder + .extract_number(start_pos, from_container_end, finished) } - /// Handle byte accumulation for strings/keys and Unicode escape sequences - fn handle_byte_accumulation(&mut self, byte: u8) -> Result<(), ParseError> { - // Check if we're in a string or key state and should accumulate bytes - let in_string_mode = matches!( - *self.content_builder.parser_state(), - State::String(_) | State::Key(_) - ); + /// Override the default validate_and_extract_number to use the finished state + fn validate_and_extract_number( + &mut self, + from_container_end: bool, + ) -> Result, ParseError> { + use crate::event_processor::EscapeHandler; + let start_pos = match *self.parser_state() { + crate::shared::State::Number(pos) => pos, + _ => return Err(crate::shared::UnexpectedState::StateMismatch.into()), + }; - if in_string_mode { - // Follow old implementation pattern - do NOT write to escape buffer - // when inside ANY escape sequence (in_escape_sequence == true) - // This prevents hex digits from being accumulated as literal text - if !self.content_builder.in_escape_sequence() - && self.content_builder.stream_buffer().has_unescaped_content() - && !self - .content_builder - .unicode_escape_collector() - .has_pending_high_surrogate() - { - self.content_builder - .stream_buffer_mut() - .append_unescaped_byte(byte) - .map_err(ParseError::from)?; - } - } - Ok(()) + *self.parser_state_mut() = crate::shared::State::None; + // Use the finished-aware extract_number method instead of extract_number_content + self.extract_number(start_pos, from_container_end, self.finished) } } -impl PullParser for StreamParser<'_, R, C> { - fn next_event(&mut self) -> Result, ParseError> { - self.content_builder.apply_unescaped_reset_if_queued(); - - self.next_event_impl() +/// Shared methods for StreamParser with any BitStackConfig +impl StreamParser<'_, R, C> { + /// Get the next JSON event from the stream + fn next_event_impl(&mut self) -> Result, ParseError> { + // Use the unified ParserCore implementation with StreamParser-specific timing + // This achieves the same pattern as SliceParser: self.parser_core.next_event_impl_unified() + + // StreamParser needs byte accumulation for string parsing, so use the accumulator version + self.parser_core.next_event_impl_unified_with_accumulator( + &mut self.provider, + crate::parser_core::EscapeTiming::OnEnd, + |provider, byte| { + // Delegate to the StreamContentBuilder's byte accumulation logic + provider.content_builder.handle_byte_accumulation(byte) + }, + ) } + + // The compaction and helper methods are now handled by the provider + // These methods can be removed since they're not needed with the new architecture } -impl crate::shared::ByteProvider for StreamParser<'_, R, C> { - fn next_byte(&mut self) -> Result, ParseError> { - // If buffer is empty, try to fill it first - if self.content_builder.stream_buffer().is_empty() { - self.fill_buffer_from_reader()?; +impl PullParser for StreamParser<'_, R, C> { + fn next_event(&mut self) -> Result, ParseError> { + // Check if we're already finished (similar to SliceParser's is_past_end check) + if self.provider.finished { + return Ok(Event::EndDocument); } - // If still empty after fill attempt, we're at EOF - if self.content_builder.stream_buffer().is_empty() { - return Ok(None); - } + self.provider + .content_builder + .apply_unescaped_reset_if_queued(); - // Get byte and advance - let byte = self.content_builder.stream_buffer().current_byte()?; - self.content_builder.stream_buffer_mut().advance()?; - Ok(Some(byte)) + self.next_event_impl() } } From cdc45a9ebec240777c11a67db95ed4383138177a Mon Sep 17 00:00:00 2001 From: Kaido Kert Date: Tue, 15 Jul 2025 00:42:29 -0700 Subject: [PATCH 08/10] Fiiixups --- picojson/src/slice_parser.rs | 23 ---------------------- picojson/src/stream_buffer.rs | 1 - picojson/src/stream_parser.rs | 37 ++++++++--------------------------- 3 files changed, 8 insertions(+), 53 deletions(-) diff --git a/picojson/src/slice_parser.rs b/picojson/src/slice_parser.rs index e8e014b..0e98a7f 100644 --- a/picojson/src/slice_parser.rs +++ b/picojson/src/slice_parser.rs @@ -488,22 +488,6 @@ mod tests { #[test] fn test_unicode_escape_integration() { let input = r#"{"key": "Hello\u0041World"}"#; // \u0041 = 'A' - println!("=== SLICE PARSER DEBUG ==="); - println!("Input: {}", input); - println!("Expected in string: 'HelloAWorld'"); - - // Print byte positions for the string part - let string_start = input.find("Hello").unwrap(); - let string_part = &input[string_start..]; - println!("String part: '{}'", string_part); - for (i, &byte) in string_part.as_bytes().iter().enumerate() { - println!( - " pos {}: '{}' ({:02x})", - string_start + i, - byte as char, - byte - ); - } let mut scratch = [0u8; 1024]; let mut parser = SliceParser::with_buffer(input, &mut scratch); @@ -514,13 +498,6 @@ mod tests { // The string with Unicode escape should be unescaped match parser.next_event().unwrap() { Event::String(String::Unescaped(s)) => { - println!("=== RESULT ANALYSIS ==="); - println!("Expected: 'HelloAWorld'"); - println!("Got: '{}'", s); - println!("Character breakdown:"); - for (i, ch) in s.char_indices() { - println!(" [{}] '{}' = U+{:04X}", i, ch, ch as u32); - } assert_eq!(s, "HelloAWorld"); } other => panic!("Expected unescaped string value, got: {:?}", other), diff --git a/picojson/src/stream_buffer.rs b/picojson/src/stream_buffer.rs index 577ff66..11c21ad 100644 --- a/picojson/src/stream_buffer.rs +++ b/picojson/src/stream_buffer.rs @@ -723,7 +723,6 @@ mod tests { if _string_start_pos < offset { // Original string start was discarded - must use escape/copy mode // In real implementation, parser would copy what it had processed to unescaped buffer - println!("String start was discarded, switching to escape mode"); _string_start_pos = 0; // Reset for escape mode } else { _string_start_pos = _string_start_pos.saturating_sub(offset); // Normal position update diff --git a/picojson/src/stream_parser.rs b/picojson/src/stream_parser.rs index 9cb82dd..faf78f3 100644 --- a/picojson/src/stream_parser.rs +++ b/picojson/src/stream_parser.rs @@ -1197,52 +1197,31 @@ mod tests { // JSON: ["hello\\"] = 10 bytes total, should work with ~7 byte buffer let json_stream = br#"["hello\\"]"#; - println!("Testing escape sequence with buffer size: {}", buffer_size); - println!( - "JSON input: {:?}", - core::str::from_utf8(json_stream).unwrap() - ); let mut buffer = vec![0u8; buffer_size]; let mut parser = StreamParser::new(SliceReader::new(json_stream), &mut buffer); // Array start - match parser.next_event()? { - Event::StartArray => println!(" ✅ StartArray OK"), - other => panic!("Expected StartArray, got: {:?}", other), - } + assert!(matches!(parser.next_event()?, Event::StartArray)); // String with escape - match parser.next_event()? { - Event::String(s) => { - println!(" ✅ String OK: '{}'", s.as_str()); - assert_eq!(s.as_str(), "hello\\"); - } - other => panic!("Expected String, got: {:?}", other), - } + assert!(matches!(parser.next_event()?, Event::String(s) if s.as_str() == "hello\\")); // Array end - match parser.next_event()? { - Event::EndArray => println!(" ✅ EndArray OK"), - other => panic!("Expected EndArray, got: {:?}", other), - } + assert!(matches!(parser.next_event()?, Event::EndArray)); // End document - match parser.next_event()? { - Event::EndDocument => println!(" ✅ EndDocument OK"), - other => panic!("Expected EndDocument, got: {:?}", other), - } + assert!(matches!(parser.next_event()?, Event::EndDocument)); - println!(" 🎉 SUCCESS with buffer size: {}", buffer_size); Ok(()) } #[test] fn test_minimal_buffer_simple_escape_1() { // Buffer size 4 - clearly not enough - match test_simple_escape_with_buffer_size(4) { - Ok(()) => panic!("Expected failure with 4-byte buffer, but succeeded!"), - Err(e) => println!("Expected failure with 4-byte buffer: {:?}", e), - } + assert!(matches!( + test_simple_escape_with_buffer_size(4), + Err(crate::ParseError::ScratchBufferFull) + )); } #[test] From 8d596fc77fc957bd87c2f8403255febcf60b4c57 Mon Sep 17 00:00:00 2001 From: Kaido Kert Date: Tue, 15 Jul 2025 08:13:16 -0700 Subject: [PATCH 09/10] Cleanups --- picojson/src/stream_buffer.rs | 1 - picojson/tests/stream_parser_stress_test.rs | 47 --------------------- 2 files changed, 48 deletions(-) diff --git a/picojson/src/stream_buffer.rs b/picojson/src/stream_buffer.rs index 11c21ad..c6ae62e 100644 --- a/picojson/src/stream_buffer.rs +++ b/picojson/src/stream_buffer.rs @@ -161,7 +161,6 @@ impl<'a> StreamBuffer<'a> { } // Update positions - let _old_tokenize_pos = self.tokenize_pos; self.tokenize_pos = self.tokenize_pos.saturating_sub(offset); self.data_end = remaining_data; diff --git a/picojson/tests/stream_parser_stress_test.rs b/picojson/tests/stream_parser_stress_test.rs index f3365d8..1c17637 100644 --- a/picojson/tests/stream_parser_stress_test.rs +++ b/picojson/tests/stream_parser_stress_test.rs @@ -214,53 +214,6 @@ fn test_stress_buffer_sizes_with_full_reads() { } } -#[test] -fn debug_unicode_issue() { - let json = br#"["a\nb\t\"\\c\u1234d"]"#; - let reader = ChunkReader::new(json, 5); - let mut buffer = [0u8; 30]; - - let mut parser = StreamParser::new(reader, &mut buffer); - let mut event_count = 0; - - loop { - let event = match parser.next_event() { - Ok(event) => event, - Err(e) => { - println!("Parse error: {:?}", e); - break; - } - }; - - println!("Event[{}]: {:?}", event_count, event); - - // Check string content when we encounter it - if let Event::String(s) = &event { - match s { - picojson::String::Unescaped(content) => { - println!("String Content Analysis:"); - println!(" Expected: 'a\\nb\\t\\\"\\\\cሴd'"); - println!(" Got: '{}'", content); - println!(" Character codes:"); - for (i, ch) in content.char_indices() { - println!(" [{}] '{}' = U+{:04X}", i, ch, ch as u32); - } - } - picojson::String::Borrowed(content) => { - println!("Borrowed String: '{}'", content); - } - } - } - - event_count += 1; - - match event { - Event::EndDocument => break, - _ => {} - } - } -} - #[test] fn test_stress_chunk_sizes_with_adequate_buffer() { let scenarios = get_test_scenarios(); From cc553ac1190ab9e88ee492d6fc1899aeea04131a Mon Sep 17 00:00:00 2001 From: Kaido Kert Date: Tue, 15 Jul 2025 08:20:46 -0700 Subject: [PATCH 10/10] More cleaning up --- picojson/src/escape_processor.rs | 5 +- picojson/src/event_processor.rs | 3 +- picojson/src/slice_content_builder.rs | 13 +- picojson/src/slice_parser.rs | 1 - picojson/src/stream_buffer.rs | 4 +- picojson/src/stream_content_builder.rs | 158 ------------------------- picojson/src/stream_parser.rs | 1 - 7 files changed, 7 insertions(+), 178 deletions(-) diff --git a/picojson/src/escape_processor.rs b/picojson/src/escape_processor.rs index 2b9e513..c2077c7 100644 --- a/picojson/src/escape_processor.rs +++ b/picojson/src/escape_processor.rs @@ -271,6 +271,7 @@ impl UnicodeEscapeCollector { #[cfg(test)] mod tests { use super::*; + use crate::ujson::EventToken; #[test] fn test_simple_escapes() { @@ -375,8 +376,6 @@ mod tests { #[test] fn test_token_to_escape_char() { - use crate::ujson::EventToken; - // Test all valid escape tokens assert_eq!( EscapeProcessor::token_to_escape_char(&EventToken::EscapeQuote).unwrap(), @@ -420,8 +419,6 @@ mod tests { #[test] fn test_process_escape_token() { - use crate::ujson::EventToken; - // Test valid escape tokens that produce correct unescaped bytes assert_eq!( EscapeProcessor::process_escape_token(&EventToken::EscapeQuote).unwrap(), diff --git a/picojson/src/event_processor.rs b/picojson/src/event_processor.rs index 84065c1..70e5083 100644 --- a/picojson/src/event_processor.rs +++ b/picojson/src/event_processor.rs @@ -5,6 +5,7 @@ //! This module extracts the common event handling patterns to reduce code duplication //! while preserving the performance characteristics of each parser type. +use crate::shared::{ContentRange, State}; use crate::ujson::EventToken; use crate::{Event, ParseError}; @@ -53,8 +54,6 @@ pub fn process_begin_events( event: &crate::ujson::Event, content_extractor: &mut C, ) -> Option> { - use crate::shared::{ContentRange, State}; - match event { // String/Key Begin events - nearly identical patterns crate::ujson::Event::Begin(EventToken::Key) => { diff --git a/picojson/src/slice_content_builder.rs b/picojson/src/slice_content_builder.rs index db65d5f..1a21afd 100644 --- a/picojson/src/slice_content_builder.rs +++ b/picojson/src/slice_content_builder.rs @@ -7,7 +7,7 @@ use crate::escape_processor::UnicodeEscapeCollector; use crate::event_processor::{ContentExtractor, EscapeHandler}; use crate::number_parser::NumberExtractor; use crate::shared::{ContentRange, State}; -use crate::slice_input_buffer::SliceInputBuffer; +use crate::slice_input_buffer::{InputBuffer, SliceInputBuffer}; use crate::ParseError; /// ContentBuilder implementation for SliceParser that uses CopyOnEscape for zero-copy optimization @@ -131,7 +131,6 @@ impl ContentExtractor for SliceContentBuilder<'_, '_> { impl crate::shared::ByteProvider for SliceContentBuilder<'_, '_> { fn next_byte(&mut self) -> Result, crate::ParseError> { - use crate::slice_input_buffer::InputBuffer; match self.buffer_mut().consume_byte() { Ok(byte) => Ok(Some(byte)), Err(crate::slice_input_buffer::Error::ReachedEnd) => Ok(None), @@ -162,24 +161,20 @@ impl EscapeHandler for SliceContentBuilder<'_, '_> { hex_slice_provider, &mut utf8_buf, )?; - // Fix: escape_start_pos should point to the backslash, not the 'u' - // But don't subtract if it's already pointing to the backslash - let actual_escape_start_pos = escape_start_pos; - // Handle UTF-8 bytes if we have them (not a high surrogate waiting for low surrogate) if let Some(utf8_bytes) = utf8_bytes_opt { if had_pending_high_surrogate { // This is completing a surrogate pair - need to consume both escapes // First call: consume the high surrogate (6 bytes earlier) self.copy_on_escape - .handle_unicode_escape(actual_escape_start_pos, &[])?; + .handle_unicode_escape(escape_start_pos, &[])?; // Second call: consume the low surrogate and write UTF-8 self.copy_on_escape - .handle_unicode_escape(actual_escape_start_pos + 6, utf8_bytes)?; + .handle_unicode_escape(escape_start_pos + 6, utf8_bytes)?; } else { // Single Unicode escape - normal processing self.copy_on_escape - .handle_unicode_escape(actual_escape_start_pos, utf8_bytes)?; + .handle_unicode_escape(escape_start_pos, utf8_bytes)?; } } diff --git a/picojson/src/slice_parser.rs b/picojson/src/slice_parser.rs index 0e98a7f..088949b 100644 --- a/picojson/src/slice_parser.rs +++ b/picojson/src/slice_parser.rs @@ -164,7 +164,6 @@ impl PullParser for SliceParser<'_, '_, C> { impl crate::shared::ByteProvider for SliceParser<'_, '_, C> { fn next_byte(&mut self) -> Result, ParseError> { - use crate::slice_input_buffer::InputBuffer; match self.content_builder.buffer_mut().consume_byte() { Ok(byte) => Ok(Some(byte)), Err(crate::slice_input_buffer::Error::ReachedEnd) => Ok(None), diff --git a/picojson/src/stream_buffer.rs b/picojson/src/stream_buffer.rs index c6ae62e..18b0932 100644 --- a/picojson/src/stream_buffer.rs +++ b/picojson/src/stream_buffer.rs @@ -272,6 +272,7 @@ impl<'a> StreamBuffer<'a> { #[cfg(test)] mod tests { use super::*; + use crate::shared::State; #[test] fn test_lifetime_expectations() { @@ -920,9 +921,6 @@ mod tests { fn test_position_update_state_transitions() { // Test the complete state transition logic for different parser states - // Mock the State enum variants and position update logic - use crate::shared::State; - // Case 1: State::None - no position to update let state = State::None; // No position updates needed for None state diff --git a/picojson/src/stream_content_builder.rs b/picojson/src/stream_content_builder.rs index 42946f0..c163815 100644 --- a/picojson/src/stream_content_builder.rs +++ b/picojson/src/stream_content_builder.rs @@ -300,164 +300,6 @@ impl crate::shared::ByteProvider for StreamContentBuilder<'_> { } } -/// Custom ByteProvider that can handle reader filling for StreamParser -pub struct StreamContentBuilderWithFiller<'a, 'b, R: crate::stream_parser::Reader> { - content_builder: &'a mut StreamContentBuilder<'b>, - reader: &'a mut R, - finished: &'a mut bool, -} - -impl StreamContentBuilderWithFiller<'_, '_, R> {} - -impl crate::shared::ByteProvider - for StreamContentBuilderWithFiller<'_, '_, R> -{ - fn next_byte(&mut self) -> Result, crate::ParseError> { - // If buffer is empty, try to fill it first - if self.content_builder.stream_buffer.is_empty() { - self.content_builder.fill_buffer_from_reader(self.reader)?; - } - - // If still empty after fill attempt, we're at EOF - if self.content_builder.stream_buffer.is_empty() { - // Set finished flag when we reach end of stream - if !*self.finished { - *self.finished = true; - self.content_builder.set_finished(true); - } - return Ok(None); - } - - // Get byte and advance - let byte = self.content_builder.stream_buffer.current_byte()?; - self.content_builder.stream_buffer.advance()?; - Ok(Some(byte)) - } -} - -/// Provider that can fill the buffer from a reader -pub struct StreamContentBuilderWithReader<'a, 'b, R: crate::stream_parser::Reader> { - pub content_builder: &'a mut StreamContentBuilder<'b>, - reader: &'a mut R, - finished: &'a mut bool, -} - -impl StreamContentBuilderWithReader<'_, '_, R> {} - -impl crate::shared::ByteProvider - for StreamContentBuilderWithReader<'_, '_, R> -{ - fn next_byte(&mut self) -> Result, crate::ParseError> { - // If buffer is empty, try to fill it first - if self.content_builder.stream_buffer.is_empty() { - if let Some(fill_slice) = self.content_builder.stream_buffer.get_fill_slice() { - let bytes_read = self - .reader - .read(fill_slice) - .map_err(|_| crate::ParseError::ReaderError)?; - self.content_builder - .stream_buffer - .mark_filled(bytes_read) - .map_err(crate::ParseError::from)?; - } - } - - // If still empty after fill attempt, we're at EOF - if self.content_builder.stream_buffer.is_empty() { - // Set finished flag when we reach end of stream - if !*self.finished { - *self.finished = true; - self.content_builder.set_finished(true); - } - return Ok(None); - } - - // Get byte and advance - let byte = self.content_builder.stream_buffer.current_byte()?; - self.content_builder.stream_buffer.advance()?; - Ok(Some(byte)) - } -} - -impl crate::event_processor::ContentExtractor - for StreamContentBuilderWithReader<'_, '_, R> -{ - fn parser_state_mut(&mut self) -> &mut crate::shared::State { - self.content_builder.parser_state_mut() - } - - fn current_position(&self) -> usize { - self.content_builder.current_position() - } - - fn begin_string_content(&mut self, pos: usize) { - self.content_builder.begin_string_content(pos); - } - - fn unicode_escape_collector_mut( - &mut self, - ) -> &mut crate::escape_processor::UnicodeEscapeCollector { - self.content_builder.unicode_escape_collector_mut() - } - - fn extract_string_content( - &mut self, - start_pos: usize, - ) -> Result, crate::ParseError> { - self.content_builder.extract_string_content(start_pos) - } - - fn extract_key_content( - &mut self, - start_pos: usize, - ) -> Result, crate::ParseError> { - self.content_builder.extract_key_content(start_pos) - } - - fn extract_number_content( - &mut self, - start_pos: usize, - from_container_end: bool, - ) -> Result, crate::ParseError> { - self.content_builder - .extract_number_content(start_pos, from_container_end) - } - - fn extract_number( - &mut self, - start_pos: usize, - from_container_end: bool, - finished: bool, - ) -> Result, crate::ParseError> { - self.content_builder - .extract_number(start_pos, from_container_end, finished) - } -} - -impl crate::event_processor::EscapeHandler - for StreamContentBuilderWithReader<'_, '_, R> -{ - fn parser_state(&self) -> &crate::shared::State { - self.content_builder.parser_state() - } - - fn process_unicode_escape_with_collector(&mut self) -> Result<(), crate::ParseError> { - self.content_builder.process_unicode_escape_with_collector() - } - - fn handle_simple_escape_char(&mut self, escape_char: u8) -> Result<(), crate::ParseError> { - self.content_builder.handle_simple_escape_char(escape_char) - } - - fn begin_escape_sequence(&mut self) -> Result<(), crate::ParseError> { - self.content_builder.begin_escape_sequence() - } - - fn begin_unicode_escape(&mut self) -> Result<(), crate::ParseError> { - self.content_builder.begin_unicode_escape() - } -} - impl StreamContentBuilder<'_> { /// Handle byte accumulation for StreamParser-specific requirements /// This method is called when a byte doesn't generate any events diff --git a/picojson/src/stream_parser.rs b/picojson/src/stream_parser.rs index faf78f3..f04885e 100644 --- a/picojson/src/stream_parser.rs +++ b/picojson/src/stream_parser.rs @@ -195,7 +195,6 @@ impl ContentExtractor for StreamParserProvider<'_, R> { &mut self, from_container_end: bool, ) -> Result, ParseError> { - use crate::event_processor::EscapeHandler; let start_pos = match *self.parser_state() { crate::shared::State::Number(pos) => pos, _ => return Err(crate::shared::UnexpectedState::StateMismatch.into()),