diff --git a/picojson/examples/test_surrogate_pairs.rs b/picojson/examples/test_surrogate_pairs.rs new file mode 100644 index 0000000..af90c37 --- /dev/null +++ b/picojson/examples/test_surrogate_pairs.rs @@ -0,0 +1,69 @@ +use picojson::{Event, PullParser, SliceParser}; + +fn main() { + // Test the original failing case: ["\uD801\udc37"] + let json = r#"["\uD801\udc37"]"#; + let mut buffer = [0u8; 1024]; + let mut parser = SliceParser::with_buffer(json, &mut buffer); + + let mut string_found = false; + let mut string_bytes = Vec::new(); + + loop { + match parser.next_event() { + Ok(Event::EndDocument) => break, + Ok(Event::String(s)) => { + string_found = true; + string_bytes = s.as_bytes().to_vec(); + println!("Found string: {:?}", s); + println!("String content as bytes: {:?}", string_bytes); + } + Ok(event) => { + println!("Event: {:?}", event); + } + Err(e) => panic!("Parser error: {:?}", e), + } + } + + println!("Successfully parsed surrogate pair JSON"); + + // The string should be decoded to U+10437 which is encoded as UTF-8: [0xF0, 0x90, 0x90, 0xB7] + if string_found { + assert_eq!( + string_bytes, + vec![0xF0, 0x90, 0x90, 0xB7], + "Surrogate pair should decode to U+10437" + ); + println!("✅ Surrogate pair correctly decoded!"); + } else { + panic!("Expected String event not found"); + } + + // Test additional surrogate pairs + let test_cases = vec![ + (r#"["\uD834\uDD1E"]"#, "Musical G clef symbol"), + (r#"["\uD83D\uDE00"]"#, "Grinning face emoji"), + ]; + + for (json, description) in test_cases { + println!("\nTesting {}: {}", description, json); + let mut buffer = [0u8; 1024]; + let mut parser = SliceParser::with_buffer(json, &mut buffer); + + let mut success = false; + loop { + match parser.next_event() { + Ok(Event::EndDocument) => break, + Ok(Event::String(_)) => success = true, + Ok(_) => {} + Err(e) => panic!("Parser error for {}: {:?}", description, e), + } + } + + if success { + println!("✅ Successfully parsed {}", description); + } else { + panic!("Failed to find string in {}", description); + } + } +} diff --git a/picojson/src/escape_processor.rs b/picojson/src/escape_processor.rs index 3ca8cfd..2b9e513 100644 --- a/picojson/src/escape_processor.rs +++ b/picojson/src/escape_processor.rs @@ -54,8 +54,8 @@ impl EscapeProcessor { /// assert_eq!(EscapeProcessor::process_escape_token(&EventToken::EscapeNewline).unwrap(), b'\n'); /// ``` pub fn process_escape_token(escape_token: &ujson::EventToken) -> Result { - let escape_char = Self::token_to_escape_char(escape_token) - .ok_or_else(|| UnexpectedState::InvalidEscapeToken)?; + let escape_char = + Self::token_to_escape_char(escape_token).ok_or(UnexpectedState::InvalidEscapeToken)?; Self::process_simple_escape(escape_char) } @@ -102,26 +102,44 @@ impl EscapeProcessor { } } - /// Process a Unicode escape sequence (\uXXXX) and return the UTF-8 encoded bytes. + /// Check if a Unicode codepoint is a high surrogate (0xD800-0xDBFF) + pub fn is_high_surrogate(codepoint: u32) -> bool { + (0xD800..=0xDBFF).contains(&codepoint) + } + + /// Check if a Unicode codepoint is a low surrogate (0xDC00-0xDFFF) + pub fn is_low_surrogate(codepoint: u32) -> bool { + (0xDC00..=0xDFFF).contains(&codepoint) + } + + /// Combine a high and low surrogate pair into a single Unicode codepoint + pub fn combine_surrogate_pair(high: u32, low: u32) -> Result { + if !Self::is_high_surrogate(high) || !Self::is_low_surrogate(low) { + return Err(ParseError::InvalidUnicodeCodepoint); + } + + // Combine surrogates according to UTF-16 specification + let codepoint = 0x10000 + ((high & 0x3FF) << 10) + (low & 0x3FF); + Ok(codepoint) + } + + /// Process a Unicode escape sequence with surrogate pair support. + /// This function handles both individual Unicode escapes and surrogate pairs. /// /// # Arguments /// * `hex_slice` - A 4-byte slice containing the hexadecimal digits /// * `utf8_buffer` - A buffer to write the UTF-8 encoded result (must be at least 4 bytes) + /// * `pending_high_surrogate` - Optional high surrogate from previous escape /// /// # Returns - /// A slice containing the UTF-8 encoded bytes, or an error if the escape is invalid. - /// - /// # Examples - /// ```ignore - /// // Internal API - see unit tests for usage examples - /// let mut buffer = [0u8; 4]; - /// let result = EscapeProcessor::process_unicode_escape(b"0041", &mut buffer).unwrap(); - /// assert_eq!(result, b"A"); - /// ``` + /// A tuple containing: + /// - Optional UTF-8 encoded bytes (None if this is a high surrogate waiting for low) + /// - Optional high surrogate to save for next escape (Some if this is a high surrogate) pub fn process_unicode_escape<'a>( hex_slice: &[u8], utf8_buffer: &'a mut [u8], - ) -> Result<&'a [u8], ParseError> { + pending_high_surrogate: Option, + ) -> Result<(Option<&'a [u8]>, Option), ParseError> { if hex_slice.len() != 4 { return Err(ParseError::InvalidUnicodeHex); } @@ -133,21 +151,48 @@ impl EscapeProcessor { codepoint = (codepoint << 4) | digit; } - // Convert codepoint to character and encode as UTF-8 - let ch = char::from_u32(codepoint).ok_or(ParseError::InvalidUnicodeCodepoint)?; - let utf8_str = ch.encode_utf8(utf8_buffer); - Ok(utf8_str.as_bytes()) + // Check if we have a pending high surrogate + if let Some(high) = pending_high_surrogate { + // We should have a low surrogate now + if Self::is_low_surrogate(codepoint) { + // Combine the surrogate pair + let combined = Self::combine_surrogate_pair(high, codepoint)?; + let ch = char::from_u32(combined).ok_or(ParseError::InvalidUnicodeCodepoint)?; + let utf8_str = ch.encode_utf8(utf8_buffer); + Ok((Some(utf8_str.as_bytes()), None)) + } else { + // Error: high surrogate not followed by low surrogate + Err(ParseError::InvalidUnicodeCodepoint) + } + } else { + // No pending high surrogate + if Self::is_high_surrogate(codepoint) { + // Save this high surrogate for the next escape + Ok((None, Some(codepoint))) + } else if Self::is_low_surrogate(codepoint) { + // Error: low surrogate without preceding high surrogate + Err(ParseError::InvalidUnicodeCodepoint) + } else { + // Regular Unicode character + let ch = char::from_u32(codepoint).ok_or(ParseError::InvalidUnicodeCodepoint)?; + let utf8_str = ch.encode_utf8(utf8_buffer); + Ok((Some(utf8_str.as_bytes()), None)) + } + } } } /// Shared Unicode escape hex digit collector for both parsers. /// Provides a common interface for collecting the 4 hex digits in \uXXXX sequences. +/// Supports surrogate pairs by tracking pending high surrogates. #[derive(Debug)] pub struct UnicodeEscapeCollector { /// Buffer to collect the 4 hex digits hex_buffer: [u8; 4], /// Current position in the hex buffer (0-4) hex_pos: usize, + /// Pending high surrogate waiting for low surrogate + pending_high_surrogate: Option, } impl UnicodeEscapeCollector { @@ -156,12 +201,21 @@ impl UnicodeEscapeCollector { Self { hex_buffer: [0u8; 4], hex_pos: 0, + pending_high_surrogate: None, } } /// Reset the collector for a new Unicode escape sequence pub fn reset(&mut self) { self.hex_pos = 0; + // Note: We don't reset pending_high_surrogate here since it needs to persist + // across Unicode escape sequences to properly handle surrogate pairs + } + + /// Reset the collector completely, including any pending surrogate state + pub fn reset_all(&mut self) { + self.hex_pos = 0; + self.pending_high_surrogate = None; } /// Add a hex digit to the collector @@ -185,14 +239,32 @@ impl UnicodeEscapeCollector { Ok(self.hex_pos == 4) } - /// Process the collected hex digits and return UTF-8 bytes + /// Process the collected hex digits with surrogate pair support /// Should only be called when is_complete() returns true - pub fn process_to_utf8<'a>(&self, utf8_buffer: &'a mut [u8]) -> Result<&'a [u8], ParseError> { + /// Returns (optional UTF-8 bytes, whether surrogate state changed) + pub fn process_to_utf8<'a>( + &mut self, + utf8_buffer: &'a mut [u8], + ) -> Result<(Option<&'a [u8]>, bool), ParseError> { if self.hex_pos != 4 { return Err(UnexpectedState::InvalidUnicodeEscape.into()); } - EscapeProcessor::process_unicode_escape(&self.hex_buffer, utf8_buffer) + let (result, new_pending) = EscapeProcessor::process_unicode_escape( + &self.hex_buffer, + utf8_buffer, + self.pending_high_surrogate, + )?; + + let surrogate_state_changed = self.pending_high_surrogate != new_pending; + self.pending_high_surrogate = new_pending; + + Ok((result, surrogate_state_changed)) + } + + /// Check if there's a pending high surrogate waiting for a low surrogate + pub fn has_pending_high_surrogate(&self) -> bool { + self.pending_high_surrogate.is_some() } } @@ -244,12 +316,16 @@ mod tests { let mut buffer = [0u8; 4]; // Test basic ASCII character \u0041 -> 'A' - let result = EscapeProcessor::process_unicode_escape(b"0041", &mut buffer).unwrap(); - assert_eq!(result, b"A"); + let (result, pending) = + EscapeProcessor::process_unicode_escape(b"0041", &mut buffer, None).unwrap(); + assert_eq!(result.unwrap(), b"A"); + assert_eq!(pending, None); // Test another ASCII character \u0048 -> 'H' - let result = EscapeProcessor::process_unicode_escape(b"0048", &mut buffer).unwrap(); - assert_eq!(result, b"H"); + let (result, pending) = + EscapeProcessor::process_unicode_escape(b"0048", &mut buffer, None).unwrap(); + assert_eq!(result.unwrap(), b"H"); + assert_eq!(pending, None); } #[test] @@ -257,13 +333,18 @@ mod tests { let mut buffer = [0u8; 4]; // Test Greek alpha \u03B1 -> 'α' (2 bytes in UTF-8: 0xCE, 0xB1) - let result = EscapeProcessor::process_unicode_escape(b"03B1", &mut buffer).unwrap(); - assert_eq!(result, "α".as_bytes()); + let (result, pending) = + EscapeProcessor::process_unicode_escape(b"03B1", &mut buffer, None).unwrap(); + assert_eq!(result.unwrap(), "α".as_bytes()); + assert_eq!(pending, None); // Test emoji \u1F60A -> '😊' (4 bytes in UTF-8) - let _result = EscapeProcessor::process_unicode_escape(b"1F60", &mut buffer).unwrap(); + let (result, pending) = + EscapeProcessor::process_unicode_escape(b"1F60", &mut buffer, None).unwrap(); // Note: This is actually incomplete - \u1F60A requires surrogate pairs // But for basic testing this verifies the hex parsing works + assert!(result.is_some()); + assert_eq!(pending, None); } #[test] @@ -271,12 +352,12 @@ mod tests { let mut buffer = [0u8; 4]; // Invalid hex characters - assert!(EscapeProcessor::process_unicode_escape(b"00GG", &mut buffer).is_err()); - assert!(EscapeProcessor::process_unicode_escape(b"ZZZZ", &mut buffer).is_err()); + assert!(EscapeProcessor::process_unicode_escape(b"00GG", &mut buffer, None).is_err()); + assert!(EscapeProcessor::process_unicode_escape(b"ZZZZ", &mut buffer, None).is_err()); // Wrong length - assert!(EscapeProcessor::process_unicode_escape(b"123", &mut buffer).is_err()); - assert!(EscapeProcessor::process_unicode_escape(b"12345", &mut buffer).is_err()); + assert!(EscapeProcessor::process_unicode_escape(b"123", &mut buffer, None).is_err()); + assert!(EscapeProcessor::process_unicode_escape(b"12345", &mut buffer, None).is_err()); } #[test] @@ -286,8 +367,10 @@ mod tests { // Note: Most values in the BMP are valid Unicode codepoints // Invalid surrogate codepoints would be D800-DFFF but they're complex to test // For now, test basic valid cases to ensure the function works - let result = EscapeProcessor::process_unicode_escape(b"0000", &mut buffer).unwrap(); - assert_eq!(result, "\0".as_bytes()); + let (result, pending) = + EscapeProcessor::process_unicode_escape(b"0000", &mut buffer, None).unwrap(); + assert_eq!(result.unwrap(), "\0".as_bytes()); + assert_eq!(pending, None); } #[test] @@ -389,8 +472,9 @@ mod tests { assert!(collector.add_hex_digit(b'1').unwrap()); // Complete! // Process to UTF-8 - let result = collector.process_to_utf8(&mut utf8_buffer).unwrap(); - assert_eq!(result, b"A"); + let (result, _surrogate_state_changed) = + collector.process_to_utf8(&mut utf8_buffer).unwrap(); + assert_eq!(result.unwrap(), b"A"); } #[test] @@ -413,13 +497,47 @@ mod tests { assert!(!collector.add_hex_digit(b'0').unwrap()); assert!(!collector.add_hex_digit(b'1').unwrap()); - // Reset should clear state + // Reset should clear hex position but not surrogate state collector.reset(); // Should be able to start fresh assert!(!collector.add_hex_digit(b'A').unwrap()); } + #[test] + fn test_unicode_escape_collector_surrogate_support() { + let mut collector = UnicodeEscapeCollector::new(); + let mut utf8_buffer = [0u8; 4]; + + // Process high surrogate \uD801 + assert!(!collector.add_hex_digit(b'D').unwrap()); + assert!(!collector.add_hex_digit(b'8').unwrap()); + assert!(!collector.add_hex_digit(b'0').unwrap()); + assert!(collector.add_hex_digit(b'1').unwrap()); + + let (result, state_changed) = collector.process_to_utf8(&mut utf8_buffer).unwrap(); + assert_eq!(result, None); // No UTF-8 output yet + assert!(state_changed); // Surrogate state changed + assert!(collector.has_pending_high_surrogate()); + + // Reset for next escape sequence + collector.reset(); + + // Process low surrogate \uDC37 + assert!(!collector.add_hex_digit(b'D').unwrap()); + assert!(!collector.add_hex_digit(b'C').unwrap()); + assert!(!collector.add_hex_digit(b'3').unwrap()); + assert!(collector.add_hex_digit(b'7').unwrap()); + + let (result, state_changed) = collector.process_to_utf8(&mut utf8_buffer).unwrap(); + assert!(result.is_some()); // Should have UTF-8 output + assert!(state_changed); // Surrogate state changed (cleared) + assert!(!collector.has_pending_high_surrogate()); + + // Verify it's the correct UTF-8 encoding for U+10437 + assert_eq!(result.unwrap(), [0xF0, 0x90, 0x90, 0xB7]); + } + #[test] fn test_unicode_escape_collector_multibyte() { let mut collector = UnicodeEscapeCollector::new(); @@ -431,8 +549,9 @@ mod tests { assert!(!collector.add_hex_digit(b'B').unwrap()); assert!(collector.add_hex_digit(b'1').unwrap()); - let result = collector.process_to_utf8(&mut utf8_buffer).unwrap(); - assert_eq!(result, "α".as_bytes()); + let (result, _surrogate_state_changed) = + collector.process_to_utf8(&mut utf8_buffer).unwrap(); + assert_eq!(result.unwrap(), "α".as_bytes()); } #[test] @@ -447,9 +566,80 @@ mod tests { // Should fail to process incomplete sequence assert!(collector.process_to_utf8(&mut utf8_buffer).is_err()); } + + #[test] + fn test_surrogate_pair_detection() { + // Test high surrogate detection + assert!(EscapeProcessor::is_high_surrogate(0xD800)); + assert!(EscapeProcessor::is_high_surrogate(0xD801)); + assert!(EscapeProcessor::is_high_surrogate(0xDBFF)); + assert!(!EscapeProcessor::is_high_surrogate(0xD7FF)); + assert!(!EscapeProcessor::is_high_surrogate(0xDC00)); + + // Test low surrogate detection + assert!(EscapeProcessor::is_low_surrogate(0xDC00)); + assert!(EscapeProcessor::is_low_surrogate(0xDC37)); + assert!(EscapeProcessor::is_low_surrogate(0xDFFF)); + assert!(!EscapeProcessor::is_low_surrogate(0xDBFF)); + assert!(!EscapeProcessor::is_low_surrogate(0xE000)); + } + + #[test] + fn test_surrogate_pair_combination() { + // Test valid surrogate pair: \uD801\uDC37 -> U+10437 + let combined = EscapeProcessor::combine_surrogate_pair(0xD801, 0xDC37).unwrap(); + assert_eq!(combined, 0x10437); + + // Test another valid pair: \uD834\uDD1E -> U+1D11E (musical symbol) + let combined = EscapeProcessor::combine_surrogate_pair(0xD834, 0xDD1E).unwrap(); + assert_eq!(combined, 0x1D11E); + + // Test invalid combinations + assert!(EscapeProcessor::combine_surrogate_pair(0x0041, 0xDC37).is_err()); // Not high surrogate + assert!(EscapeProcessor::combine_surrogate_pair(0xD801, 0x0041).is_err()); + // Not low surrogate + } + + #[test] + fn test_unicode_escape_with_surrogate_support() { + let mut buffer = [0u8; 4]; + + // Test regular Unicode character (not surrogate) + let (result, pending) = + EscapeProcessor::process_unicode_escape(b"0041", &mut buffer, None).unwrap(); + assert_eq!(result, Some(b"A".as_slice())); + assert_eq!(pending, None); + + // Test high surrogate - should return None and save the high surrogate + let (result, pending) = + EscapeProcessor::process_unicode_escape(b"D801", &mut buffer, None).unwrap(); + assert_eq!(result, None); + assert_eq!(pending, Some(0xD801)); + + // Test low surrogate following high surrogate - should combine + let (result, pending) = + EscapeProcessor::process_unicode_escape(b"DC37", &mut buffer, Some(0xD801)).unwrap(); + assert!(result.is_some()); + assert_eq!(pending, None); + // The result should be the UTF-8 encoding of U+10437 + assert_eq!(result.unwrap(), [0xF0, 0x90, 0x90, 0xB7]); + } + + #[test] + fn test_unicode_escape_surrogate_error_cases() { + let mut buffer = [0u8; 4]; + + // Test low surrogate without preceding high surrogate - should error + let result = EscapeProcessor::process_unicode_escape(b"DC37", &mut buffer, None); + assert!(result.is_err()); + + // Test high surrogate followed by non-low-surrogate - should error + let result = EscapeProcessor::process_unicode_escape(b"0041", &mut buffer, Some(0xD801)); + assert!(result.is_err()); + } } -/// Shared implementation for processing a Unicode escape sequence. +/// Shared implementation for processing a Unicode escape sequence WITH surrogate pair support. /// /// This function centralizes the logic for handling `\uXXXX` escapes, which is /// common to both the pull-based and stream-based parsers. It uses a generic @@ -463,13 +653,15 @@ mod tests { /// * `utf8_buf` - A buffer to write the UTF-8 encoded result into. /// /// # Returns -/// A tuple containing the resulting UTF-8 byte slice and the start position of the escape sequence (`\uXXXX`). +/// A tuple containing: +/// - Optional UTF-8 byte slice (None if this is a high surrogate waiting for low surrogate) +/// - The start position of the escape sequence (`\uXXXX`) pub(crate) fn process_unicode_escape_sequence<'a, F>( current_pos: usize, unicode_escape_collector: &mut UnicodeEscapeCollector, mut hex_slice_provider: F, utf8_buf: &'a mut [u8; 4], -) -> Result<(&'a [u8], usize), ParseError> +) -> Result<(Option<&'a [u8]>, usize), ParseError> where F: FnMut(usize, usize) -> Result<&'a [u8], ParseError>, { @@ -488,8 +680,21 @@ where unicode_escape_collector.add_hex_digit(hex_digit)?; } - // Process the complete sequence to UTF-8 - let utf8_bytes = unicode_escape_collector.process_to_utf8(utf8_buf)?; + // Check if we had a pending high surrogate before processing + let had_pending_high_surrogate = unicode_escape_collector.has_pending_high_surrogate(); + + // Process the complete sequence to UTF-8 with surrogate support + let (utf8_bytes_opt, _surrogate_state_changed) = + unicode_escape_collector.process_to_utf8(utf8_buf)?; + + // If we're completing a surrogate pair (had pending high surrogate and now have UTF-8 bytes), + // return the position of the high surrogate start instead of the low surrogate start + let final_escape_start_pos = if had_pending_high_surrogate && utf8_bytes_opt.is_some() { + // High surrogate started 6 bytes before the current low surrogate + escape_start_pos.saturating_sub(6) + } else { + escape_start_pos + }; - Ok((utf8_bytes, escape_start_pos)) + Ok((utf8_bytes_opt, final_escape_start_pos)) } diff --git a/picojson/src/slice_parser.rs b/picojson/src/slice_parser.rs index eefba7d..9e5c8ba 100644 --- a/picojson/src/slice_parser.rs +++ b/picojson/src/slice_parser.rs @@ -176,7 +176,7 @@ impl<'a, 'b, C: BitStackConfig> SliceParser<'a, 'b, C> { start: usize, from_container_end: bool, ) -> Result, ParseError> { - // SliceParser delimiter logic (same as the legacy wrapper) + // SliceParser delimiter logic let current_pos = self.buffer.current_position(); let end_pos = if !from_container_end && self.buffer.is_empty() { // At EOF - use full span @@ -207,6 +207,10 @@ impl<'a, 'b, C: BitStackConfig> SliceParser<'a, 'b, C> { &mut self, escape_char: u8, ) -> Result>, ParseError> { + // Clear any pending high surrogate state when we encounter a simple escape + // This ensures that interrupted surrogate pairs (like \uD801\n\uDC37) are properly rejected + self.unicode_escape_collector.reset_all(); + if let State::String(_) | State::Key(_) = self.parser_state.state { self.copy_on_escape .handle_escape(self.buffer.current_pos(), escape_char)?; @@ -220,8 +224,11 @@ impl<'a, 'b, C: BitStackConfig> SliceParser<'a, 'b, C> { let current_pos = self.buffer.current_pos(); let hex_slice_provider = |start, end| self.buffer.slice(start, end).map_err(Into::into); + // Check if we had a pending high surrogate before processing + let had_pending_high_surrogate = self.unicode_escape_collector.has_pending_high_surrogate(); + let mut utf8_buf = [0u8; 4]; - let (utf8_bytes, escape_start_pos) = + let (utf8_bytes_opt, escape_start_pos) = crate::escape_processor::process_unicode_escape_sequence( current_pos, &mut self.unicode_escape_collector, @@ -229,8 +236,24 @@ impl<'a, 'b, C: BitStackConfig> SliceParser<'a, 'b, C> { &mut utf8_buf, )?; - self.copy_on_escape - .handle_unicode_escape(escape_start_pos, utf8_bytes) + // Only process UTF-8 bytes if we have them (not a high surrogate waiting for low surrogate) + if let Some(utf8_bytes) = utf8_bytes_opt { + if had_pending_high_surrogate { + // This is completing a surrogate pair - need to consume both escapes + // First call: consume the high surrogate (6 bytes earlier) + self.copy_on_escape + .handle_unicode_escape(escape_start_pos, &[])?; + // Second call: consume the low surrogate and write UTF-8 + self.copy_on_escape + .handle_unicode_escape(escape_start_pos + 6, utf8_bytes)?; + } else { + // Single Unicode escape - normal processing + self.copy_on_escape + .handle_unicode_escape(escape_start_pos, utf8_bytes)?; + } + } + + Ok(()) } fn pull_tokenizer_events(&mut self) -> Result<(), ParseError> { @@ -406,6 +429,10 @@ impl<'a, 'b, C: BitStackConfig> SliceParser<'a, 'b, C> { EventResult::Continue => continue, EventResult::ExtractKey => { if let State::Key(_start) = self.parser_state.state { + // Check for incomplete surrogate pairs before ending the key + if self.unicode_escape_collector.has_pending_high_surrogate() { + break Err(ParseError::InvalidUnicodeCodepoint); + } self.parser_state.state = State::None; // Use CopyOnEscape to get the final key result let end_pos = ContentRange::end_position_excluding_delimiter( @@ -419,6 +446,10 @@ impl<'a, 'b, C: BitStackConfig> SliceParser<'a, 'b, C> { } EventResult::ExtractString => { if let State::String(_value) = self.parser_state.state { + // Check for incomplete surrogate pairs before ending the string + if self.unicode_escape_collector.has_pending_high_surrogate() { + break Err(ParseError::InvalidUnicodeCodepoint); + } self.parser_state.state = State::None; // Use CopyOnEscape to get the final string result let end_pos = ContentRange::end_position_excluding_delimiter( diff --git a/picojson/src/stream_buffer.rs b/picojson/src/stream_buffer.rs index feb9f58..ead407d 100644 --- a/picojson/src/stream_buffer.rs +++ b/picojson/src/stream_buffer.rs @@ -61,14 +61,11 @@ impl<'a> StreamBuffer<'a> { }; for i in iterator { - match ( + if let (Some(src_byte), Some(dest_slot)) = ( self.buffer.get(src_start.wrapping_add(i)).copied(), self.buffer.get_mut(dest.wrapping_add(i)), ) { - (Some(src_byte), Some(dest_slot)) => { - *dest_slot = src_byte; - } - _ => {} + *dest_slot = src_byte; } } } @@ -141,7 +138,7 @@ impl<'a> StreamBuffer<'a> { let remaining_data = self.data_end.saturating_sub(start_offset); // Copy existing content if there is any - EXACT same pattern as start_unescaping_with_copy - if self.data_end > start_offset && start_offset < self.data_end { + if self.data_end > start_offset { let span_len = remaining_data; // Ensure the span fits in the buffer - return error instead of silent truncation diff --git a/picojson/src/stream_parser.rs b/picojson/src/stream_parser.rs index c991a89..dfa7344 100644 --- a/picojson/src/stream_parser.rs +++ b/picojson/src/stream_parser.rs @@ -329,6 +329,11 @@ impl StreamParser<'_, R, C> { return Err(UnexpectedState::StateMismatch.into()); }; + // Check for incomplete surrogate pairs before ending the string + if self.unicode_escape_collector.has_pending_high_surrogate() { + return Err(ParseError::InvalidUnicodeCodepoint); + } + self.parser_state.state = crate::shared::State::None; if self.stream_buffer.has_unescaped_content() { @@ -368,6 +373,11 @@ impl StreamParser<'_, R, C> { return Err(UnexpectedState::StateMismatch.into()); }; + // Check for incomplete surrogate pairs before ending the key + if self.unicode_escape_collector.has_pending_high_surrogate() { + return Err(ParseError::InvalidUnicodeCodepoint); + } + self.parser_state.state = crate::shared::State::None; if self.stream_buffer.has_unescaped_content() { @@ -451,10 +461,7 @@ impl StreamParser<'_, R, C> { crate::shared::State::Number(start_pos) => start_pos, crate::shared::State::Key(start_pos) => start_pos, crate::shared::State::String(start_pos) => start_pos, - _ => { - let pos = self.stream_buffer.current_position(); - pos - } + _ => self.stream_buffer.current_position(), }; let offset = self.stream_buffer.compact_from(compact_start_pos)?; @@ -607,7 +614,12 @@ impl StreamParser<'_, R, C> { }; // Normal byte accumulation - all escape processing now goes through event system - if !in_escape && self.stream_buffer.has_unescaped_content() { + // Skip writing bytes to escape buffer when we have a pending high surrogate + // (prevents literal \uD801 text from being included in final string) + if !in_escape + && self.stream_buffer.has_unescaped_content() + && !self.unicode_escape_collector.has_pending_high_surrogate() + { self.append_byte_to_escape_buffer(byte)?; } } @@ -636,7 +648,12 @@ impl StreamParser<'_, R, C> { // With content tracking, start_pos is the content_start let content_start = start_pos; // Content to copy ends right before the escape character - let content_end = ContentRange::end_position_excluding_delimiter(current_pos); + let content_end = if self.unicode_escape_collector.has_pending_high_surrogate() { + // Skip copying high surrogate text when processing low surrogate + content_start + } else { + ContentRange::end_position_excluding_delimiter(current_pos) + }; // Estimate max length needed for unescaping (content so far + remaining buffer) let content_len = content_end.wrapping_sub(content_start); @@ -660,6 +677,10 @@ impl StreamParser<'_, R, C> { /// Handle simple escape sequence using unified EscapeProcessor fn handle_simple_escape(&mut self, escape_token: &EventToken) -> Result<(), ParseError> { + // Clear any pending high surrogate state when we encounter a simple escape + // This ensures that interrupted surrogate pairs (like \uD801\n\uDC37) are properly rejected + self.unicode_escape_collector.reset_all(); + // Update escape state in enum if let ProcessingState::Active { ref mut in_escape_sequence, @@ -697,19 +718,24 @@ impl StreamParser<'_, R, C> { }; let mut utf8_buf = [0u8; 4]; - let (utf8_bytes, _escape_start_pos) = + let (utf8_bytes_opt, _escape_start_pos) = crate::escape_processor::process_unicode_escape_sequence( current_pos, &mut self.unicode_escape_collector, hex_slice_provider, &mut utf8_buf, )?; - let mut copy = [0u8; 4]; - let len = utf8_bytes.len(); - if let Some(dest) = copy.get_mut(..len) { - dest.copy_from_slice(utf8_bytes); + if let Some(utf8_bytes) = utf8_bytes_opt { + let mut copy = [0u8; 4]; + let len = utf8_bytes.len(); + if let Some(dest) = copy.get_mut(..len) { + dest.copy_from_slice(utf8_bytes); + } + (copy, len) + } else { + // High surrogate waiting for low surrogate - no UTF-8 bytes to process + ([0u8; 4], 0) } - (copy, len) }; if let Some(bytes_to_copy) = utf8_bytes_copy.0.get(..utf8_bytes_copy.1) { @@ -1779,4 +1805,47 @@ mod tests { // Buffer size 24 - known working boundary from stress tests test_simple_escape_with_buffer_size(24).expect("24-byte buffer should definitely work"); } + + #[test] + fn test_surrogate_pair_buffer_boundary_cases() { + // Test 7: Surrogate pair split across very small buffer chunks + let input7 = r#"["\uD801\uDC37"]"#; + let mut buffer7 = [0u8; 16]; // Small buffer + let reader7 = crate::chunk_reader::ChunkReader::new(input7.as_bytes(), 3); // Tiny chunks + let mut parser7 = StreamParser::<_, DefaultConfig>::with_config(reader7, &mut buffer7); + assert_eq!(parser7.next_event(), Ok(Event::StartArray)); + match parser7.next_event() { + Ok(Event::String(s)) => { + let content = match s { + crate::String::Borrowed(c) => c, + crate::String::Unescaped(c) => c, + }; + assert_eq!(content, "𐐷"); + } + other => panic!( + "Expected String with surrogate pair across buffer boundary, got: {:?}", + other + ), + } + + // Test 8: Surrogate pair with small buffer (still needs minimum space) + let input8 = r#"["\uD801\uDC37"]"#; + let mut buffer8 = [0u8; 32]; // Small but sufficient buffer + let reader8 = crate::chunk_reader::ChunkReader::new(input8.as_bytes(), 6); // Split at surrogate boundary + let mut parser8 = StreamParser::<_, DefaultConfig>::with_config(reader8, &mut buffer8); + assert_eq!(parser8.next_event(), Ok(Event::StartArray)); + match parser8.next_event() { + Ok(Event::String(s)) => { + let content = match s { + crate::String::Borrowed(c) => c, + crate::String::Unescaped(c) => c, + }; + assert_eq!(content, "𐐷"); + } + other => panic!( + "Expected String with surrogate pair at small buffer, got: {:?}", + other + ), + } + } } diff --git a/picojson/tests/surrogate_pairs.rs b/picojson/tests/surrogate_pairs.rs new file mode 100644 index 0000000..80261fa --- /dev/null +++ b/picojson/tests/surrogate_pairs.rs @@ -0,0 +1,442 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Shared surrogate pair tests for both SliceParser and StreamParser +//! +//! This module consolidates surrogate pair testing logic to ensure both parsers +//! handle UTF-16 surrogate pairs identically across different configurations. + +use picojson::{ + ChunkReader, DefaultConfig, Event, PullParser, SliceParser, StreamParser, String as JsonString, +}; + +/// Test fixture that runs a JSON input against any PullParser implementation +/// and verifies the expected sequence of events. +fn test_fixture(mut parser: P, expected_events: &[Event]) { + for (i, expected) in expected_events.iter().enumerate() { + let actual = parser + .next_event() + .unwrap_or_else(|e| panic!("Parser error at event {}: {:?}", i, e)); + + match (expected, &actual) { + // Handle string content comparison + (Event::String(expected_str), Event::String(actual_str)) => { + assert_eq!( + expected_str.as_ref(), + actual_str.as_ref(), + "String content mismatch at event {}", + i + ); + } + (Event::Key(expected_key), Event::Key(actual_key)) => { + assert_eq!( + expected_key.as_ref(), + actual_key.as_ref(), + "Key content mismatch at event {}", + i + ); + } + _ => { + assert_eq!( + std::mem::discriminant(expected), + std::mem::discriminant(&actual), + "Event type mismatch at event {}: expected {:?}, got {:?}", + i, + expected, + actual + ); + } + } + } +} + +/// Test fixture for cases that should error during parsing +fn test_error_fixture(mut parser: P, error_description: &str) { + // Parse until we find the first event that should cause an error + loop { + match parser.next_event() { + Ok(Event::EndDocument) => { + panic!( + "{}: Expected error but parsing completed successfully", + error_description + ); + } + Ok(_) => continue, // Keep parsing until we hit the error + Err(_) => return, // Got expected error + } + } +} + +/// Create a SliceParser for testing +fn create_slice_parser<'a>(input: &'a str, scratch_buffer: &'a mut [u8]) -> SliceParser<'a, 'a> { + scratch_buffer.fill(0); // Clear the buffer + SliceParser::with_buffer(input, scratch_buffer) +} + +/// Create a StreamParser with full-size chunks for testing +fn create_stream_parser_full<'a>( + input: &'a str, + buffer: &'a mut [u8], +) -> StreamParser<'a, ChunkReader<'a>, DefaultConfig> { + buffer.fill(0); // Clear the buffer + let reader = ChunkReader::new(input.as_bytes(), input.len()); + StreamParser::with_config(reader, buffer) +} + +/// Create a StreamParser with small chunks to test buffer boundaries +fn create_stream_parser_chunked<'a>( + input: &'a str, + chunk_size: usize, + buffer: &'a mut [u8], +) -> StreamParser<'a, ChunkReader<'a>, DefaultConfig> { + buffer.fill(0); // Clear the buffer + let reader = ChunkReader::new(input.as_bytes(), chunk_size); + StreamParser::with_config(reader, buffer) +} + +#[test] +fn test_basic_surrogate_pair() { + let input = r#"["\uD801\uDC37"]"#; + let expected = [ + Event::StartArray, + Event::String(JsonString::Unescaped("𐐷")), + Event::EndArray, + Event::EndDocument, + ]; + + // Test all parser configurations + let mut scratch = [0u8; 1024]; + let mut buffer = [0u8; 1024]; + + test_fixture(create_slice_parser(input, &mut scratch), &expected); + test_fixture(create_stream_parser_full(input, &mut buffer), &expected); + test_fixture( + create_stream_parser_chunked(input, 8, &mut buffer), + &expected, + ); + test_fixture( + create_stream_parser_chunked(input, 3, &mut buffer), + &expected, + ); // Very small chunks +} + +#[test] +fn test_musical_clef_surrogate_pair() { + let input = r#"["\uD834\uDD1E"]"#; + let expected = [ + Event::StartArray, + Event::String(JsonString::Unescaped("𝄞")), + Event::EndArray, + Event::EndDocument, + ]; + + let mut scratch = [0u8; 1024]; + let mut buffer = [0u8; 1024]; + + test_fixture(create_slice_parser(input, &mut scratch), &expected); + test_fixture(create_stream_parser_full(input, &mut buffer), &expected); + test_fixture( + create_stream_parser_chunked(input, 6, &mut buffer), + &expected, + ); +} + +#[test] +fn test_multiple_surrogate_pairs() { + let input = r#"["\uD801\uDC37\uD834\uDD1E"]"#; + let expected = [ + Event::StartArray, + Event::String(JsonString::Unescaped("𐐷𝄞")), + Event::EndArray, + Event::EndDocument, + ]; + + let mut scratch = [0u8; 1024]; + let mut buffer = [0u8; 1024]; + + test_fixture(create_slice_parser(input, &mut scratch), &expected); + test_fixture(create_stream_parser_full(input, &mut buffer), &expected); + test_fixture( + create_stream_parser_chunked(input, 10, &mut buffer), + &expected, + ); +} + +#[test] +fn test_surrogate_pairs_in_object_keys() { + let input = r#"{"\uD801\uDC37": "value"}"#; + let expected = [ + Event::StartObject, + Event::Key(JsonString::Unescaped("𐐷")), + Event::String(JsonString::Borrowed("value")), + Event::EndObject, + Event::EndDocument, + ]; + + let mut scratch = [0u8; 1024]; + let mut buffer = [0u8; 1024]; + + test_fixture(create_slice_parser(input, &mut scratch), &expected); + test_fixture(create_stream_parser_full(input, &mut buffer), &expected); + test_fixture( + create_stream_parser_chunked(input, 7, &mut buffer), + &expected, + ); +} + +#[test] +fn test_mixed_content_with_surrogate_pairs() { + let input = r#"{"text": "Hello \uD801\uDC37 World"}"#; + let expected = [ + Event::StartObject, + Event::Key(JsonString::Borrowed("text")), + Event::String(JsonString::Unescaped("Hello 𐐷 World")), + Event::EndObject, + Event::EndDocument, + ]; + + let mut scratch = [0u8; 1024]; + let mut buffer = [0u8; 1024]; + + test_fixture(create_slice_parser(input, &mut scratch), &expected); + test_fixture(create_stream_parser_full(input, &mut buffer), &expected); + test_fixture( + create_stream_parser_chunked(input, 5, &mut buffer), + &expected, + ); +} + +#[test] +fn test_edge_of_surrogate_ranges() { + // D7FF is just below high surrogate range, E000 is just above low surrogate range + let input = r#"["\uD7FF\uE000"]"#; + let expected = [ + Event::StartArray, + Event::String(JsonString::Unescaped("\u{D7FF}\u{E000}")), // Two separate characters: \uD7FF + \uE000 + Event::EndArray, + Event::EndDocument, + ]; + + let mut scratch = [0u8; 1024]; + let mut buffer = [0u8; 1024]; + + test_fixture(create_slice_parser(input, &mut scratch), &expected); + test_fixture(create_stream_parser_full(input, &mut buffer), &expected); + test_fixture( + create_stream_parser_chunked(input, 4, &mut buffer), + &expected, + ); +} + +// Error cases + +#[test] +fn test_lone_low_surrogate_error() { + let input = r#"["\uDC37"]"#; + + let mut scratch = [0u8; 1024]; + let mut buffer = [0u8; 1024]; + + test_error_fixture( + create_slice_parser(input, &mut scratch), + "Lone low surrogate", + ); + test_error_fixture( + create_stream_parser_full(input, &mut buffer), + "Lone low surrogate", + ); + test_error_fixture( + create_stream_parser_chunked(input, 5, &mut buffer), + "Lone low surrogate", + ); +} + +#[test] +fn test_high_surrogate_followed_by_non_low_surrogate_error() { + let input = r#"["\uD801\u0041"]"#; + + let mut scratch = [0u8; 1024]; + let mut buffer = [0u8; 1024]; + + test_error_fixture( + create_slice_parser(input, &mut scratch), + "High surrogate + non-low surrogate", + ); + test_error_fixture( + create_stream_parser_full(input, &mut buffer), + "High surrogate + non-low surrogate", + ); + test_error_fixture( + create_stream_parser_chunked(input, 6, &mut buffer), + "High surrogate + non-low surrogate", + ); +} + +#[test] +fn test_double_high_surrogate_error() { + let input = r#"["\uD801\uD802"]"#; + + let mut scratch = [0u8; 1024]; + let mut buffer = [0u8; 1024]; + + test_error_fixture( + create_slice_parser(input, &mut scratch), + "High surrogate + high surrogate", + ); + test_error_fixture( + create_stream_parser_full(input, &mut buffer), + "High surrogate + high surrogate", + ); + test_error_fixture( + create_stream_parser_chunked(input, 8, &mut buffer), + "High surrogate + high surrogate", + ); +} + +#[test] +fn test_interrupted_surrogate_pair_error() { + // This is the critical test case that was previously failing + // \n should clear the pending high surrogate, making \uDC37 a lone low surrogate + let input = r#"["\uD801\n\uDC37"]"#; + + let mut scratch = [0u8; 1024]; + let mut buffer = [0u8; 1024]; + + test_error_fixture( + create_slice_parser(input, &mut scratch), + "Interrupted surrogate pair", + ); + test_error_fixture( + create_stream_parser_full(input, &mut buffer), + "Interrupted surrogate pair", + ); + test_error_fixture( + create_stream_parser_chunked(input, 4, &mut buffer), + "Interrupted surrogate pair", + ); +} + +#[test] +fn test_various_escape_interruptions() { + // Test different types of simple escapes that should clear surrogate state + let test_cases = [ + (r#"["\uD801\t\uDC37"]"#, "Tab escape interruption"), + (r#"["\uD801\r\uDC37"]"#, "Carriage return interruption"), + (r#"["\uD801\\\uDC37"]"#, "Backslash escape interruption"), + (r#"["\uD801\"\uDC37"]"#, "Quote escape interruption"), + ]; + + let mut scratch = [0u8; 1024]; + let mut buffer = [0u8; 1024]; + + for (input, description) in &test_cases { + test_error_fixture(create_slice_parser(input, &mut scratch), description); + test_error_fixture(create_stream_parser_full(input, &mut buffer), description); + test_error_fixture( + create_stream_parser_chunked(input, 3, &mut buffer), + description, + ); + } +} + +// Buffer boundary specific tests for StreamParser + +#[test] +fn test_surrogate_pair_across_chunk_boundaries() { + let input = r#"["\uD801\uDC37"]"#; + let expected = [ + Event::StartArray, + Event::String(JsonString::Unescaped("𐐷")), + Event::EndArray, + Event::EndDocument, + ]; + + let mut buffer = [0u8; 1024]; + + // Test with chunk boundaries that split the surrogate pair + test_fixture( + create_stream_parser_chunked(input, 1, &mut buffer), + &expected, + ); // Every byte + test_fixture( + create_stream_parser_chunked(input, 7, &mut buffer), + &expected, + ); // Split between surrogates + test_fixture( + create_stream_parser_chunked(input, 11, &mut buffer), + &expected, + ); // Split within low surrogate +} + +#[test] +fn test_very_small_buffers() { + let input = r#"["\uD801\uDC37"]"#; + let expected = [ + Event::StartArray, + Event::String(JsonString::Unescaped("𐐷")), + Event::EndArray, + Event::EndDocument, + ]; + + let mut buffer = [0u8; 1024]; + + // Test with extremely small chunks to stress buffer management + test_fixture( + create_stream_parser_chunked(input, 1, &mut buffer), + &expected, + ); + test_fixture( + create_stream_parser_chunked(input, 2, &mut buffer), + &expected, + ); +} + +#[test] +fn test_pathological_cases() { + // High surrogate at end of string (should error) + let input1 = r#"["\uD801"]"#; + + let mut scratch = [0u8; 1024]; + let mut buffer = [0u8; 1024]; + + test_error_fixture( + create_slice_parser(input1, &mut scratch), + "High surrogate at end of string", + ); + test_error_fixture( + create_stream_parser_full(input1, &mut buffer), + "High surrogate at end of string", + ); + test_error_fixture( + create_stream_parser_chunked(input1, 3, &mut buffer), + "High surrogate at end of string", + ); +} + +#[test] +fn test_complex_nested_structures() { + let input = r#"{"users": [{"name": "\uD801\uDC37", "emoji": "\uD834\uDD1E"}]}"#; + let expected = [ + Event::StartObject, + Event::Key(JsonString::Borrowed("users")), + Event::StartArray, + Event::StartObject, + Event::Key(JsonString::Borrowed("name")), + Event::String(JsonString::Unescaped("𐐷")), + Event::Key(JsonString::Borrowed("emoji")), + Event::String(JsonString::Unescaped("𝄞")), + Event::EndObject, + Event::EndArray, + Event::EndObject, + Event::EndDocument, + ]; + + let mut scratch = [0u8; 1024]; + let mut buffer = [0u8; 1024]; + + test_fixture(create_slice_parser(input, &mut scratch), &expected); + test_fixture(create_stream_parser_full(input, &mut buffer), &expected); + test_fixture( + create_stream_parser_chunked(input, 12, &mut buffer), + &expected, + ); +}