Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions picojson/src/escape_processor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -444,3 +444,48 @@ mod tests {
assert!(collector.process_to_utf8(&mut utf8_buffer).is_err());
}
}

/// Shared implementation for processing a Unicode escape sequence.
///
/// This function centralizes the logic for handling `\uXXXX` escapes, which is
/// common to both the pull-based and stream-based parsers. It uses a generic
/// `hex_slice_provider` to remain independent of the underlying buffer implementation
/// (`SliceInputBuffer` vs. `DirectBuffer`).
///
/// The hex-digit slice handed back by `hex_slice_provider` carries its own lifetime
/// (`'h`), separate from the lifetime of `utf8_buf` (`'a`). The digits are only read
/// while this function runs, so the borrow of the caller's input buffer does not have
/// to last as long as the returned UTF-8 slice.
///
/// # Arguments
/// * `current_pos` - The parser's current position in the input buffer, right after the 4 hex digits.
/// * `unicode_escape_collector` - A mutable reference to the shared `UnicodeEscapeCollector`.
/// * `hex_slice_provider` - A closure that takes a start and end position and returns the hex digit
///   slice. It is invoked exactly once.
/// * `utf8_buf` - A buffer to write the UTF-8 encoded result into.
///
/// # Returns
/// A tuple containing the resulting UTF-8 byte slice and the start position of the escape sequence (`\uXXXX`).
///
/// # Errors
/// * Any error returned by `hex_slice_provider` is propagated unchanged.
/// * `ParserErrorHandler::invalid_unicode_length()` if the provided slice is not exactly 4 bytes long.
/// * Any error returned by the collector's `add_hex_digit` or `process_to_utf8` is propagated unchanged.
pub(crate) fn process_unicode_escape_sequence<'a, 'h, F>(
    current_pos: usize,
    unicode_escape_collector: &mut UnicodeEscapeCollector,
    hex_slice_provider: F,
    utf8_buf: &'a mut [u8; 4],
) -> Result<(&'a [u8], usize), ParseError>
where
    F: FnOnce(usize, usize) -> Result<&'h [u8], ParseError>,
{
    let (hex_start, hex_end, escape_start_pos) =
        crate::shared::ContentRange::unicode_escape_bounds(current_pos);

    // Extract the 4 hex digits from the buffer using the provider
    let hex_slice = hex_slice_provider(hex_start, hex_end)?;

    // Reject a short or over-long slice before touching the collector's state
    if hex_slice.len() != 4 {
        return Err(ParserErrorHandler::invalid_unicode_length());
    }

    // Feed hex digits to the shared collector
    for &hex_digit in hex_slice {
        unicode_escape_collector.add_hex_digit(hex_digit)?;
    }

    // Process the complete sequence to UTF-8; the result borrows only `utf8_buf`
    let utf8_bytes = unicode_escape_collector.process_to_utf8(utf8_buf)?;

    Ok((utf8_bytes, escape_start_pos))
}
32 changes: 9 additions & 23 deletions picojson/src/pull_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -153,34 +153,20 @@ impl<'a, 'b, C: BitStackConfig> PullParser<'a, 'b, C> {
/// Process Unicode escape sequence using shared UnicodeEscapeCollector
/// Extracts hex digits from buffer and processes them through the collector
fn process_unicode_escape_with_collector(&mut self) -> Result<(), ParseError> {
// Current position is right after the 4 hex digits
let current_pos = self.buffer.current_pos();
let (hex_start, hex_end, escape_start_pos) =
ContentRange::unicode_escape_bounds(current_pos);
let hex_slice_provider = |start, end| self.buffer.slice(start, end).map_err(Into::into);

// Extract the 4 hex digits from buffer
let hex_slice = self.buffer.slice(hex_start, hex_end)?;

if hex_slice.len() != 4 {
return Err(ParserErrorHandler::invalid_unicode_length());
}

// Feed hex digits to the shared collector
for &hex_digit in hex_slice {
self.unicode_escape_collector.add_hex_digit(hex_digit)?;
}

// Process the complete sequence to UTF-8
let mut utf8_buf = [0u8; 4];
let utf8_bytes = self
.unicode_escape_collector
.process_to_utf8(&mut utf8_buf)?;
let (utf8_bytes, escape_start_pos) =
crate::escape_processor::process_unicode_escape_sequence(
current_pos,
&mut self.unicode_escape_collector,
hex_slice_provider,
&mut utf8_buf,
)?;

// Handle the Unicode escape via CopyOnEscape
self.copy_on_escape
.handle_unicode_escape(escape_start_pos, utf8_bytes)?;

Ok(())
.handle_unicode_escape(escape_start_pos, utf8_bytes)
}

fn pull_tokenizer_events(&mut self) -> Result<(), ParseError> {
Expand Down
45 changes: 21 additions & 24 deletions picojson/src/stream_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -535,34 +535,31 @@ impl<'b, R: Reader, C: BitStackConfig> StreamParser<'b, R, C> {
*in_escape_sequence = false;
}

// Current position is right after the 4 hex digits
let current_pos = self.direct_buffer.current_position();
let (hex_start, hex_end, _escape_start_pos) =
ContentRange::unicode_escape_bounds(current_pos);

// Extract the 4 hex digits from buffer
let hex_slice = self.direct_buffer.get_string_slice(hex_start, hex_end)?;

if hex_slice.len() != 4 {
return Err(ParserErrorHandler::invalid_unicode_length());
}

// Feed hex digits to the shared collector
for &hex_digit in hex_slice {
self.unicode_escape_collector.add_hex_digit(hex_digit)?;
}
let utf8_bytes_copy = {
let current_pos = self.direct_buffer.current_position();
let hex_slice_provider = |start, end| {
self.direct_buffer
.get_string_slice(start, end)
.map_err(Into::into)
};

// Process the complete sequence to UTF-8
let mut utf8_buf = [0u8; 4];
let utf8_bytes = self
.unicode_escape_collector
.process_to_utf8(&mut utf8_buf)?;
let mut utf8_buf = [0u8; 4];
let (utf8_bytes, _escape_start_pos) =
crate::escape_processor::process_unicode_escape_sequence(
current_pos,
&mut self.unicode_escape_collector,
hex_slice_provider,
&mut utf8_buf,
)?;
let mut copy = [0u8; 4];
let len = utf8_bytes.len();
copy[..len].copy_from_slice(utf8_bytes);
(copy, len)
};

// Handle the Unicode escape via DirectBuffer escape processing
for &byte in utf8_bytes {
for &byte in &utf8_bytes_copy.0[..utf8_bytes_copy.1] {
self.append_byte_to_escape_buffer(byte)?;
}

Ok(())
}

Expand Down
Loading