Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions picojson/src/copy_on_escape.rs
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,18 @@ impl<'a, 'b> CopyOnEscape<'a, 'b> {
Ok(String::Borrowed(borrowed_str))
}
}

/// Reports whether unescaped content is available.
///
/// Support method for a `DataSource` implementation. Returns the current value of
/// the `using_scratch` flag and does not modify any state.
pub fn has_unescaped_content(&self) -> bool {
self.using_scratch
}

/// Borrows the `scratch_start..scratch_pos` region of the scratch buffer.
///
/// The returned slice carries the `'b` lifetime so that it can be handed out
/// by a `DataSource` implementation.
///
/// # Errors
/// Returns `ParseError::Unexpected(UnexpectedState::InvalidSliceBounds)` when
/// that range is not a valid sub-range of the scratch buffer.
pub fn get_scratch_contents(&'b self) -> Result<&'b [u8], ParseError> {
    let written = self.scratch_start..self.scratch_pos;
    match self.scratch.get(written) {
        Some(bytes) => Ok(bytes),
        None => Err(ParseError::Unexpected(UnexpectedState::InvalidSliceBounds)),
    }
}
}

#[cfg(test)]
Expand Down
67 changes: 47 additions & 20 deletions picojson/src/escape_processor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,14 @@
use crate::parse_error::ParseError;
use crate::shared::{ContentRange, UnexpectedState};

/// Result type for Unicode escape sequence processing.
///
/// Tuple fields, in order:
/// 1. `Option<([u8; 4], usize)>` - optional UTF-8 byte array and its length
/// 2. `usize` - the start position of the escape sequence (`\uXXXX`)
/// 3. `Option<u32>` - the new pending high surrogate value, if any
type UnicodeEscapeResult = (Option<([u8; 4], usize)>, usize, Option<u32>);

/// Shared utilities for processing JSON escape sequences.
/// This module contains pure functions for escape processing that can be used
/// by both CopyOnEscape and StreamingBuffer components.
Expand Down Expand Up @@ -266,6 +274,16 @@ impl UnicodeEscapeCollector {
pub fn has_pending_high_surrogate(&self) -> bool {
self.pending_high_surrogate.is_some()
}

/// Get the pending high surrogate value.
///
/// Returns a copy of the stored `Option<u32>`; `None` means no high surrogate
/// is currently pending.
pub fn get_pending_high_surrogate(&self) -> Option<u32> {
self.pending_high_surrogate
}

/// Set the pending high surrogate value.
///
/// Overwrites whatever value was stored before; passing `None` clears the
/// pending state.
pub fn set_pending_high_surrogate(&mut self, surrogate: Option<u32>) {
self.pending_high_surrogate = surrogate;
}
}

impl Default for UnicodeEscapeCollector {
Expand Down Expand Up @@ -645,51 +663,56 @@ mod tests {
/// Shared implementation for processing a Unicode escape sequence WITH surrogate pair support.
///
/// This function centralizes the logic for handling `\uXXXX` escapes, which is
/// common to both the pull-based and stream-based parsers. It uses a generic
/// `hex_slice_provider` to remain independent of the underlying buffer implementation
/// (`SliceInputBuffer` vs. `StreamBuffer`).
/// common to all parsers. It uses the generic `DataSource` trait to remain
/// independent of the underlying buffer implementation (`SliceInputBuffer` vs. `StreamBuffer`).
///
/// # Arguments
/// * `current_pos` - The parser's current position in the input buffer, right after the 4 hex digits.
/// * `unicode_escape_collector` - A mutable reference to the shared `UnicodeEscapeCollector`.
/// * `hex_slice_provider` - A closure that takes a start and end position and returns the hex digit slice.
/// * `utf8_buf` - A buffer to write the UTF-8 encoded result into.
/// * `current_pos` - The parser's current position, right after the 4 hex digits.
/// * `pending_high_surrogate` - The optional high surrogate from a previous escape.
/// * `source` - A `DataSource` implementation to provide the hex digit slice.
///
/// # Returns
/// A tuple containing:
/// - Optional UTF-8 byte slice (None if this is a high surrogate waiting for low surrogate)
/// - The start position of the escape sequence (`\uXXXX`)
pub(crate) fn process_unicode_escape_sequence<'a, F>(
/// - Optional UTF-8 byte array and its length.
/// - The start position of the escape sequence (`\uXXXX`).
/// - The new pending high surrogate value, if any.
pub(crate) fn process_unicode_escape_sequence<'input, 'scratch, D>(
current_pos: usize,
unicode_escape_collector: &mut UnicodeEscapeCollector,
mut hex_slice_provider: F,
) -> Result<(Option<([u8; 4], usize)>, usize), ParseError>
pending_high_surrogate: Option<u32>,
source: &'input D,
) -> Result<UnicodeEscapeResult, ParseError>
where
F: FnMut(usize, usize) -> Result<&'a [u8], ParseError>,
D: ?Sized + crate::shared::DataSource<'input, 'scratch>,
{
let (hex_start, hex_end, escape_start_pos) = ContentRange::unicode_escape_bounds(current_pos);

// Extract the 4 hex digits from the buffer using the provider
let hex_slice = hex_slice_provider(hex_start, hex_end)?;
// Extract the 4 hex digits from the buffer using the DataSource
let hex_slice = source.get_borrowed_slice(hex_start, hex_end)?;

if hex_slice.len() != 4 {
return Err(UnexpectedState::InvalidUnicodeEscape.into());
}

// Create a temporary collector to process the hex digits
let mut temp_collector = UnicodeEscapeCollector::new();
temp_collector.set_pending_high_surrogate(pending_high_surrogate);

// Feed hex digits to the shared collector
for &hex_digit in hex_slice {
unicode_escape_collector.add_hex_digit(hex_digit)?;
temp_collector.add_hex_digit(hex_digit)?;
}

// Check if we had a pending high surrogate before processing
let had_pending_high_surrogate = unicode_escape_collector.has_pending_high_surrogate();
let had_pending_high_surrogate = temp_collector.has_pending_high_surrogate();

// Create a local buffer for the UTF-8 result
let mut utf8_buf = [0u8; 4];

// Process the complete sequence to UTF-8 with surrogate support
let (utf8_bytes_opt, _surrogate_state_changed) =
unicode_escape_collector.process_to_utf8(&mut utf8_buf)?;
temp_collector.process_to_utf8(&mut utf8_buf)?;

let new_pending_high_surrogate = temp_collector.get_pending_high_surrogate();

// If we have a result, copy it to a new array to return by value
let result_by_value = utf8_bytes_opt.map(|bytes| {
Expand All @@ -708,5 +731,9 @@ where
escape_start_pos
};

Ok((result_by_value, final_escape_start_pos))
Ok((
result_by_value,
final_escape_start_pos,
new_pending_high_surrogate,
))
}
40 changes: 31 additions & 9 deletions picojson/src/event_processor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,25 @@ impl<T: ujson::BitBucket, C: ujson::DepthCounter> ParserCore<T, C> {
/// This supports StreamParser-specific byte accumulation when no events are generated.
/// SliceParser passes a no-op closure for byte_accumulator.
pub fn next_event_impl<'a, P, F>(
&mut self,
provider: &'a mut P,
escape_timing: EscapeTiming,
byte_accumulator: F,
) -> Result<Event<'a, 'a>, ParseError>
where
P: ContentExtractor,
F: FnMut(&mut P, u8) -> Result<(), ParseError>,
{
// Thin wrapper: forwards every argument unchanged to `next_event_impl_with_flags`
// and passes `false` for its `always_accumulate_during_escapes` flag.
self.next_event_impl_with_flags(provider, escape_timing, byte_accumulator, false)
}

/// Extended version with flags for specialized behavior
pub fn next_event_impl_with_flags<'a, P, F>(
&mut self,
provider: &'a mut P,
escape_timing: EscapeTiming,
mut byte_accumulator: F,
always_accumulate_during_escapes: bool,
) -> Result<Event<'a, 'a>, ParseError>
where
P: ContentExtractor,
Expand All @@ -58,20 +73,27 @@ impl<T: ujson::BitBucket, C: ujson::DepthCounter> ParserCore<T, C> {
.map_err(ParseError::TokenizerError)?;
}

// Call byte accumulator if no events were generated AND we are not in an escape sequence
if !have_events(&self.parser_state.evts) && !self.in_escape_sequence {
let should_accumulate = if always_accumulate_during_escapes {
if self.in_escape_sequence {
true // Always accumulate during escape sequences
} else {
!have_events(&self.parser_state.evts) // Normal behavior outside escapes
}
} else {
!have_events(&self.parser_state.evts) && !self.in_escape_sequence
};

if should_accumulate {
byte_accumulator(provider, byte)?;
}
} else {
// Handle end of stream
{
clear_events(&mut self.parser_state.evts);
let mut callback = create_tokenizer_callback(&mut self.parser_state.evts);
self.tokenizer
.finish(&mut callback)
.map_err(ParseError::TokenizerError)?;
}
let mut finish_callback =
create_tokenizer_callback(&mut self.parser_state.evts);
let _bytes_processed = self.tokenizer.finish(&mut finish_callback)?;
} // Drop the callback to release the borrow

// If finish() generated events, process them. Otherwise, return EndDocument.
if !have_events(&self.parser_state.evts) {
return Ok(Event::EndDocument);
}
Expand Down
12 changes: 10 additions & 2 deletions picojson/src/parse_error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@ use crate::ujson;
pub enum ParseError {
/// An error bubbled up from the underlying tokenizer.
TokenizerError(ujson::Error),
/// The provided scratch buffer was not large enough for an operation.
/// The scratch buffer is full.
ScratchBufferFull,
/// A string slice was not valid UTF-8.
/// A UTF-8 error occurred.
InvalidUtf8(core::str::Utf8Error),
/// The input buffer is full.
InputBufferFull,
/// A number string could not be parsed.
InvalidNumber,
/// The parser entered an unexpected internal state.
Expand Down Expand Up @@ -73,6 +75,12 @@ impl From<UnexpectedState> for ParseError {
}
}

impl From<ujson::Error> for ParseError {
fn from(err: ujson::Error) -> Self {
ParseError::TokenizerError(err)
}
}

impl core::fmt::Display for ParseError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match self {
Expand Down
100 changes: 99 additions & 1 deletion picojson/src/shared.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,11 @@ impl ContentRange {
current_pos: usize,
) -> (usize, usize) {
let content_end = current_pos.saturating_sub(1); // Back up to exclude closing quote
(content_start, content_end)
if content_start > content_end {
(content_start, content_start)
} else {
(content_start, content_end)
}
}

/// Calculate Unicode escape sequence boundaries
Expand Down Expand Up @@ -171,6 +175,100 @@ impl ContentRange {
}
}

/// A trait that abstracts the source of JSON data for content extraction.
///
/// This trait provides a unified interface for accessing both borrowed content from
/// the original input data and unescaped content from temporary scratch buffers.
/// It enables consistent content extraction patterns across different parser types.
///
/// Callers are expected to check `has_unescaped_content()` first and then read
/// from exactly one of the two slice accessors.
///
/// # Generic Parameters
///
/// * `'input` - Lifetime for the input data being parsed
/// * `'scratch` - Lifetime for the scratch buffer used for temporary storage
pub trait DataSource<'input, 'scratch> {
/// Returns a slice of the raw, unprocessed input data from a specific range.
/// Used for zero-copy extraction of content that contains no escape sequences.
///
/// The receiver itself is borrowed for `'input`, which is what allows the
/// returned slice to carry that lifetime.
///
/// # Arguments
/// * `start` - Start position in the input data
/// * `end` - End position in the input data (exclusive)
///
/// # Returns
/// A slice of the input data with lifetime `'input`
///
/// # Errors
/// Returns a `ParseError` when the range cannot be provided; the exact
/// conditions are up to the implementor.
fn get_borrowed_slice(
&'input self,
start: usize,
end: usize,
) -> Result<&'input [u8], ParseError>;

/// Returns the full slice of the processed, unescaped content from the scratch buffer.
/// Used when escape sequences have been processed and content written to temporary buffer.
///
/// The receiver itself is borrowed for `'scratch`.
///
/// # Returns
/// A slice of unescaped content with lifetime `'scratch`
///
/// # Errors
/// Returns a `ParseError` when the scratch content cannot be provided; the
/// exact conditions are up to the implementor.
fn get_unescaped_slice(&'scratch self) -> Result<&'scratch [u8], ParseError>;

/// Check if unescaped content is available in the scratch buffer.
///
/// # Returns
/// `true` if unescaped content exists and should be accessed via `get_unescaped_slice()`,
/// `false` if content should be accessed via `get_borrowed_slice()`
fn has_unescaped_content(&self) -> bool;
}

/// Raw content piece from either input buffer or scratch buffer.
/// This enum cleanly separates the two different content sources without
/// coupling the DataSource trait to high-level JSON types.
///
/// Both variants hold raw bytes; UTF-8 validation happens only when the piece
/// is converted with `into_string()`.
#[derive(Debug, PartialEq)]
pub enum ContentPiece<'input, 'scratch> {
/// Content borrowed directly from the input buffer (zero-copy)
Input(&'input [u8]),
/// Content processed and stored in the scratch buffer (unescaped)
Scratch(&'scratch [u8]),
}

impl<'input, 'scratch> ContentPiece<'input, 'scratch>
where
    'input: 'scratch,
{
    /// Validates the bytes as UTF-8 and wraps them in the matching `String` variant.
    ///
    /// `Input` bytes become `String::Borrowed`; `Scratch` bytes become
    /// `String::Unescaped`.
    ///
    /// # Errors
    /// Propagates the `ParseError` produced by `from_utf8` when the bytes are
    /// not valid UTF-8.
    pub fn into_string(self) -> Result<String<'input, 'scratch>, ParseError> {
        match self {
            Self::Input(raw) => from_utf8(raw).map(String::Borrowed),
            Self::Scratch(unescaped) => from_utf8(unescaped).map(String::Unescaped),
        }
    }
}

pub fn from_utf8(v: &[u8]) -> Result<&str, ParseError> {
core::str::from_utf8(v).map_err(Into::into)
}

/// Fetches the current content from a `DataSource`, choosing the right buffer.
///
/// When the source reports unescaped content, the whole scratch slice is returned
/// as `ContentPiece::Scratch`. Otherwise the input range computed by
/// `ContentRange::string_content_bounds_from_content_start(start_pos, current_pos)`
/// is borrowed and returned as `ContentPiece::Input`. This keeps the extraction
/// logic in one place for all parsers.
///
/// # Errors
/// Propagates any `ParseError` returned by the source's slice accessors.
pub fn get_content_piece<'input, 'scratch, D>(
    source: &'input D,
    start_pos: usize,
    current_pos: usize,
) -> Result<ContentPiece<'input, 'scratch>, ParseError>
where
    'input: 'scratch,
    D: ?Sized + DataSource<'input, 'scratch>,
{
    // No escapes were processed: borrow straight from the input (zero-copy path).
    if !source.has_unescaped_content() {
        let (from, to) =
            ContentRange::string_content_bounds_from_content_start(start_pos, current_pos);
        let raw = source.get_borrowed_slice(from, to)?;
        return Ok(ContentPiece::Input(raw));
    }

    let unescaped = source.get_unescaped_slice()?;
    Ok(ContentPiece::Scratch(unescaped))
}
Loading
Loading